Netdev List
 help / color / mirror / Atom feed
* [PATCH iproute2-next] mptcp: add support for changing the backup flag
From: Davide Caratti @ 2021-12-09  9:10 UTC (permalink / raw)
  To: netdev, David Ahern, Stephen Hemminger, Andrea Claudi; +Cc: Matthieu Baerts

Linux supports 'MPTCP_PM_CMD_SET_FLAGS' since v5.12, and this control has
recently been extended to allow setting flags for a given endpoint id.
Although there is no use for changing 'signal' or 'subflow' flags, it can
be helpful to set/clear the backup bit on existing endpoints: add the 'ip
mptcp endpoint change <...>' command for this purpose.

Link: https://github.com/multipath-tcp/mptcp_net-next/issues/158
Acked-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 ip/ipmptcp.c        | 20 ++++++++++++++++----
 man/man8/ip-mptcp.8 | 14 ++++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/ip/ipmptcp.c b/ip/ipmptcp.c
index 857004446aa3..7fb48a420a6b 100644
--- a/ip/ipmptcp.c
+++ b/ip/ipmptcp.c
@@ -25,6 +25,7 @@ static void usage(void)
 		"Usage:	ip mptcp endpoint add ADDRESS [ dev NAME ] [ id ID ]\n"
 		"				      [ port NR ] [ FLAG-LIST ]\n"
 		"	ip mptcp endpoint delete id ID\n"
+		"	ip mptcp endpoint change id ID [ backup | nobackup ]\n"
 		"	ip mptcp endpoint show [ id ID ]\n"
 		"	ip mptcp endpoint flush\n"
 		"	ip mptcp limits set [ subflows NR ] [ add_addr_accepted NR ]\n"
@@ -45,6 +46,8 @@ static int genl_family = -1;
 	GENL_REQUEST(_req, MPTCP_BUFLEN, genl_family, 0,	\
 		     MPTCP_PM_VER, _cmd, _flags)
 
+#define MPTCP_PM_ADDR_FLAG_NOBACKUP 0x0
+
 /* Mapping from argument to address flag mask */
 static const struct {
 	const char *name;
@@ -53,6 +56,7 @@ static const struct {
 	{ "signal",		MPTCP_PM_ADDR_FLAG_SIGNAL },
 	{ "subflow",		MPTCP_PM_ADDR_FLAG_SUBFLOW },
 	{ "backup",		MPTCP_PM_ADDR_FLAG_BACKUP },
+	{ "nobackup",		MPTCP_PM_ADDR_FLAG_NOBACKUP }
 };
 
 static void print_mptcp_addr_flags(unsigned int flags)
@@ -95,9 +99,9 @@ static int get_flags(const char *arg, __u32 *flags)
 	return -1;
 }
 
-static int mptcp_parse_opt(int argc, char **argv, struct nlmsghdr *n,
-			 bool adding)
+static int mptcp_parse_opt(int argc, char **argv, struct nlmsghdr *n, int cmd)
 {
+	bool adding = cmd == MPTCP_PM_CMD_ADD_ADDR;
 	struct rtattr *attr_addr;
 	bool addr_set = false;
 	inet_prefix address;
@@ -110,6 +114,11 @@ static int mptcp_parse_opt(int argc, char **argv, struct nlmsghdr *n,
 	ll_init_map(&rth);
 	while (argc > 0) {
 		if (get_flags(*argv, &flags) == 0) {
+			/* allow changing the 'backup' flag only */
+			if (cmd == MPTCP_PM_CMD_SET_FLAGS &&
+			    (flags & ~MPTCP_PM_ADDR_FLAG_BACKUP))
+				invarg("invalid flags\n", *argv);
+
 		} else if (matches(*argv, "id") == 0) {
 			NEXT_ARG();
 
@@ -182,7 +191,7 @@ static int mptcp_addr_modify(int argc, char **argv, int cmd)
 	MPTCP_REQUEST(req, cmd, NLM_F_REQUEST);
 	int ret;
 
-	ret = mptcp_parse_opt(argc, argv, &req.n, cmd == MPTCP_PM_CMD_ADD_ADDR);
+	ret = mptcp_parse_opt(argc, argv, &req.n, cmd);
 	if (ret)
 		return ret;
 
@@ -298,7 +307,7 @@ static int mptcp_addr_show(int argc, char **argv)
 	if (argc <= 0)
 		return mptcp_addr_dump();
 
-	ret = mptcp_parse_opt(argc, argv, &req.n, false);
+	ret = mptcp_parse_opt(argc, argv, &req.n, MPTCP_PM_CMD_GET_ADDR);
 	if (ret)
 		return ret;
 
@@ -534,6 +543,9 @@ int do_mptcp(int argc, char **argv)
 		if (matches(*argv, "add") == 0)
 			return mptcp_addr_modify(argc-1, argv+1,
 						 MPTCP_PM_CMD_ADD_ADDR);
+		if (matches(*argv, "change") == 0)
+			return mptcp_addr_modify(argc-1, argv+1,
+						 MPTCP_PM_CMD_SET_FLAGS);
 		if (matches(*argv, "delete") == 0)
 			return mptcp_addr_modify(argc-1, argv+1,
 						 MPTCP_PM_CMD_DEL_ADDR);
diff --git a/man/man8/ip-mptcp.8 b/man/man8/ip-mptcp.8
index 22335b6129ae..574cb60dc7c0 100644
--- a/man/man8/ip-mptcp.8
+++ b/man/man8/ip-mptcp.8
@@ -34,6 +34,13 @@ ip-mptcp \- MPTCP path manager configuration
 .BR "ip mptcp endpoint del id "
 .I ID
 
+.ti -8
+.BR "ip mptcp endpoint change id "
+.I ID
+.RB "[ "
+.I BACKUP-OPT
+.RB "] "
+
 .ti -8
 .BR "ip mptcp endpoint show "
 .RB "[ " id
@@ -55,6 +62,13 @@ ip-mptcp \- MPTCP path manager configuration
 .B backup
 .RB  "]"
 
+.ti -8
+.IR BACKUP-OPT " := ["
+.B backup
+.RB "|"
+.B nobackup
+.RB  "]"
+
 .ti -8
 .BR "ip mptcp limits set "
 .RB "[ "
-- 
2.31.1


^ permalink raw reply related

* [PATCH bpf-next 2/2] bpf: Introduce TCP_ULP option for bpf_{set,get}sockopt
From: Tony Lu @ 2021-12-09  9:02 UTC (permalink / raw)
  To: ast, daniel, andrii; +Cc: bpf, netdev
In-Reply-To: <20211209090250.73927-1-tonylu@linux.alibaba.com>

This introduces a new option TCP_ULP for bpf_{set,get}sockopt helper. It
helps prog to change TCP_ULP sockopt on demand.

People who want to set ULP based on strategies when socket create or
other's hook point, they can attach to BPF_CGROUP_INET_SOCK_CREATE or
other types, and judge based on user-defined rules to trigger
bpf_set_sockopt(.., IPPROTO_TCP, TCP_ULP) and set socket ULP. For
example, the bpf prog can control which socket should use tls ULP
modules without intrusively modifying the applications.

With this, it makes flexible to control ULP strategies with BPF prog.

Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Reviewed-by: Qiao Ma <mqaio@linux.alibaba.com>
---
 include/uapi/linux/bpf.h       |  3 ++-
 net/core/filter.c              | 16 ++++++++++++++++
 tools/include/uapi/linux/bpf.h |  3 ++-
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c26871263f1f..7372283f92be 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2505,7 +2505,8 @@ union bpf_attr {
  * 		  **TCP_CONGESTION**, **TCP_BPF_IW**,
  * 		  **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
  * 		  **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
- *		  **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**.
+ *		  **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**,
+ *		  **TCP_ULP**.
  * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
  * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
  * 	Return
diff --git a/net/core/filter.c b/net/core/filter.c
index 1e6b68ff13db..88d7f047f9c0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4870,6 +4870,14 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
 			name[TCP_CA_NAME_MAX-1] = 0;
 			return tcp_set_congestion_control(sk, name, false, true);
 		}
+		case TCP_ULP: {
+			char name[TCP_ULP_NAME_MAX];
+
+			strncpy(name, optval, min_t(long, optlen,
+						    TCP_ULP_NAME_MAX - 1));
+			name[TCP_ULP_NAME_MAX - 1] = 0;
+			return tcp_set_ulp(sk, name);
+		}
 		default:
 			break;
 		}
@@ -5000,6 +5008,14 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
 			strncpy(optval, icsk->icsk_ca_ops->name, optlen);
 			optval[optlen - 1] = 0;
 			break;
+		case TCP_ULP:
+			icsk = inet_csk(sk);
+
+			if (!icsk->icsk_ulp_ops || optlen <= 1)
+				goto err_clear;
+			strncpy(optval, icsk->icsk_ulp_ops->name, optlen);
+			optval[optlen - 1] = 0;
+			break;
 		case TCP_SAVED_SYN:
 			tp = tcp_sk(sk);
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c26871263f1f..7372283f92be 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2505,7 +2505,8 @@ union bpf_attr {
  * 		  **TCP_CONGESTION**, **TCP_BPF_IW**,
  * 		  **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
  * 		  **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
- *		  **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**.
+ *		  **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**,
+ *		  **TCP_ULP**.
  * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
  * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
  * 	Return
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* [PATCH bpf-next 1/2] bpf: Use switch statement in _bpf_setsockopt
From: Tony Lu @ 2021-12-09  9:02 UTC (permalink / raw)
  To: ast, daniel, andrii; +Cc: bpf, netdev
In-Reply-To: <20211209090250.73927-1-tonylu@linux.alibaba.com>

This replaces if with switch statement in _bpf_setsockopt() to make it
easy to extend more options.

Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Reviewed-by: Qiao Ma <mqaio@linux.alibaba.com>
---
 net/core/filter.c | 164 ++++++++++++++++++++++++----------------------
 1 file changed, 84 insertions(+), 80 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index fe27c91e3758..1e6b68ff13db 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4857,95 +4857,99 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
 #endif
 	} else if (level == SOL_TCP &&
 		   sk->sk_prot->setsockopt == tcp_setsockopt) {
-		if (optname == TCP_CONGESTION) {
+		struct inet_connection_sock *icsk = inet_csk(sk);
+		struct tcp_sock *tp = tcp_sk(sk);
+		unsigned long timeout;
+
+		switch (optname) {
+		case TCP_CONGESTION: {
 			char name[TCP_CA_NAME_MAX];
 
 			strncpy(name, optval, min_t(long, optlen,
 						    TCP_CA_NAME_MAX-1));
 			name[TCP_CA_NAME_MAX-1] = 0;
-			ret = tcp_set_congestion_control(sk, name, false, true);
-		} else {
-			struct inet_connection_sock *icsk = inet_csk(sk);
-			struct tcp_sock *tp = tcp_sk(sk);
-			unsigned long timeout;
+			return tcp_set_congestion_control(sk, name, false, true);
+		}
+		default:
+			break;
+		}
 
-			if (optlen != sizeof(int))
-				return -EINVAL;
+		if (optlen != sizeof(int))
+			return -EINVAL;
 
-			val = *((int *)optval);
-			/* Only some options are supported */
-			switch (optname) {
-			case TCP_BPF_IW:
-				if (val <= 0 || tp->data_segs_out > tp->syn_data)
-					ret = -EINVAL;
-				else
-					tp->snd_cwnd = val;
-				break;
-			case TCP_BPF_SNDCWND_CLAMP:
-				if (val <= 0) {
-					ret = -EINVAL;
-				} else {
-					tp->snd_cwnd_clamp = val;
-					tp->snd_ssthresh = val;
-				}
-				break;
-			case TCP_BPF_DELACK_MAX:
-				timeout = usecs_to_jiffies(val);
-				if (timeout > TCP_DELACK_MAX ||
-				    timeout < TCP_TIMEOUT_MIN)
-					return -EINVAL;
-				inet_csk(sk)->icsk_delack_max = timeout;
-				break;
-			case TCP_BPF_RTO_MIN:
-				timeout = usecs_to_jiffies(val);
-				if (timeout > TCP_RTO_MIN ||
-				    timeout < TCP_TIMEOUT_MIN)
-					return -EINVAL;
-				inet_csk(sk)->icsk_rto_min = timeout;
-				break;
-			case TCP_SAVE_SYN:
-				if (val < 0 || val > 1)
-					ret = -EINVAL;
-				else
-					tp->save_syn = val;
-				break;
-			case TCP_KEEPIDLE:
-				ret = tcp_sock_set_keepidle_locked(sk, val);
-				break;
-			case TCP_KEEPINTVL:
-				if (val < 1 || val > MAX_TCP_KEEPINTVL)
-					ret = -EINVAL;
-				else
-					tp->keepalive_intvl = val * HZ;
-				break;
-			case TCP_KEEPCNT:
-				if (val < 1 || val > MAX_TCP_KEEPCNT)
-					ret = -EINVAL;
-				else
-					tp->keepalive_probes = val;
-				break;
-			case TCP_SYNCNT:
-				if (val < 1 || val > MAX_TCP_SYNCNT)
-					ret = -EINVAL;
-				else
-					icsk->icsk_syn_retries = val;
-				break;
-			case TCP_USER_TIMEOUT:
-				if (val < 0)
-					ret = -EINVAL;
-				else
-					icsk->icsk_user_timeout = val;
-				break;
-			case TCP_NOTSENT_LOWAT:
-				tp->notsent_lowat = val;
-				sk->sk_write_space(sk);
-				break;
-			case TCP_WINDOW_CLAMP:
-				ret = tcp_set_window_clamp(sk, val);
-				break;
-			default:
+		val = *((int *)optval);
+		/* Only some options are supported */
+		switch (optname) {
+		case TCP_BPF_IW:
+			if (val <= 0 || tp->data_segs_out > tp->syn_data)
+				ret = -EINVAL;
+			else
+				tp->snd_cwnd = val;
+			break;
+		case TCP_BPF_SNDCWND_CLAMP:
+			if (val <= 0) {
 				ret = -EINVAL;
+			} else {
+				tp->snd_cwnd_clamp = val;
+				tp->snd_ssthresh = val;
 			}
+			break;
+		case TCP_BPF_DELACK_MAX:
+			timeout = usecs_to_jiffies(val);
+			if (timeout > TCP_DELACK_MAX ||
+			    timeout < TCP_TIMEOUT_MIN)
+				return -EINVAL;
+			inet_csk(sk)->icsk_delack_max = timeout;
+			break;
+		case TCP_BPF_RTO_MIN:
+			timeout = usecs_to_jiffies(val);
+			if (timeout > TCP_RTO_MIN ||
+			    timeout < TCP_TIMEOUT_MIN)
+				return -EINVAL;
+			inet_csk(sk)->icsk_rto_min = timeout;
+			break;
+		case TCP_SAVE_SYN:
+			if (val < 0 || val > 1)
+				ret = -EINVAL;
+			else
+				tp->save_syn = val;
+			break;
+		case TCP_KEEPIDLE:
+			ret = tcp_sock_set_keepidle_locked(sk, val);
+			break;
+		case TCP_KEEPINTVL:
+			if (val < 1 || val > MAX_TCP_KEEPINTVL)
+				ret = -EINVAL;
+			else
+				tp->keepalive_intvl = val * HZ;
+			break;
+		case TCP_KEEPCNT:
+			if (val < 1 || val > MAX_TCP_KEEPCNT)
+				ret = -EINVAL;
+			else
+				tp->keepalive_probes = val;
+			break;
+		case TCP_SYNCNT:
+			if (val < 1 || val > MAX_TCP_SYNCNT)
+				ret = -EINVAL;
+			else
+				icsk->icsk_syn_retries = val;
+			break;
+		case TCP_USER_TIMEOUT:
+			if (val < 0)
+				ret = -EINVAL;
+			else
+				icsk->icsk_user_timeout = val;
+			break;
+		case TCP_NOTSENT_LOWAT:
+			tp->notsent_lowat = val;
+			sk->sk_write_space(sk);
+			break;
+		case TCP_WINDOW_CLAMP:
+			ret = tcp_set_window_clamp(sk, val);
+			break;
+		default:
+			ret = -EINVAL;
 		}
 #endif
 	} else {
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* [PATCH bpf-next 0/2] Introduce TCP_ULP option for bpf_{set,get}sockopt
From: Tony Lu @ 2021-12-09  9:02 UTC (permalink / raw)
  To: ast, daniel, andrii; +Cc: bpf, netdev

This patch set introduces a new option TCP_ULP for bpf_{set,get}sockopt
helper. The bpf prog can set and get TCP_ULP sock option on demand.

With this, the bpf prog can set TCP_ULP based on strategies when socket
create or other's socket hook point. For example, the bpf prog can
control which socket should use tls or smc (WIP) ULP modules without
modifying the applications.

Patch 1 replaces if statement with switch to make it easy to extend.

Patch 2 introduces TCP_ULP sock option.

Tony Lu (2):
  bpf: Use switch statement in _bpf_setsockopt
  bpf: Introduce TCP_ULP option for bpf_{set,get}sockopt

 include/uapi/linux/bpf.h       |   3 +-
 net/core/filter.c              | 180 ++++++++++++++++++---------------
 tools/include/uapi/linux/bpf.h |   3 +-
 3 files changed, 104 insertions(+), 82 deletions(-)

-- 
2.32.0.3.g01195cf9f


^ permalink raw reply

* [PATCH net] net/sched: fq_pie: prevent dismantle issue
From: Eric Dumazet @ 2021-12-09  8:49 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski
  Cc: netdev, Eric Dumazet, Eric Dumazet, syzbot, Mohit P . Tahiliani,
	Sachin D . Patil, V . Saicharan, Mohit Bhasi, Leslie Monis,
	Gautam Ramakrishnan

From: Eric Dumazet <edumazet@google.com>

For some reason, fq_pie_destroy() did not copy
working code from pie_destroy() and other qdiscs,
thus causing elusive bug.

Before calling del_timer_sync(&q->adapt_timer),
we need to ensure timer will not rearm itself.

rcu: INFO: rcu_preempt self-detected stall on CPU
rcu:    0-....: (4416 ticks this GP) idle=60d/1/0x4000000000000000 softirq=10433/10434 fqs=2579
        (t=10501 jiffies g=13085 q=3989)
NMI backtrace for cpu 0
CPU: 0 PID: 13 Comm: ksoftirqd/0 Not tainted 5.16.0-rc4-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
 nmi_cpu_backtrace.cold+0x47/0x144 lib/nmi_backtrace.c:111
 nmi_trigger_cpumask_backtrace+0x1b3/0x230 lib/nmi_backtrace.c:62
 trigger_single_cpu_backtrace include/linux/nmi.h:164 [inline]
 rcu_dump_cpu_stacks+0x25e/0x3f0 kernel/rcu/tree_stall.h:343
 print_cpu_stall kernel/rcu/tree_stall.h:627 [inline]
 check_cpu_stall kernel/rcu/tree_stall.h:711 [inline]
 rcu_pending kernel/rcu/tree.c:3878 [inline]
 rcu_sched_clock_irq.cold+0x9d/0x746 kernel/rcu/tree.c:2597
 update_process_times+0x16d/0x200 kernel/time/timer.c:1785
 tick_sched_handle+0x9b/0x180 kernel/time/tick-sched.c:226
 tick_sched_timer+0x1b0/0x2d0 kernel/time/tick-sched.c:1428
 __run_hrtimer kernel/time/hrtimer.c:1685 [inline]
 __hrtimer_run_queues+0x1c0/0xe50 kernel/time/hrtimer.c:1749
 hrtimer_interrupt+0x31c/0x790 kernel/time/hrtimer.c:1811
 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1086 [inline]
 __sysvec_apic_timer_interrupt+0x146/0x530 arch/x86/kernel/apic/apic.c:1103
 sysvec_apic_timer_interrupt+0x8e/0xc0 arch/x86/kernel/apic/apic.c:1097
 </IRQ>
 <TASK>
 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:638
RIP: 0010:write_comp_data kernel/kcov.c:221 [inline]
RIP: 0010:__sanitizer_cov_trace_const_cmp1+0x1d/0x80 kernel/kcov.c:273
Code: 54 c8 20 48 89 10 c3 66 0f 1f 44 00 00 53 41 89 fb 41 89 f1 bf 03 00 00 00 65 48 8b 0c 25 40 70 02 00 48 89 ce 4c 8b 54 24 08 <e8> 4e f7 ff ff 84 c0 74 51 48 8b 81 88 15 00 00 44 8b 81 84 15 00
RSP: 0018:ffffc90000d27b28 EFLAGS: 00000246
RAX: 0000000000000000 RBX: ffff888064bf1bf0 RCX: ffff888011928000
RDX: ffff888011928000 RSI: ffff888011928000 RDI: 0000000000000003
RBP: ffff888064bf1c28 R08: 0000000000000000 R09: 0000000000000000
R10: ffffffff875d8295 R11: 0000000000000000 R12: 0000000000000000
R13: ffff8880783dd300 R14: 0000000000000000 R15: 0000000000000000
 pie_calculate_probability+0x405/0x7c0 net/sched/sch_pie.c:418
 fq_pie_timer+0x170/0x2a0 net/sched/sch_fq_pie.c:383
 call_timer_fn+0x1a5/0x6b0 kernel/time/timer.c:1421
 expire_timers kernel/time/timer.c:1466 [inline]
 __run_timers.part.0+0x675/0xa20 kernel/time/timer.c:1734
 __run_timers kernel/time/timer.c:1715 [inline]
 run_timer_softirq+0xb3/0x1d0 kernel/time/timer.c:1747
 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558
 run_ksoftirqd kernel/softirq.c:921 [inline]
 run_ksoftirqd+0x2d/0x60 kernel/softirq.c:913
 smpboot_thread_fn+0x645/0x9c0 kernel/smpboot.c:164
 kthread+0x405/0x4f0 kernel/kthread.c:327
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
 </TASK>

Fixes: ec97ecf1ebe4 ("net: sched: add Flow Queue PIE packet scheduler")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
Cc: Sachin D. Patil <sdp.sachin@gmail.com>
Cc: V. Saicharan <vsaicharan1998@gmail.com>
Cc: Mohit Bhasi <mohitbhasi1998@gmail.com>
Cc: Leslie Monis <lesliemonis@gmail.com>
Cc: Gautam Ramakrishnan <gautamramk@gmail.com>
---
 net/sched/sch_fq_pie.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index 830f3559f727ad471b595b2f5b04788827da89b0..d6aba6edd16e5eab120a57c316fcb06a5d5f3442 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -531,6 +531,7 @@ static void fq_pie_destroy(struct Qdisc *sch)
 	struct fq_pie_sched_data *q = qdisc_priv(sch);
 
 	tcf_block_put(q->block);
+	q->p_params.tupdate = 0;
 	del_timer_sync(&q->adapt_timer);
 	kvfree(q->flows);
 }
-- 
2.34.1.400.ga245620fadb-goog


^ permalink raw reply related

* [PATCH bpf-next] libbpf: Skip the pinning of global data map for old kernels.
From: Shuyi Cheng @ 2021-12-09  8:44 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools)


Fix error: "failed to pin map: Bad file descriptor, path:
/sys/fs/bpf/_rodata_str1_1."

In the old kernel, the global data map will not be created, see [0]. So
we should skip the pinning of the global data map to avoid 
bpf_object__pin_maps returning error.

[0]: https://lore.kernel.org/bpf/20211123200105.387855-1-andrii@kernel.org

Signed-off-by: Shuyi Cheng <chengshuyi@linux.alibaba.com>
---
  tools/lib/bpf/libbpf.c | 4 ++++
  1 file changed, 4 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 6db0b5e8540e..d96cf49cebab 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -7884,6 +7884,10 @@ int bpf_object__pin_maps(struct bpf_object *obj, 
const char *path)
  		char *pin_path = NULL;
  		char buf[PATH_MAX];

+		if (bpf_map__is_internal(map) &&
+		    !kernel_supports(obj, FEAT_GLOBAL_DATA))
+			continue;
+
  		if (path) {
  			int len;

-- 
2.19.1.6.gb485710b

^ permalink raw reply related

* [PATCH net] Phonet: refcount leak in pep_sock_accep
From: Hangyu Hua @ 2021-12-09  8:28 UTC (permalink / raw)
  To: courmisch, davem, kuba; +Cc: netdev, linux-kernel, Hangyu Hua

sock_hold(sk) is invoked in pep_sock_accept(), but __sock_put(sk) is not
invoked in subsequent failure branches(pep_accept_conn() != 0).

Signed-off-by: Hangyu Hua <hbh25y@gmail.com>
---
 net/phonet/pep.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index a1525916885a..b4f90afb0638 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -868,6 +868,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp,
 
 	err = pep_accept_conn(newsk, skb);
 	if (err) {
+		__sock_put(sk);
 		sock_put(newsk);
 		newsk = NULL;
 		goto drop;
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v4 net-next 2/9] i40e: respect metadata on XSK Rx to skb
From: Jesper Dangaard Brouer @ 2021-12-09  8:27 UTC (permalink / raw)
  To: Alexander Lobakin, intel-wired-lan
  Cc: brouer, Jesse Brandeburg, Tony Nguyen, David S. Miller,
	Jakub Kicinski, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, Björn Töpel,
	Maciej Fijalkowski, Michal Swiatkowski, Jithu Joseph,
	Martin KaFai Lau, Song Liu, KP Singh, Yonghong Song,
	Andrii Nakryiko, netdev, linux-kernel, bpf
In-Reply-To: <20211208140702.642741-3-alexandr.lobakin@intel.com>



On 08/12/2021 15.06, Alexander Lobakin wrote:
> For now, if the XDP prog returns XDP_PASS on XSK, the metadata will
> be lost as it doesn't get copied to the skb.

I have an urge to add a newline here, when reading this, as IMHO it is a 
paragraph with the problem statement.

> Copy it along with the frame headers. Account its size on skb
> allocation, and when copying just treat it as a part of the frame
> and do a pull after to "move" it to the "reserved" zone.

Also newline here, as next paragraph are some extra details, you felt a 
need to explain to the reader.

> net_prefetch() xdp->data_meta and align the copy size to speed-up
> memcpy() a little and better match i40e_costruct_skb().
                                      ^^^^^^xx^^^^^^^^^

You have a general misspelling of this function name in all of your 
commit messages.

--Jesper


^ permalink raw reply

* Re: [PATCH] nfp: Fix memory leak in nfp_cpp_area_cache_add()
From: Simon Horman @ 2021-12-09  8:23 UTC (permalink / raw)
  To: Jianglei Nie; +Cc: kuba, davem, libaokun1, oss-drivers, netdev, linux-kernel
In-Reply-To: <20211209061511.122535-1-niejianglei2021@163.com>

Hi Jianglei,

On Thu, Dec 09, 2021 at 02:15:11PM +0800, Jianglei Nie wrote:
> In line 800 (#1), nfp_cpp_area_alloc() allocates and initializes a
> CPP area structure. But in line 807 (#2), when the cache is allocated
> failed, this CPP area structure is not freed, which will result in
> memory leak.
> 
> We can fix it by freeing the CPP area when the cache is allocated
> failed (#2).
> 
> 792 int nfp_cpp_area_cache_add(struct nfp_cpp *cpp, size_t size)
> 793 {
> 794 	struct nfp_cpp_area_cache *cache;
> 795 	struct nfp_cpp_area *area;
> 
> 800	area = nfp_cpp_area_alloc(cpp, NFP_CPP_ID(7, NFP_CPP_ACTION_RW, 0),
> 801 				  0, size);
> 	// #1: allocates and initializes
> 
> 802 	if (!area)
> 803 		return -ENOMEM;
> 
> 805 	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
> 806 	if (!cache)
> 807 		return -ENOMEM; // #2: missing free
> 
> 817	return 0;
> 818 }
> 
> Signed-off-by: Jianglei Nie <niejianglei2021@163.com>

Thanks for noticing this. I agree that this seems to be incorrect
and that your patch addresses the problem.

I do wonder if there is a value in adding:

Fixes: 4cb584e0ee7d ("nfp: add CPP access core")

Also, as I don't think this is hurting anything in practice, perhaps
this is for net-next (as oppoed to net), which is not specified
in the patch subject.

Regardless,

Acked-by: Simon Horman <simon.horman@corigine.com>

> ---
>  drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
> index d7ac0307797f..34c0d2ddf9ef 100644
> --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
> +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
> @@ -803,8 +803,10 @@ int nfp_cpp_area_cache_add(struct nfp_cpp *cpp, size_t size)
>  		return -ENOMEM;
>  
>  	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
> -	if (!cache)
> +	if (!cache) {
> +		nfp_cpp_area_free(area);
>  		return -ENOMEM;
> +	}
>  
>  	cache->id = 0;
>  	cache->addr = 0;
> -- 
> 2.25.1
> 

^ permalink raw reply

* Re: [PATCH v4 net-next 1/9] i40e: don't reserve excessive XDP_PACKET_HEADROOM on XSK Rx to skb
From: Jesper Dangaard Brouer @ 2021-12-09  8:19 UTC (permalink / raw)
  To: Alexander Lobakin, intel-wired-lan
  Cc: brouer, Jesse Brandeburg, Tony Nguyen, David S. Miller,
	Jakub Kicinski, Alexei Starovoitov, Daniel Borkmann,
	Jesper Dangaard Brouer, John Fastabend, Björn Töpel,
	Maciej Fijalkowski, Michal Swiatkowski, Jithu Joseph,
	Martin KaFai Lau, Song Liu, KP Singh, Yonghong Song,
	Andrii Nakryiko, netdev, linux-kernel, bpf
In-Reply-To: <20211208140702.642741-2-alexandr.lobakin@intel.com>



On 08/12/2021 15.06, Alexander Lobakin wrote:
> {__,}napi_alloc_skb() allocates and reserves additional NET_SKB_PAD
> + NET_IP_ALIGN for any skb.
> OTOH, i40e_construct_skb_zc() currently allocates and reserves
> additional `xdp->data - xdp->data_hard_start`, which is
> XDP_PACKET_HEADROOM for XSK frames.
> There's no need for that at all as the frame is post-XDP and will
> go only to the networking stack core.

I disagree with this assumption, that headroom is not needed by netstack.
Why "no need for that at all" for netstack?

Having headroom is important for netstack in general.  When packet will 
grow we avoid realloc of SKB.  Use-case could also be cpumap or veth 
redirect, or XDP-generic, that expect this headroom.


> Pass the size of the actual data only to __napi_alloc_skb() and
> don't reserve anything. This will give enough headroom for stack
> processing.
> 
> Fixes: 0a714186d3c0 ("i40e: add AF_XDP zero-copy Rx support")
> Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
> Reviewed-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
> ---
>   drivers/net/ethernet/intel/i40e/i40e_xsk.c | 4 +---
>   1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> index f08d19b8c554..9564906b7da8 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> @@ -245,13 +245,11 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
>   	struct sk_buff *skb;
>   
>   	/* allocate a skb to store the frags */
> -	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
> -			       xdp->data_end - xdp->data_hard_start,
> +	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize,
>   			       GFP_ATOMIC | __GFP_NOWARN);
>   	if (unlikely(!skb))
>   		goto out;
>   
> -	skb_reserve(skb, xdp->data - xdp->data_hard_start);
>   	memcpy(__skb_put(skb, datasize), xdp->data, datasize);
>   	if (metasize)
>   		skb_metadata_set(skb, metasize);
> 


^ permalink raw reply

* [PATCH net 0/2] pull-request: can 2021-12-09
From: Marc Kleine-Budde @ 2021-12-09  8:13 UTC (permalink / raw)
  To: netdev; +Cc: davem, kuba, linux-can, kernel

Hello Jakub, hello David,

this is a pull request of 2 patches for net/master.

Both patches are by Jimmy Assarsson. The first one fixes the
incrementing of the rx/tx error counters in the Kvaser PCIe FD driver.
The second one fixes the Kvaser USB driver by using the CAN clock
frequency provided by the device instead of using a hard coded value.

regards,
Marc

---
The following changes since commit a50e659b2a1be14784e80f8492aab177e67c53a2:

  net: mvpp2: fix XDP rx queues registering (2021-12-08 18:29:37 -0800)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can.git tags/linux-can-fixes-for-5.16-20211209

for you to fetch changes up to fb12797ab1fef480ad8a32a30984844444eeb00d:

  can: kvaser_usb: get CAN clock frequency from device (2021-12-09 09:01:43 +0100)

----------------------------------------------------------------
linux-can-fixes-for-5.16-20211209

----------------------------------------------------------------
Jimmy Assarsson (2):
      can: kvaser_pciefd: kvaser_pciefd_rx_error_frame(): increase correct stats->{rx,tx}_errors counter
      can: kvaser_usb: get CAN clock frequency from device

 drivers/net/can/kvaser_pciefd.c                  |   8 +-
 drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c | 101 ++++++++++++++++-------
 2 files changed, 80 insertions(+), 29 deletions(-)



^ permalink raw reply

* [PATCH net 1/2] can: kvaser_pciefd: kvaser_pciefd_rx_error_frame(): increase correct stats->{rx,tx}_errors counter
From: Marc Kleine-Budde @ 2021-12-09  8:13 UTC (permalink / raw)
  To: netdev
  Cc: davem, kuba, linux-can, kernel, Jimmy Assarsson, stable,
	Marc Kleine-Budde
In-Reply-To: <20211209081312.301036-1-mkl@pengutronix.de>

From: Jimmy Assarsson <extja@kvaser.com>

Check the direction bit in the error frame packet (EPACK) to determine
which net_device_stats {rx,tx}_errors counter to increase.

Fixes: 26ad340e582d ("can: kvaser_pciefd: Add driver for Kvaser PCIEcan devices")
Link: https://lore.kernel.org/all/20211208152122.250852-1-extja@kvaser.com
Cc: stable@vger.kernel.org
Signed-off-by: Jimmy Assarsson <extja@kvaser.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/kvaser_pciefd.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c
index 74d9899fc904..eb74cdf26b88 100644
--- a/drivers/net/can/kvaser_pciefd.c
+++ b/drivers/net/can/kvaser_pciefd.c
@@ -248,6 +248,9 @@ MODULE_DESCRIPTION("CAN driver for Kvaser CAN/PCIe devices");
 #define KVASER_PCIEFD_SPACK_EWLR BIT(23)
 #define KVASER_PCIEFD_SPACK_EPLR BIT(24)
 
+/* Kvaser KCAN_EPACK second word */
+#define KVASER_PCIEFD_EPACK_DIR_TX BIT(0)
+
 struct kvaser_pciefd;
 
 struct kvaser_pciefd_can {
@@ -1285,7 +1288,10 @@ static int kvaser_pciefd_rx_error_frame(struct kvaser_pciefd_can *can,
 
 	can->err_rep_cnt++;
 	can->can.can_stats.bus_error++;
-	stats->rx_errors++;
+	if (p->header[1] & KVASER_PCIEFD_EPACK_DIR_TX)
+		stats->tx_errors++;
+	else
+		stats->rx_errors++;
 
 	can->bec.txerr = bec.txerr;
 	can->bec.rxerr = bec.rxerr;

base-commit: a50e659b2a1be14784e80f8492aab177e67c53a2
-- 
2.33.0



^ permalink raw reply related

* [PATCH net 2/2] can: kvaser_usb: get CAN clock frequency from device
From: Marc Kleine-Budde @ 2021-12-09  8:13 UTC (permalink / raw)
  To: netdev
  Cc: davem, kuba, linux-can, kernel, Jimmy Assarsson, stable,
	Marc Kleine-Budde
In-Reply-To: <20211209081312.301036-1-mkl@pengutronix.de>

From: Jimmy Assarsson <extja@kvaser.com>

The CAN clock frequency is used when calculating the CAN bittiming
parameters. When wrong clock frequency is used, the device may end up
with wrong bittiming parameters, depending on user requested bittiming
parameters.

To avoid this, get the CAN clock frequency from the device. Various
existing Kvaser Leaf products use different CAN clocks.

Fixes: 080f40a6fa28 ("can: kvaser_usb: Add support for Kvaser CAN/USB devices")
Link: https://lore.kernel.org/all/20211208152122.250852-2-extja@kvaser.com
Cc: stable@vger.kernel.org
Signed-off-by: Jimmy Assarsson <extja@kvaser.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 .../net/can/usb/kvaser_usb/kvaser_usb_leaf.c  | 101 +++++++++++++-----
 1 file changed, 73 insertions(+), 28 deletions(-)

diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
index 59ba7c7beec0..f7af1bf5ab46 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
@@ -28,10 +28,6 @@
 
 #include "kvaser_usb.h"
 
-/* Forward declaration */
-static const struct kvaser_usb_dev_cfg kvaser_usb_leaf_dev_cfg;
-
-#define CAN_USB_CLOCK			8000000
 #define MAX_USBCAN_NET_DEVICES		2
 
 /* Command header size */
@@ -80,6 +76,12 @@ static const struct kvaser_usb_dev_cfg kvaser_usb_leaf_dev_cfg;
 
 #define CMD_LEAF_LOG_MESSAGE		106
 
+/* Leaf frequency options */
+#define KVASER_USB_LEAF_SWOPTION_FREQ_MASK 0x60
+#define KVASER_USB_LEAF_SWOPTION_FREQ_16_MHZ_CLK 0
+#define KVASER_USB_LEAF_SWOPTION_FREQ_32_MHZ_CLK BIT(5)
+#define KVASER_USB_LEAF_SWOPTION_FREQ_24_MHZ_CLK BIT(6)
+
 /* error factors */
 #define M16C_EF_ACKE			BIT(0)
 #define M16C_EF_CRCE			BIT(1)
@@ -340,6 +342,50 @@ struct kvaser_usb_err_summary {
 	};
 };
 
+static const struct can_bittiming_const kvaser_usb_leaf_bittiming_const = {
+	.name = "kvaser_usb",
+	.tseg1_min = KVASER_USB_TSEG1_MIN,
+	.tseg1_max = KVASER_USB_TSEG1_MAX,
+	.tseg2_min = KVASER_USB_TSEG2_MIN,
+	.tseg2_max = KVASER_USB_TSEG2_MAX,
+	.sjw_max = KVASER_USB_SJW_MAX,
+	.brp_min = KVASER_USB_BRP_MIN,
+	.brp_max = KVASER_USB_BRP_MAX,
+	.brp_inc = KVASER_USB_BRP_INC,
+};
+
+static const struct kvaser_usb_dev_cfg kvaser_usb_leaf_dev_cfg_8mhz = {
+	.clock = {
+		.freq = 8000000,
+	},
+	.timestamp_freq = 1,
+	.bittiming_const = &kvaser_usb_leaf_bittiming_const,
+};
+
+static const struct kvaser_usb_dev_cfg kvaser_usb_leaf_dev_cfg_16mhz = {
+	.clock = {
+		.freq = 16000000,
+	},
+	.timestamp_freq = 1,
+	.bittiming_const = &kvaser_usb_leaf_bittiming_const,
+};
+
+static const struct kvaser_usb_dev_cfg kvaser_usb_leaf_dev_cfg_24mhz = {
+	.clock = {
+		.freq = 24000000,
+	},
+	.timestamp_freq = 1,
+	.bittiming_const = &kvaser_usb_leaf_bittiming_const,
+};
+
+static const struct kvaser_usb_dev_cfg kvaser_usb_leaf_dev_cfg_32mhz = {
+	.clock = {
+		.freq = 32000000,
+	},
+	.timestamp_freq = 1,
+	.bittiming_const = &kvaser_usb_leaf_bittiming_const,
+};
+
 static void *
 kvaser_usb_leaf_frame_to_cmd(const struct kvaser_usb_net_priv *priv,
 			     const struct sk_buff *skb, int *frame_len,
@@ -471,6 +517,27 @@ static int kvaser_usb_leaf_send_simple_cmd(const struct kvaser_usb *dev,
 	return rc;
 }
 
+static void kvaser_usb_leaf_get_software_info_leaf(struct kvaser_usb *dev,
+						   const struct leaf_cmd_softinfo *softinfo)
+{
+	u32 sw_options = le32_to_cpu(softinfo->sw_options);
+
+	dev->fw_version = le32_to_cpu(softinfo->fw_version);
+	dev->max_tx_urbs = le16_to_cpu(softinfo->max_outstanding_tx);
+
+	switch (sw_options & KVASER_USB_LEAF_SWOPTION_FREQ_MASK) {
+	case KVASER_USB_LEAF_SWOPTION_FREQ_16_MHZ_CLK:
+		dev->cfg = &kvaser_usb_leaf_dev_cfg_16mhz;
+		break;
+	case KVASER_USB_LEAF_SWOPTION_FREQ_24_MHZ_CLK:
+		dev->cfg = &kvaser_usb_leaf_dev_cfg_24mhz;
+		break;
+	case KVASER_USB_LEAF_SWOPTION_FREQ_32_MHZ_CLK:
+		dev->cfg = &kvaser_usb_leaf_dev_cfg_32mhz;
+		break;
+	}
+}
+
 static int kvaser_usb_leaf_get_software_info_inner(struct kvaser_usb *dev)
 {
 	struct kvaser_cmd cmd;
@@ -486,14 +553,13 @@ static int kvaser_usb_leaf_get_software_info_inner(struct kvaser_usb *dev)
 
 	switch (dev->card_data.leaf.family) {
 	case KVASER_LEAF:
-		dev->fw_version = le32_to_cpu(cmd.u.leaf.softinfo.fw_version);
-		dev->max_tx_urbs =
-			le16_to_cpu(cmd.u.leaf.softinfo.max_outstanding_tx);
+		kvaser_usb_leaf_get_software_info_leaf(dev, &cmd.u.leaf.softinfo);
 		break;
 	case KVASER_USBCAN:
 		dev->fw_version = le32_to_cpu(cmd.u.usbcan.softinfo.fw_version);
 		dev->max_tx_urbs =
 			le16_to_cpu(cmd.u.usbcan.softinfo.max_outstanding_tx);
+		dev->cfg = &kvaser_usb_leaf_dev_cfg_8mhz;
 		break;
 	}
 
@@ -1225,24 +1291,11 @@ static int kvaser_usb_leaf_init_card(struct kvaser_usb *dev)
 {
 	struct kvaser_usb_dev_card_data *card_data = &dev->card_data;
 
-	dev->cfg = &kvaser_usb_leaf_dev_cfg;
 	card_data->ctrlmode_supported |= CAN_CTRLMODE_3_SAMPLES;
 
 	return 0;
 }
 
-static const struct can_bittiming_const kvaser_usb_leaf_bittiming_const = {
-	.name = "kvaser_usb",
-	.tseg1_min = KVASER_USB_TSEG1_MIN,
-	.tseg1_max = KVASER_USB_TSEG1_MAX,
-	.tseg2_min = KVASER_USB_TSEG2_MIN,
-	.tseg2_max = KVASER_USB_TSEG2_MAX,
-	.sjw_max = KVASER_USB_SJW_MAX,
-	.brp_min = KVASER_USB_BRP_MIN,
-	.brp_max = KVASER_USB_BRP_MAX,
-	.brp_inc = KVASER_USB_BRP_INC,
-};
-
 static int kvaser_usb_leaf_set_bittiming(struct net_device *netdev)
 {
 	struct kvaser_usb_net_priv *priv = netdev_priv(netdev);
@@ -1348,11 +1401,3 @@ const struct kvaser_usb_dev_ops kvaser_usb_leaf_dev_ops = {
 	.dev_read_bulk_callback = kvaser_usb_leaf_read_bulk_callback,
 	.dev_frame_to_cmd = kvaser_usb_leaf_frame_to_cmd,
 };
-
-static const struct kvaser_usb_dev_cfg kvaser_usb_leaf_dev_cfg = {
-	.clock = {
-		.freq = CAN_USB_CLOCK,
-	},
-	.timestamp_freq = 1,
-	.bittiming_const = &kvaser_usb_leaf_bittiming_const,
-};
-- 
2.33.0



^ permalink raw reply related

* [PATCH] nfc: fix potential NULL pointer deref in nfc_genl_dump_ses_done
From: Krzysztof Kozlowski @ 2021-12-09  8:13 UTC (permalink / raw)
  To: Krzysztof Kozlowski, David S. Miller, Jakub Kicinski,
	Samuel Ortiz, linux-nfc, netdev, linux-kernel
  Cc: tadeusz.struk, stable

The done() netlink callback nfc_genl_dump_ses_done() should check if
received argument is non-NULL, because its allocation could fail earlier
in dumpit() (nfc_genl_dump_ses()).

Fixes: ac22ac466a65 ("NFC: Add a GET_SE netlink API")
Cc: <stable@vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
---
 net/nfc/netlink.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index 334f63c9529e..5c706ed75b33 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -1392,8 +1392,10 @@ static int nfc_genl_dump_ses_done(struct netlink_callback *cb)
 {
 	struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
 
-	nfc_device_iter_exit(iter);
-	kfree(iter);
+	if (iter) {
+		nfc_device_iter_exit(iter);
+		kfree(iter);
+	}
 
 	return 0;
 }
-- 
2.32.0


^ permalink raw reply related

* Re: [PATCH] nfc: fix segfault in nfc_genl_dump_devices_done
From: Krzysztof Kozlowski @ 2021-12-09  8:12 UTC (permalink / raw)
  To: Tadeusz Struk, netdev
  Cc: David S. Miller, Jakub Kicinski, stable,
	syzbot+f9f76f4a0766420b4a02
In-Reply-To: <20211208182742.340542-1-tadeusz.struk@linaro.org>

On 08/12/2021 19:27, Tadeusz Struk wrote:
> When kmalloc in nfc_genl_dump_devices() fails then
> nfc_genl_dump_devices_done() segfaults as below
> 
> KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
> CPU: 0 PID: 25 Comm: kworker/0:1 Not tainted 5.16.0-rc4-01180-g2a987e65025e-dirty #5
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-6.fc35 04/01/2014
> Workqueue: events netlink_sock_destruct_work
> RIP: 0010:klist_iter_exit+0x26/0x80
> Call Trace:
> <TASK>
> class_dev_iter_exit+0x15/0x20
> nfc_genl_dump_devices_done+0x3b/0x50
> genl_lock_done+0x84/0xd0
> netlink_sock_destruct+0x8f/0x270
> __sk_destruct+0x64/0x3b0
> sk_destruct+0xa8/0xd0
> __sk_free+0x2e8/0x3d0
> sk_free+0x51/0x90
> netlink_sock_destruct_work+0x1c/0x20
> process_one_work+0x411/0x710
> worker_thread+0x6fd/0xa80
> 
> Link: https://syzkaller.appspot.com/bug?id=fc0fa5a53db9edd261d56e74325419faf18bd0df
> Cc: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Jakub Kicinski <kuba@kernel.org>
> Cc: netdev@vger.kernel.org
> Cc: stable@vger.kernel.org
> Reported-by: syzbot+f9f76f4a0766420b4a02@syzkaller.appspotmail.com
> Signed-off-by: Tadeusz Struk <tadeusz.struk@linaro.org>
> ---
>  net/nfc/netlink.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 

Cc: <stable@vger.kernel.org>
Fixes: 4d12b8b129f1 ("NFC: add nfc generic netlink interface")

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>

I will fix other similar cases.

Best regards,
Krzysztof

^ permalink raw reply

* Re: [PATCH V6 5/5] net: netvsc: Add Isolation VM support for netvsc driver
From: Tianyu Lan @ 2021-12-09  8:08 UTC (permalink / raw)
  To: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	wei.liu@kernel.org, Dexuan Cui, tglx@linutronix.de,
	mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com,
	x86@kernel.org, hpa@zytor.com, davem@davemloft.net,
	kuba@kernel.org, jejb@linux.ibm.com, martin.petersen@oracle.com,
	arnd@arndb.de, hch@infradead.org, m.szyprowski@samsung.com,
	robin.murphy@arm.com, Tianyu Lan, thomas.lendacky@amd.com,
	Michael Kelley (LINUX)
  Cc: iommu@lists.linux-foundation.org, linux-arch@vger.kernel.org,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-scsi@vger.kernel.org, netdev@vger.kernel.org, vkuznets,
	brijesh.singh@amd.com, konrad.wilk@oracle.com, hch@lst.de,
	joro@8bytes.org, parri.andrea@gmail.com, dave.hansen@intel.com
In-Reply-To: <BN8PR21MB128401EEDE6B8C8553CC8009CA6F9@BN8PR21MB1284.namprd21.prod.outlook.com>



On 12/9/2021 4:14 AM, Haiyang Zhang wrote:
>> From: Tianyu Lan <ltykernel@gmail.com>
>> Sent: Tuesday, December 7, 2021 2:56 AM
>> To: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>; Stephen
>> Hemminger <sthemmin@microsoft.com>; wei.liu@kernel.org; Dexuan Cui <decui@microsoft.com>;
>> tglx@linutronix.de; mingo@redhat.com; bp@alien8.de; dave.hansen@linux.intel.com;
>> x86@kernel.org; hpa@zytor.com; davem@davemloft.net; kuba@kernel.org; jejb@linux.ibm.com;
>> martin.petersen@oracle.com; arnd@arndb.de; hch@infradead.org; m.szyprowski@samsung.com;
>> robin.murphy@arm.com; Tianyu Lan <Tianyu.Lan@microsoft.com>; thomas.lendacky@amd.com;
>> Michael Kelley (LINUX) <mikelley@microsoft.com>
>> Cc: iommu@lists.linux-foundation.org; linux-arch@vger.kernel.org; linux-
>> hyperv@vger.kernel.org; linux-kernel@vger.kernel.org; linux-scsi@vger.kernel.org;
>> netdev@vger.kernel.org; vkuznets <vkuznets@redhat.com>; brijesh.singh@amd.com;
>> konrad.wilk@oracle.com; hch@lst.de; joro@8bytes.org; parri.andrea@gmail.com;
>> dave.hansen@intel.com
>> Subject: [PATCH V6 5/5] net: netvsc: Add Isolation VM support for netvsc driver
>>
>> From: Tianyu Lan <Tianyu.Lan@microsoft.com>
>>
>> In Isolation VM, all shared memory with host needs to mark visible
>> to host via hvcall. vmbus_establish_gpadl() has already done it for
>> netvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_
>> pagebuffer() stills need to be handled. Use DMA API to map/umap
>> these memory during sending/receiving packet and Hyper-V swiotlb
>> bounce buffer dma address will be returned. The swiotlb bounce buffer
>> has been masked to be visible to host during boot up.
>>
>> rx/tx ring buffer is allocated via vzalloc() and they need to be
>> mapped into unencrypted address space(above vTOM) before sharing
>> with host and accessing. Add hv_map/unmap_memory() to map/umap rx
>> /tx ring buffer.
>>
>> Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
>> ---
>> Change since v3:
>>         * Replace HV_HYP_PAGE_SIZE with PAGE_SIZE and virt_to_hvpfn()
>>           with vmalloc_to_pfn() in the hv_map_memory()
>>
>> Change since v2:
>>         * Add hv_map/unmap_memory() to map/umap rx/tx ring buffer.
>> ---
>>   arch/x86/hyperv/ivm.c             |  28 ++++++
>>   drivers/hv/hv_common.c            |  11 +++
>>   drivers/net/hyperv/hyperv_net.h   |   5 ++
>>   drivers/net/hyperv/netvsc.c       | 136 +++++++++++++++++++++++++++++-
>>   drivers/net/hyperv/netvsc_drv.c   |   1 +
>>   drivers/net/hyperv/rndis_filter.c |   2 +
>>   include/asm-generic/mshyperv.h    |   2 +
>>   include/linux/hyperv.h            |   5 ++
>>   8 files changed, 187 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
>> index 69c7a57f3307..2b994117581e 100644
>> --- a/arch/x86/hyperv/ivm.c
>> +++ b/arch/x86/hyperv/ivm.c
>> @@ -287,3 +287,31 @@ int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount,
>> bool visibl
>>   	kfree(pfn_array);
>>   	return ret;
>>   }
>> +
>> +/*
>> + * hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM.
>> + */
>> +void *hv_map_memory(void *addr, unsigned long size)
>> +{
>> +	unsigned long *pfns = kcalloc(size / PAGE_SIZE,
>> +				      sizeof(unsigned long), GFP_KERNEL);
>> +	void *vaddr;
>> +	int i;
>> +
>> +	if (!pfns)
>> +		return NULL;
>> +
>> +	for (i = 0; i < size / PAGE_SIZE; i++)
>> +		pfns[i] = vmalloc_to_pfn(addr + i * PAGE_SIZE) +
>> +			(ms_hyperv.shared_gpa_boundary >> PAGE_SHIFT);
>> +
>> +	vaddr = vmap_pfn(pfns, size / PAGE_SIZE, PAGE_KERNEL_IO);
>> +	kfree(pfns);
>> +
>> +	return vaddr;
>> +}
>> +
>> +void hv_unmap_memory(void *addr)
>> +{
>> +	vunmap(addr);
>> +}
>> diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
>> index 7be173a99f27..3c5cb1f70319 100644
>> --- a/drivers/hv/hv_common.c
>> +++ b/drivers/hv/hv_common.c
>> @@ -295,3 +295,14 @@ u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output,
>> u32 input_s
>>   	return HV_STATUS_INVALID_PARAMETER;
>>   }
>>   EXPORT_SYMBOL_GPL(hv_ghcb_hypercall);
>> +
>> +void __weak *hv_map_memory(void *addr, unsigned long size)
>> +{
>> +	return NULL;
>> +}
>> +EXPORT_SYMBOL_GPL(hv_map_memory);
>> +
>> +void __weak hv_unmap_memory(void *addr)
>> +{
>> +}
>> +EXPORT_SYMBOL_GPL(hv_unmap_memory);
>> diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
>> index 315278a7cf88..cf69da0e296c 100644
>> --- a/drivers/net/hyperv/hyperv_net.h
>> +++ b/drivers/net/hyperv/hyperv_net.h
>> @@ -164,6 +164,7 @@ struct hv_netvsc_packet {
>>   	u32 total_bytes;
>>   	u32 send_buf_index;
>>   	u32 total_data_buflen;
>> +	struct hv_dma_range *dma_range;
>>   };
>>
>>   #define NETVSC_HASH_KEYLEN 40
>> @@ -1074,6 +1075,7 @@ struct netvsc_device {
>>
>>   	/* Receive buffer allocated by us but manages by NetVSP */
>>   	void *recv_buf;
>> +	void *recv_original_buf;
>>   	u32 recv_buf_size; /* allocated bytes */
>>   	struct vmbus_gpadl recv_buf_gpadl_handle;
>>   	u32 recv_section_cnt;
>> @@ -1082,6 +1084,7 @@ struct netvsc_device {
>>
>>   	/* Send buffer allocated by us */
>>   	void *send_buf;
>> +	void *send_original_buf;
>>   	u32 send_buf_size;
>>   	struct vmbus_gpadl send_buf_gpadl_handle;
>>   	u32 send_section_cnt;
>> @@ -1731,4 +1734,6 @@ struct rndis_message {
>>   #define RETRY_US_HI	10000
>>   #define RETRY_MAX	2000	/* >10 sec */
>>
>> +void netvsc_dma_unmap(struct hv_device *hv_dev,
>> +		      struct hv_netvsc_packet *packet);
>>   #endif /* _HYPERV_NET_H */
>> diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
>> index 396bc1c204e6..b7ade735a806 100644
>> --- a/drivers/net/hyperv/netvsc.c
>> +++ b/drivers/net/hyperv/netvsc.c
>> @@ -153,8 +153,21 @@ static void free_netvsc_device(struct rcu_head *head)
>>   	int i;
>>
>>   	kfree(nvdev->extension);
>> -	vfree(nvdev->recv_buf);
>> -	vfree(nvdev->send_buf);
>> +
>> +	if (nvdev->recv_original_buf) {
>> +		hv_unmap_memory(nvdev->recv_buf);
>> +		vfree(nvdev->recv_original_buf);
>> +	} else {
>> +		vfree(nvdev->recv_buf);
>> +	}
>> +
>> +	if (nvdev->send_original_buf) {
>> +		hv_unmap_memory(nvdev->send_buf);
>> +		vfree(nvdev->send_original_buf);
>> +	} else {
>> +		vfree(nvdev->send_buf);
>> +	}
>> +
>>   	kfree(nvdev->send_section_map);
>>
>>   	for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
>> @@ -338,6 +351,7 @@ static int netvsc_init_buf(struct hv_device *device,
>>   	unsigned int buf_size;
>>   	size_t map_words;
>>   	int i, ret = 0;
>> +	void *vaddr;
>>
>>   	/* Get receive buffer area. */
>>   	buf_size = device_info->recv_sections * device_info->recv_section_size;
>> @@ -373,6 +387,17 @@ static int netvsc_init_buf(struct hv_device *device,
>>   		goto cleanup;
>>   	}
>>
>> +	if (hv_isolation_type_snp()) {
>> +		vaddr = hv_map_memory(net_device->recv_buf, buf_size);
>> +		if (!vaddr) {
>> +			ret = -ENOMEM;
>> +			goto cleanup;
>> +		}
>> +
>> +		net_device->recv_original_buf = net_device->recv_buf;
>> +		net_device->recv_buf = vaddr;
>> +	}
>> +
>>   	/* Notify the NetVsp of the gpadl handle */
>>   	init_packet = &net_device->channel_init_pkt;
>>   	memset(init_packet, 0, sizeof(struct nvsp_message));
>> @@ -476,6 +501,17 @@ static int netvsc_init_buf(struct hv_device *device,
>>   		goto cleanup;
>>   	}
>>
>> +	if (hv_isolation_type_snp()) {
>> +		vaddr = hv_map_memory(net_device->send_buf, buf_size);
>> +		if (!vaddr) {
>> +			ret = -ENOMEM;
>> +			goto cleanup;
>> +		}
>> +
>> +		net_device->send_original_buf = net_device->send_buf;
>> +		net_device->send_buf = vaddr;
>> +	}
>> +
>>   	/* Notify the NetVsp of the gpadl handle */
>>   	init_packet = &net_device->channel_init_pkt;
>>   	memset(init_packet, 0, sizeof(struct nvsp_message));
>> @@ -766,7 +802,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
>>
>>   	/* Notify the layer above us */
>>   	if (likely(skb)) {
>> -		const struct hv_netvsc_packet *packet
>> +		struct hv_netvsc_packet *packet
>>   			= (struct hv_netvsc_packet *)skb->cb;
>>   		u32 send_index = packet->send_buf_index;
>>   		struct netvsc_stats *tx_stats;
>> @@ -782,6 +818,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
>>   		tx_stats->bytes += packet->total_bytes;
>>   		u64_stats_update_end(&tx_stats->syncp);
>>
>> +		netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
>>   		napi_consume_skb(skb, budget);
>>   	}
>>
>> @@ -946,6 +983,88 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
>>   		memset(dest, 0, padding);
>>   }
>>
>> +void netvsc_dma_unmap(struct hv_device *hv_dev,
>> +		      struct hv_netvsc_packet *packet)
>> +{
>> +	u32 page_count = packet->cp_partial ?
>> +		packet->page_buf_cnt - packet->rmsg_pgcnt :
>> +		packet->page_buf_cnt;
>> +	int i;
>> +
>> +	if (!hv_is_isolation_supported())
>> +		return;
>> +
>> +	if (!packet->dma_range)
>> +		return;
>> +
>> +	for (i = 0; i < page_count; i++)
>> +		dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma,
>> +				 packet->dma_range[i].mapping_size,
>> +				 DMA_TO_DEVICE);
>> +
>> +	kfree(packet->dma_range);
>> +}
>> +
>> +/* netvsc_dma_map - Map swiotlb bounce buffer with data page of
>> + * packet sent by vmbus_sendpacket_pagebuffer() in the Isolation
>> + * VM.
>> + *
>> + * In isolation VM, netvsc send buffer has been marked visible to
>> + * host and so the data copied to send buffer doesn't need to use
>> + * bounce buffer. The data pages handled by vmbus_sendpacket_pagebuffer()
>> + * may not be copied to send buffer and so these pages need to be
>> + * mapped with swiotlb bounce buffer. netvsc_dma_map() is to do
>> + * that. The pfns in the struct hv_page_buffer need to be converted
>> + * to bounce buffer's pfn. The loop here is necessary because the
>> + * entries in the page buffer array are not necessarily full
>> + * pages of data.  Each entry in the array has a separate offset and
>> + * len that may be non-zero, even for entries in the middle of the
>> + * array.  And the entries are not physically contiguous.  So each
>> + * entry must be individually mapped rather than as a contiguous unit.
>> + * So not use dma_map_sg() here.
>> + */
>> +int netvsc_dma_map(struct hv_device *hv_dev,
>> +		   struct hv_netvsc_packet *packet,
>> +		   struct hv_page_buffer *pb)
>> +{
>> +	u32 page_count =  packet->cp_partial ?
>> +		packet->page_buf_cnt - packet->rmsg_pgcnt :
>> +		packet->page_buf_cnt;
>> +	dma_addr_t dma;
>> +	int i;
>> +
>> +	if (!hv_is_isolation_supported())
>> +		return 0;
>> +
>> +	packet->dma_range = kcalloc(page_count,
>> +				    sizeof(*packet->dma_range),
>> +				    GFP_KERNEL);
>> +	if (!packet->dma_range)
>> +		return -ENOMEM;
>> +
>> +	for (i = 0; i < page_count; i++) {
>> +		char *src = phys_to_virt((pb[i].pfn << HV_HYP_PAGE_SHIFT)
>> +					 + pb[i].offset);
>> +		u32 len = pb[i].len;
>> +
>> +		dma = dma_map_single(&hv_dev->device, src, len,
>> +				     DMA_TO_DEVICE);
>> +		if (dma_mapping_error(&hv_dev->device, dma)) {
>> +			kfree(packet->dma_range);
>> +			return -ENOMEM;
>> +		}
>> +
>> +		/* pb[].offset and pb[].len are not changed during dma mapping
>> +		 * and so not reassign.
>> +		 */
>> +		packet->dma_range[i].dma = dma;
>> +		packet->dma_range[i].mapping_size = len;
>> +		pb[i].pfn = dma >> HV_HYP_PAGE_SHIFT;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>   static inline int netvsc_send_pkt(
>>   	struct hv_device *device,
>>   	struct hv_netvsc_packet *packet,
>> @@ -986,14 +1105,24 @@ static inline int netvsc_send_pkt(
>>
>>   	trace_nvsp_send_pkt(ndev, out_channel, rpkt);
>>
>> +	packet->dma_range = NULL;
>>   	if (packet->page_buf_cnt) {
>>   		if (packet->cp_partial)
>>   			pb += packet->rmsg_pgcnt;
>>
>> +		ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb);
>> +		if (ret) {
>> +			ret = -EAGAIN;
>> +			goto exit;
>> +		}
> 
> Returning EAGAIN will let the upper network layer busy retry,
> which may make things worse.
> I suggest to return ENOSPC here like another place in this
> function, which will just drop the packet, and let the network
> protocol/app layer decide how to recover.
> 

Yes, agree. Will update in the next version. Thanks.


^ permalink raw reply

* Re: [PATCH net-next 10/13] net/smc: add net device tracker to struct smc_pnetentry
From: Karsten Graul @ 2021-12-09  8:05 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski
  Cc: netdev, Eric Dumazet, linux-s390
In-Reply-To: <20211207013039.1868645-11-eric.dumazet@gmail.com>

On 07/12/2021 02:30, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  net/smc/smc_pnet.c | 9 +++++----
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 

Acked-by: Karsten Graul <kgraul@linux.ibm.com>

^ permalink raw reply

* [PATCHv2 bpf-next] samples/bpf:remove unneeded variable
From: cgel.zte @ 2021-12-09  8:00 UTC (permalink / raw)
  To: andrii.nakryiko
  Cc: andrii, ast, bpf, cgel.zte, chi.minghao, daniel, davem, hawk,
	john.fastabend, kafai, kpsingh, kuba, linux-kernel, netdev,
	songliubraving, yhs, zealci
In-Reply-To: <CAEf4BzacuM8cR8Xuv5tdg7=KScVi26pZ3CjewAy=UuHouiRZdg@mail.gmail.com>

From: Minghao Chi <chi.minghao@zte.com.cn>

return value form directly instead of
taking this in another redundant variable.

Reported-by: Zeal Robot <zealci@zte.com.cm>
Signed-off-by: Minghao Chi <chi.minghao@zte.com.cn>
---
 samples/bpf/xdp_redirect_cpu.bpf.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/samples/bpf/xdp_redirect_cpu.bpf.c b/samples/bpf/xdp_redirect_cpu.bpf.c
index f10fe3cf25f6..25e3a405375f 100644
--- a/samples/bpf/xdp_redirect_cpu.bpf.c
+++ b/samples/bpf/xdp_redirect_cpu.bpf.c
@@ -100,7 +100,6 @@ u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
 	void *data     = (void *)(long)ctx->data;
 	struct iphdr *iph = data + nh_off;
 	struct udphdr *udph;
-	u16 dport;
 
 	if (iph + 1 > data_end)
 		return 0;
@@ -111,8 +110,7 @@ u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
 	if (udph + 1 > data_end)
 		return 0;
 
-	dport = bpf_ntohs(udph->dest);
-	return dport;
+	return bpf_ntohs(udph->dest);
 }
 
 static __always_inline
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v2 net-next] tcp: Warn if sock_owned_by_user() is true in tcp_child_process().
From: Eric Dumazet @ 2021-12-09  8:00 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: David S. Miller, Jakub Kicinski, Benjamin Herrenschmidt,
	Kuniyuki Iwashima, netdev
In-Reply-To: <20211209013250.44347-1-kuniyu@amazon.co.jp>

On Wed, Dec 8, 2021 at 5:33 PM Kuniyuki Iwashima <kuniyu@amazon.co.jp> wrote:
>
> While creating a child socket from ACK (not TCP Fast Open case), before
> v2.3.41, we used to call bh_lock_sock() later than now; it was called just
> before tcp_rcv_state_process().  The full socket was put into an accept
> queue and exposed to other CPUs before bh_lock_sock() so that process
> context might have acquired the lock by then.  Thus, we had to check if any
> process context was accessing the socket before tcp_rcv_state_process().
>

I think you misunderstood me.

I think this code is not dead yet, so I would :

Not include a Fixes: tag to avoid unnecessary backports (of a patch
and its revert)

If you want to get syzbot coverage for few releases, especially with
MPTCP and synflood,
you  can then submit a patch like the following.

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cf913a66df17..19da6e442fca 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -843,6 +843,9 @@ int tcp_child_process(struct sock *parent, struct
sock *child,
                 * in main socket hash table and lock on listening
                 * socket does not protect us more.
                 */
+
+               /* Check if this code path is obsolete ? */
+               WARN_ON_ONCE(1);
                __sk_add_backlog(child, skb);
        }

^ permalink raw reply related

* RE: [PATCH V6 4/5] scsi: storvsc: Add Isolation VM support for storvsc driver
From: Long Li @ 2021-12-09  8:00 UTC (permalink / raw)
  To: Tianyu Lan, KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	wei.liu@kernel.org, Dexuan Cui, tglx@linutronix.de,
	mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com,
	x86@kernel.org, hpa@zytor.com, davem@davemloft.net,
	kuba@kernel.org, jejb@linux.ibm.com, martin.petersen@oracle.com,
	arnd@arndb.de, hch@infradead.org, m.szyprowski@samsung.com,
	robin.murphy@arm.com, Tianyu Lan, thomas.lendacky@amd.com,
	Michael Kelley (LINUX)
  Cc: iommu@lists.linux-foundation.org, linux-arch@vger.kernel.org,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-scsi@vger.kernel.org, netdev@vger.kernel.org, vkuznets,
	brijesh.singh@amd.com, konrad.wilk@oracle.com, hch@lst.de,
	joro@8bytes.org, parri.andrea@gmail.com, dave.hansen@intel.com
In-Reply-To: <20211207075602.2452-5-ltykernel@gmail.com>

> Subject: [PATCH V6 4/5] scsi: storvsc: Add Isolation VM support for storvsc driver
> 
> From: Tianyu Lan <Tianyu.Lan@microsoft.com>
> 
> In Isolation VM, all shared memory with host needs to mark visible to host via
> hvcall. vmbus_establish_gpadl() has already done it for storvsc rx/tx ring buffer.
> The page buffer used by vmbus_sendpacket_
> mpb_desc() still needs to be handled. Use DMA API(scsi_dma_map/unmap) to
> map these memory during sending/receiving packet and return swiotlb bounce
> buffer dma address. In Isolation VM, swiotlb  bounce buffer is marked to be
> visible to host and the swiotlb force mode is enabled.
> 
> Set device's dma min align mask to HV_HYP_PAGE_SIZE - 1 in order to keep the
> original data offset in the bounce buffer.
> 
> Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
> ---
>  drivers/hv/vmbus_drv.c     |  4 ++++
>  drivers/scsi/storvsc_drv.c | 37 +++++++++++++++++++++----------------
>  include/linux/hyperv.h     |  1 +
>  3 files changed, 26 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index
> 392c1ac4f819..ae6ec503399a 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -33,6 +33,7 @@
>  #include <linux/random.h>
>  #include <linux/kernel.h>
>  #include <linux/syscore_ops.h>
> +#include <linux/dma-map-ops.h>
>  #include <clocksource/hyperv_timer.h>
>  #include "hyperv_vmbus.h"
> 
> @@ -2078,6 +2079,7 @@ struct hv_device *vmbus_device_create(const guid_t
> *type,
>  	return child_device_obj;
>  }
> 
> +static u64 vmbus_dma_mask = DMA_BIT_MASK(64);
>  /*
>   * vmbus_device_register - Register the child device
>   */
> @@ -2118,6 +2120,8 @@ int vmbus_device_register(struct hv_device
> *child_device_obj)
>  	}
>  	hv_debug_add_dev_dir(child_device_obj);
> 
> +	child_device_obj->device.dma_mask = &vmbus_dma_mask;
> +	child_device_obj->device.dma_parms = &child_device_obj->dma_parms;
>  	return 0;
> 
>  err_kset_unregister:
> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index
> 20595c0ba0ae..ae293600d799 100644
> --- a/drivers/scsi/storvsc_drv.c
> +++ b/drivers/scsi/storvsc_drv.c
> @@ -21,6 +21,8 @@
>  #include <linux/device.h>
>  #include <linux/hyperv.h>
>  #include <linux/blkdev.h>
> +#include <linux/dma-mapping.h>
> +
>  #include <scsi/scsi.h>
>  #include <scsi/scsi_cmnd.h>
>  #include <scsi/scsi_host.h>
> @@ -1336,6 +1338,7 @@ static void storvsc_on_channel_callback(void
> *context)
>  					continue;
>  				}
>  				request = (struct storvsc_cmd_request
> *)scsi_cmd_priv(scmnd);
> +				scsi_dma_unmap(scmnd);
>  			}
> 
>  			storvsc_on_receive(stor_device, packet, request); @@
> -1749,7 +1752,6 @@ static int storvsc_queuecommand(struct Scsi_Host *host,
> struct scsi_cmnd *scmnd)
>  	struct hv_host_device *host_dev = shost_priv(host);
>  	struct hv_device *dev = host_dev->dev;
>  	struct storvsc_cmd_request *cmd_request = scsi_cmd_priv(scmnd);
> -	int i;
>  	struct scatterlist *sgl;
>  	unsigned int sg_count;
>  	struct vmscsi_request *vm_srb;
> @@ -1831,10 +1833,11 @@ static int storvsc_queuecommand(struct Scsi_Host
> *host, struct scsi_cmnd *scmnd)
>  	payload_sz = sizeof(cmd_request->mpb);
> 
>  	if (sg_count) {
> -		unsigned int hvpgoff, hvpfns_to_add;
>  		unsigned long offset_in_hvpg = offset_in_hvpage(sgl->offset);
>  		unsigned int hvpg_count = HVPFN_UP(offset_in_hvpg + length);
> -		u64 hvpfn;
> +		struct scatterlist *sg;
> +		unsigned long hvpfn, hvpfns_to_add;
> +		int j, i = 0;
> 
>  		if (hvpg_count > MAX_PAGE_BUFFER_COUNT) {
> 
> @@ -1848,21 +1851,22 @@ static int storvsc_queuecommand(struct Scsi_Host
> *host, struct scsi_cmnd *scmnd)
>  		payload->range.len = length;
>  		payload->range.offset = offset_in_hvpg;
> 
> +		sg_count = scsi_dma_map(scmnd);
> +		if (sg_count < 0)
> +			return SCSI_MLQUEUE_DEVICE_BUSY;

Hi Tianyu,

This patch (and this patch series) unconditionally adds code for dealing with DMA addresses for all VMs, including non-isolation VMs.

Does this add performance penalty for VMs that don't require isolation?

Long

> 
> -		for (i = 0; sgl != NULL; sgl = sg_next(sgl)) {
> +		for_each_sg(sgl, sg, sg_count, j) {
>  			/*
> -			 * Init values for the current sgl entry. hvpgoff
> -			 * and hvpfns_to_add are in units of Hyper-V size
> -			 * pages. Handling the PAGE_SIZE !=
> HV_HYP_PAGE_SIZE
> -			 * case also handles values of sgl->offset that are
> -			 * larger than PAGE_SIZE. Such offsets are handled
> -			 * even on other than the first sgl entry, provided
> -			 * they are a multiple of PAGE_SIZE.
> +			 * Init values for the current sgl entry. hvpfns_to_add
> +			 * is in units of Hyper-V size pages. Handling the
> +			 * PAGE_SIZE != HV_HYP_PAGE_SIZE case also handles
> +			 * values of sgl->offset that are larger than PAGE_SIZE.
> +			 * Such offsets are handled even on other than the first
> +			 * sgl entry, provided they are a multiple of PAGE_SIZE.
>  			 */
> -			hvpgoff = HVPFN_DOWN(sgl->offset);
> -			hvpfn = page_to_hvpfn(sg_page(sgl)) + hvpgoff;
> -			hvpfns_to_add =	HVPFN_UP(sgl->offset + sgl-
> >length) -
> -						hvpgoff;
> +			hvpfn = HVPFN_DOWN(sg_dma_address(sg));
> +			hvpfns_to_add = HVPFN_UP(sg_dma_address(sg) +
> +						 sg_dma_len(sg)) - hvpfn;
> 
>  			/*
>  			 * Fill the next portion of the PFN array with @@ -
> 1872,7 +1876,7 @@ static int storvsc_queuecommand(struct Scsi_Host *host,
> struct scsi_cmnd *scmnd)
>  			 * the PFN array is filled.
>  			 */
>  			while (hvpfns_to_add--)
> -				payload->range.pfn_array[i++] =	hvpfn++;
> +				payload->range.pfn_array[i++] = hvpfn++;
>  		}
>  	}
> 
> @@ -2016,6 +2020,7 @@ static int storvsc_probe(struct hv_device *device,
>  	stor_device->vmscsi_size_delta = sizeof(struct vmscsi_win8_extension);
>  	spin_lock_init(&stor_device->lock);
>  	hv_set_drvdata(device, stor_device);
> +	dma_set_min_align_mask(&device->device, HV_HYP_PAGE_SIZE - 1);
> 
>  	stor_device->port_number = host->host_no;
>  	ret = storvsc_connect_to_vsp(device, storvsc_ringbuffer_size, is_fc);
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index
> 1f037e114dc8..74f5e92f91a0 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -1261,6 +1261,7 @@ struct hv_device {
> 
>  	struct vmbus_channel *channel;
>  	struct kset	     *channels_kset;
> +	struct device_dma_parameters dma_parms;
> 
>  	/* place holder to keep track of the dir for hv device in debugfs */
>  	struct dentry *debug_dir;
> --
> 2.25.1


^ permalink raw reply

* Re: [PATCH] ath6kl: Use struct_group() to avoid size-mismatched casting
From: Kalle Valo @ 2021-12-09  8:00 UTC (permalink / raw)
  To: Kees Cook
  Cc: Kalle Valo, Kees Cook, David S. Miller, Jakub Kicinski,
	linux-kernel, linux-wireless, netdev, linux-hardening
In-Reply-To: <20211207063538.2767954-1-keescook@chromium.org>

Kees Cook <keescook@chromium.org> wrote:

> In builds with -Warray-bounds, casts from smaller objects to larger
> objects will produce warnings. These can be overly conservative, but since
> -Warray-bounds has been finding legitimate bugs, it is desirable to turn
> it on globally. Instead of casting a u32 to a larger object, redefine
> the u32 portion of the header to a separate struct that can be used for
> both u32 operations and the distinct header fields. Silences this warning:
> 
> drivers/net/wireless/ath/ath6kl/htc_mbox.c: In function 'htc_wait_for_ctrl_msg':
> drivers/net/wireless/ath/ath6kl/htc_mbox.c:2275:20: error: array subscript 'struct htc_frame_hdr[0]' is partly outside array bounds of 'u32[1]' {aka 'unsigned int[1]'} [-Werror=array-bounds]
>  2275 |         if (htc_hdr->eid != ENDPOINT_0)
>       |                    ^~
> drivers/net/wireless/ath/ath6kl/htc_mbox.c:2264:13: note: while referencing 'look_ahead'
>  2264 |         u32 look_ahead;
>       |             ^~~~~~~~~~
> 
> This change results in no executable instruction differences.
> 
> Signed-off-by: Kees Cook <keescook@chromium.org>
> Signed-off-by: Kalle Valo <quic_kvalo@quicinc.com>

Patch applied to ath-next branch of ath.git, thanks.

e3128a9d482c ath6kl: Use struct_group() to avoid size-mismatched casting

-- 
https://patchwork.kernel.org/project/linux-wireless/patch/20211207063538.2767954-1-keescook@chromium.org/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches


^ permalink raw reply

* [PATCH net v2 3/3] net: openvswitch: Fix matching zone id for invalid conns arriving from tc
From: Paul Blakey @ 2021-12-09  7:57 UTC (permalink / raw)
  To: Paul Blakey, dev, netdev, Saeed Mahameed, Cong Wang,
	Jamal Hadi Salim, Pravin B Shelar, davem, Jiri Pirko, wenxu,
	Marcelo Ricardo Leitner
  Cc: Oz Shlomo, Vlad Buslov, Roi Dayan
In-Reply-To: <20211209075734.10199-1-paulb@nvidia.com>

Zone id is not restored if we passed ct and ct rejected the connection,
as there is no ct info on the skb.

Save the zone from tc skb cb to tc skb extension and pass it on to
ovs, use that info to restore the zone id for invalid connections.

Fixes: d29334c15d33 ("net/sched: act_api: fix miss set post_ct for ovs after do conntrack in act_ct")
Signed-off-by: Paul Blakey <paulb@nvidia.com>
---
 include/linux/skbuff.h | 1 +
 net/openvswitch/flow.c | 8 +++++++-
 net/sched/cls_api.c    | 1 +
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 155eb2ec54d8..28ad0c6bd0d5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -286,6 +286,7 @@ struct nf_bridge_info {
 struct tc_skb_ext {
 	__u32 chain;
 	__u16 mru;
+	__u16 zone;
 	bool post_ct;
 };
 #endif
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 9713035b89e3..6d262d9aa10e 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -34,6 +34,7 @@
 #include <net/mpls.h>
 #include <net/ndisc.h>
 #include <net/nsh.h>
+#include <net/netfilter/nf_conntrack_zones.h>
 
 #include "conntrack.h"
 #include "datapath.h"
@@ -860,6 +861,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 #endif
 	bool post_ct = false;
 	int res, err;
+	u16 zone = 0;
 
 	/* Extract metadata from packet. */
 	if (tun_info) {
@@ -898,6 +900,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 		key->recirc_id = tc_ext ? tc_ext->chain : 0;
 		OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0;
 		post_ct = tc_ext ? tc_ext->post_ct : false;
+		zone = post_ct ? tc_ext->zone : 0;
 	} else {
 		key->recirc_id = 0;
 	}
@@ -906,8 +909,11 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 #endif
 
 	err = key_extract(skb, key);
-	if (!err)
+	if (!err) {
 		ovs_ct_fill_key(skb, key, post_ct);   /* Must be after key_extract(). */
+		if (post_ct && !skb_get_nfct(skb))
+			key->ct_zone = zone;
+	}
 	return err;
 }
 
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a5050999d607..bede2bd47065 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1625,6 +1625,7 @@ int tcf_classify(struct sk_buff *skb,
 		ext->chain = last_executed_chain;
 		ext->mru = cb->mru;
 		ext->post_ct = cb->post_ct;
+		ext->zone = cb->zone;
 	}
 
 	return ret;
-- 
2.30.1


^ permalink raw reply related

* [PATCH net v2 2/3] net/sched: flow_dissector: Fix matching on zone id for invalid conns
From: Paul Blakey @ 2021-12-09  7:57 UTC (permalink / raw)
  To: Paul Blakey, dev, netdev, Saeed Mahameed, Cong Wang,
	Jamal Hadi Salim, Pravin B Shelar, davem, Jiri Pirko, wenxu,
	Marcelo Ricardo Leitner
  Cc: Oz Shlomo, Vlad Buslov, Roi Dayan
In-Reply-To: <20211209075734.10199-1-paulb@nvidia.com>

If ct rejects a flow, it removes the conntrack info from the skb.
act_ct sets the post_ct variable so the dissector will see this case
as an +tracked +invalid state, but the zone id is lost with the
conntrack info.

To restore the zone id on such cases, set the last executed zone,
via the tc control block, when passing ct, and read it back in the
dissector if there is no ct info on the skb (invalid connection).

Fixes: 7baf2429a1a9 ("net/sched: cls_flower add CT_FLAGS_INVALID flag support")
Signed-off-by: Paul Blakey <paulb@nvidia.com>
---
 include/linux/skbuff.h    | 3 +--
 include/net/pkt_sched.h   | 1 +
 net/core/flow_dissector.c | 6 +++++-
 net/sched/act_ct.c        | 1 +
 net/sched/cls_flower.c    | 5 ++---
 5 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c8cb7e697d47..155eb2ec54d8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1379,8 +1379,7 @@ void
 skb_flow_dissect_ct(const struct sk_buff *skb,
 		    struct flow_dissector *flow_dissector,
 		    void *target_container,
-		    u16 *ctinfo_map, size_t mapsize,
-		    bool post_ct);
+		    u16 *ctinfo_map, size_t mapsize);
 void
 skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 			     struct flow_dissector *flow_dissector,
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 05f18e81f3e8..9e71691c491b 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -198,6 +198,7 @@ struct tc_skb_cb {
 
 	u16 mru;
 	bool post_ct;
+	u16 zone; /* Only valid if post_ct = true */
 };
 
 static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 3255f57f5131..b52a4370162b 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -12,6 +12,7 @@
 #include <net/gre.h>
 #include <net/pptp.h>
 #include <net/tipc.h>
+#include <net/pkt_sched.h>
 #include <linux/igmp.h>
 #include <linux/icmp.h>
 #include <linux/sctp.h>
@@ -238,10 +239,12 @@ void
 skb_flow_dissect_ct(const struct sk_buff *skb,
 		    struct flow_dissector *flow_dissector,
 		    void *target_container, u16 *ctinfo_map,
-		    size_t mapsize, bool post_ct)
+		    size_t mapsize)
 {
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	bool post_ct = tc_skb_cb(skb)->post_ct;
 	struct flow_dissector_key_ct *key;
+	u16 zone = tc_skb_cb(skb)->zone;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn_labels *cl;
 	struct nf_conn *ct;
@@ -260,6 +263,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb,
 	if (!ct) {
 		key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
 				TCA_FLOWER_KEY_CT_FLAGS_INVALID;
+		key->ct_zone = zone;
 		return;
 	}
 
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 98e248b9c0b1..ab3591408419 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -1049,6 +1049,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 	skb_push_rcsum(skb, nh_ofs);
 
 	tc_skb_cb(skb)->post_ct = true;
+	tc_skb_cb(skb)->zone = p->zone;
 out_clear:
 	if (defrag)
 		qdisc_skb_cb(skb)->pkt_len = skb->len;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 9782b93db1b3..c1a017822c6e 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -310,7 +310,6 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		       struct tcf_result *res)
 {
 	struct cls_fl_head *head = rcu_dereference_bh(tp->root);
-	bool post_ct = tc_skb_cb(skb)->post_ct;
 	struct fl_flow_key skb_key;
 	struct fl_flow_mask *mask;
 	struct cls_fl_filter *f;
@@ -327,8 +326,8 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
 		skb_flow_dissect_ct(skb, &mask->dissector, &skb_key,
 				    fl_ct_info_to_flower_map,
-				    ARRAY_SIZE(fl_ct_info_to_flower_map),
-				    post_ct);
+				    ARRAY_SIZE(fl_ct_info_to_flower_map));
+
 		skb_flow_dissect_hash(skb, &mask->dissector, &skb_key);
 		skb_flow_dissect(skb, &mask->dissector, &skb_key,
 				 FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP);
-- 
2.30.1


^ permalink raw reply related

* [PATCH net v2 1/3] net/sched: Extend qdisc control block with tc control block
From: Paul Blakey @ 2021-12-09  7:57 UTC (permalink / raw)
  To: Paul Blakey, dev, netdev, Saeed Mahameed, Cong Wang,
	Jamal Hadi Salim, Pravin B Shelar, davem, Jiri Pirko, wenxu,
	Marcelo Ricardo Leitner
  Cc: Oz Shlomo, Vlad Buslov, Roi Dayan
In-Reply-To: <20211209075734.10199-1-paulb@nvidia.com>

BPF layer extends the qdisc control block via struct bpf_skb_data_end
and because of that there is no more room to add variables to the
qdisc layer control block without going over the skb->cb size.

Extend the qdisc control block with a tc control block,
and move all tc related variables to there as a pre-step for
extending the tc control block with additional members.

Signed-off-by: Paul Blakey <paulb@nvidia.com>
---
 include/net/pkt_sched.h   | 15 +++++++++++++++
 include/net/sch_generic.h |  2 --
 net/core/dev.c            |  8 ++++----
 net/sched/act_ct.c        | 14 +++++++-------
 net/sched/cls_api.c       |  6 ++++--
 net/sched/cls_flower.c    |  3 ++-
 net/sched/sch_frag.c      |  3 ++-
 7 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index bf79f3a890af..05f18e81f3e8 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -193,4 +193,19 @@ static inline void skb_txtime_consumed(struct sk_buff *skb)
 	skb->tstamp = ktime_set(0, 0);
 }
 
+struct tc_skb_cb {
+	struct qdisc_skb_cb qdisc_cb;
+
+	u16 mru;
+	bool post_ct;
+};
+
+static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb)
+{
+	struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb;
+
+	BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb));
+	return cb;
+}
+
 #endif
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 22179b2fda72..c70e6d2b2fdd 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -447,8 +447,6 @@ struct qdisc_skb_cb {
 	};
 #define QDISC_CB_PRIV_LEN 20
 	unsigned char		data[QDISC_CB_PRIV_LEN];
-	u16			mru;
-	bool			post_ct;
 };
 
 typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);
diff --git a/net/core/dev.c b/net/core/dev.c
index 2a352e668d10..c4708e2487fb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3941,8 +3941,8 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 		return skb;
 
 	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
-	qdisc_skb_cb(skb)->mru = 0;
-	qdisc_skb_cb(skb)->post_ct = false;
+	tc_skb_cb(skb)->mru = 0;
+	tc_skb_cb(skb)->post_ct = false;
 	mini_qdisc_bstats_cpu_update(miniq, skb);
 
 	switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
@@ -5103,8 +5103,8 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 	}
 
 	qdisc_skb_cb(skb)->pkt_len = skb->len;
-	qdisc_skb_cb(skb)->mru = 0;
-	qdisc_skb_cb(skb)->post_ct = false;
+	tc_skb_cb(skb)->mru = 0;
+	tc_skb_cb(skb)->post_ct = false;
 	skb->tc_at_ingress = 1;
 	mini_qdisc_bstats_cpu_update(miniq, skb);
 
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 90866ae45573..98e248b9c0b1 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -690,10 +690,10 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 				   u8 family, u16 zone, bool *defrag)
 {
 	enum ip_conntrack_info ctinfo;
-	struct qdisc_skb_cb cb;
 	struct nf_conn *ct;
 	int err = 0;
 	bool frag;
+	u16 mru;
 
 	/* Previously seen (loopback)? Ignore. */
 	ct = nf_ct_get(skb, &ctinfo);
@@ -708,7 +708,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 		return err;
 
 	skb_get(skb);
-	cb = *qdisc_skb_cb(skb);
+	mru = tc_skb_cb(skb)->mru;
 
 	if (family == NFPROTO_IPV4) {
 		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
@@ -722,7 +722,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 
 		if (!err) {
 			*defrag = true;
-			cb.mru = IPCB(skb)->frag_max_size;
+			mru = IPCB(skb)->frag_max_size;
 		}
 	} else { /* NFPROTO_IPV6 */
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
@@ -735,7 +735,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 
 		if (!err) {
 			*defrag = true;
-			cb.mru = IP6CB(skb)->frag_max_size;
+			mru = IP6CB(skb)->frag_max_size;
 		}
 #else
 		err = -EOPNOTSUPP;
@@ -744,7 +744,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 	}
 
 	if (err != -EINPROGRESS)
-		*qdisc_skb_cb(skb) = cb;
+		tc_skb_cb(skb)->mru = mru;
 	skb_clear_hash(skb);
 	skb->ignore_df = 1;
 	return err;
@@ -963,7 +963,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 	tcf_action_update_bstats(&c->common, skb);
 
 	if (clear) {
-		qdisc_skb_cb(skb)->post_ct = false;
+		tc_skb_cb(skb)->post_ct = false;
 		ct = nf_ct_get(skb, &ctinfo);
 		if (ct) {
 			nf_conntrack_put(&ct->ct_general);
@@ -1048,7 +1048,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 out_push:
 	skb_push_rcsum(skb, nh_ofs);
 
-	qdisc_skb_cb(skb)->post_ct = true;
+	tc_skb_cb(skb)->post_ct = true;
 out_clear:
 	if (defrag)
 		qdisc_skb_cb(skb)->pkt_len = skb->len;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 2ef8f5a6205a..a5050999d607 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1617,12 +1617,14 @@ int tcf_classify(struct sk_buff *skb,
 
 	/* If we missed on some chain */
 	if (ret == TC_ACT_UNSPEC && last_executed_chain) {
+		struct tc_skb_cb *cb = tc_skb_cb(skb);
+
 		ext = tc_skb_ext_alloc(skb);
 		if (WARN_ON_ONCE(!ext))
 			return TC_ACT_SHOT;
 		ext->chain = last_executed_chain;
-		ext->mru = qdisc_skb_cb(skb)->mru;
-		ext->post_ct = qdisc_skb_cb(skb)->post_ct;
+		ext->mru = cb->mru;
+		ext->post_ct = cb->post_ct;
 	}
 
 	return ret;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index aab13ba11767..9782b93db1b3 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -19,6 +19,7 @@
 
 #include <net/sch_generic.h>
 #include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
 #include <net/ip.h>
 #include <net/flow_dissector.h>
 #include <net/geneve.h>
@@ -309,7 +310,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		       struct tcf_result *res)
 {
 	struct cls_fl_head *head = rcu_dereference_bh(tp->root);
-	bool post_ct = qdisc_skb_cb(skb)->post_ct;
+	bool post_ct = tc_skb_cb(skb)->post_ct;
 	struct fl_flow_key skb_key;
 	struct fl_flow_mask *mask;
 	struct cls_fl_filter *f;
diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c
index 8c06381391d6..5ded4c8672a6 100644
--- a/net/sched/sch_frag.c
+++ b/net/sched/sch_frag.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 #include <net/netlink.h>
 #include <net/sch_generic.h>
+#include <net/pkt_sched.h>
 #include <net/dst.h>
 #include <net/ip.h>
 #include <net/ip6_fib.h>
@@ -137,7 +138,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
 
 int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb))
 {
-	u16 mru = qdisc_skb_cb(skb)->mru;
+	u16 mru = tc_skb_cb(skb)->mru;
 	int err;
 
 	if (mru && skb->len > mru + skb->dev->hard_header_len)
-- 
2.30.1


^ permalink raw reply related

* [PATCH net v2 0/3] net/sched: Fix ct zone matching for invalid conntrack state
From: Paul Blakey @ 2021-12-09  7:57 UTC (permalink / raw)
  To: Paul Blakey, dev, netdev, Saeed Mahameed, Cong Wang,
	Jamal Hadi Salim, Pravin B Shelar, davem, Jiri Pirko, wenxu,
	Marcelo Ricardo Leitner
  Cc: Oz Shlomo, Vlad Buslov, Roi Dayan

Hi,

Currently, when a packet is marked as invalid conntrack_in in act_ct,
post_ct will be set, and connection info (nf_conn) will be removed
from the skb. Later openvswitch and flower matching will parse this
as ct_state=+trk+inv. But because the connection info is missing,
there is also no zone info to match against even though the packet
is tracked.

This series fixes that, by passing the last executed zone by act_ct.
The zone info is passed along from act_ct to the ct flow dissector
(used by flower to extract zone info) and to ovs, the same way as post_ct
is passed, via qdisc layer skb cb to dissector, and via skb extension
to OVS.

Since adding any more data to qdisc skb cb, there will be no room 
for BPF skb cb to extend it and stay under skb->cb size, this series
moves the tc related info from within qdisc skb cb to a tc specific cb
that also extends it.

---
Changelog:
	1->2:
	  Cover letter wording
	  Added blamed CCs

Paul Blakey (3):
  net/sched: Extend qdisc control block with tc control block
  net/sched: flow_dissector: Fix matching on zone id for invalid conns
  net: openvswitch: Fix matching zone id for invalid conns arriving from tc

 include/linux/skbuff.h    |  4 ++--
 include/net/pkt_sched.h   | 16 ++++++++++++++++
 include/net/sch_generic.h |  2 --
 net/core/dev.c            |  8 ++++----
 net/core/flow_dissector.c |  6 +++++-
 net/openvswitch/flow.c    |  8 +++++++-
 net/sched/act_ct.c        | 15 ++++++++-------
 net/sched/cls_api.c       |  7 +++++--
 net/sched/cls_flower.c    |  6 +++---
 net/sched/sch_frag.c      |  3 ++-
 10 files changed, 52 insertions(+), 23 deletions(-)

-- 
2.30.1


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox