Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next iproute2 v2 2/3] tc: Introduce tc ct action
From: Paul Blakey @ 2019-07-11  8:14 UTC (permalink / raw)
  To: Jiri Pirko, Paul Blakey, Roi Dayan, Yossi Kuperman, Oz Shlomo,
	Marcelo Ricardo Leitner, netdev, David Miller, Aaron Conole,
	Zhike Wang
  Cc: Rony Efraim, nst-kernel, John Hurley, Simon Horman, Justin Pettit
In-Reply-To: <1562832867-32347-1-git-send-email-paulb@mellanox.com>

New tc action to send packets to conntrack module, commit
them, and set a zone, labels, mark, and nat on the connection.

It can also clear the packet's conntrack state by using clear.

Usage:
   ct clear
   ct commit [force] [zone] [mark] [label] [nat]
   ct [nat] [zone]

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Yossi Kuperman <yossiku@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Roi Dayan <roid@mellanox.com>
---
 include/uapi/linux/tc_act/tc_ct.h |  41 ++++
 tc/Makefile                       |   1 +
 tc/m_ct.c                         | 497 ++++++++++++++++++++++++++++++++++++++
 tc/tc_util.c                      |  44 ++++
 tc/tc_util.h                      |   4 +
 5 files changed, 587 insertions(+)
 create mode 100644 include/uapi/linux/tc_act/tc_ct.h
 create mode 100644 tc/m_ct.c

diff --git a/include/uapi/linux/tc_act/tc_ct.h b/include/uapi/linux/tc_act/tc_ct.h
new file mode 100644
index 0000000..5fb1d7a
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_ct.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __UAPI_TC_CT_H
+#define __UAPI_TC_CT_H
+
+#include <linux/types.h>
+#include <linux/pkt_cls.h>
+
+enum {
+	TCA_CT_UNSPEC,
+	TCA_CT_PARMS,
+	TCA_CT_TM,
+	TCA_CT_ACTION,		/* u16 */
+	TCA_CT_ZONE,		/* u16 */
+	TCA_CT_MARK,		/* u32 */
+	TCA_CT_MARK_MASK,	/* u32 */
+	TCA_CT_LABELS,		/* u128 */
+	TCA_CT_LABELS_MASK,	/* u128 */
+	TCA_CT_NAT_IPV4_MIN,	/* be32 */
+	TCA_CT_NAT_IPV4_MAX,	/* be32 */
+	TCA_CT_NAT_IPV6_MIN,	/* struct in6_addr */
+	TCA_CT_NAT_IPV6_MAX,	/* struct in6_addr */
+	TCA_CT_NAT_PORT_MIN,	/* be16 */
+	TCA_CT_NAT_PORT_MAX,	/* be16 */
+	TCA_CT_PAD,
+	__TCA_CT_MAX
+};
+
+#define TCA_CT_MAX (__TCA_CT_MAX - 1)
+
+#define TCA_CT_ACT_COMMIT	(1 << 0)
+#define TCA_CT_ACT_FORCE	(1 << 1)
+#define TCA_CT_ACT_CLEAR	(1 << 2)
+#define TCA_CT_ACT_NAT		(1 << 3)
+#define TCA_CT_ACT_NAT_SRC	(1 << 4)
+#define TCA_CT_ACT_NAT_DST	(1 << 5)
+
+struct tc_ct {
+	tc_gen;
+};
+
+#endif /* __UAPI_TC_CT_H */
diff --git a/tc/Makefile b/tc/Makefile
index 09ff369..14171a2 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -53,6 +53,7 @@ TCMODULES += m_ctinfo.o
 TCMODULES += m_bpf.o
 TCMODULES += m_tunnel_key.o
 TCMODULES += m_sample.o
+TCMODULES += m_ct.o
 TCMODULES += p_ip.o
 TCMODULES += p_ip6.o
 TCMODULES += p_icmp.o
diff --git a/tc/m_ct.c b/tc/m_ct.c
new file mode 100644
index 0000000..8589cb9
--- /dev/null
+++ b/tc/m_ct.c
@@ -0,0 +1,497 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* -
+ * m_ct.c     Connection tracking action
+ *
+ * Authors:   Paul Blakey <paulb@mellanox.com>
+ *            Yossi Kuperman <yossiku@mellanox.com>
+ *            Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include "utils.h"
+#include "tc_util.h"
+#include <linux/tc_act/tc_ct.h>
+
+static void
+usage(void)
+{
+	fprintf(stderr,
+		"Usage: ct clear\n"
+		"	ct commit [force] [zone ZONE] [mark MASKED_MARK] [label MASKED_LABEL] [nat NAT_SPEC]\n"
+		"	ct [nat] [zone ZONE]\n"
+		"Where: ZONE is the conntrack zone table number\n"
+		"	NAT_SPEC is {src|dst} addr addr1[-addr2] [port port1[-port2]]\n"
+		"\n");
+	exit(-1);
+}
+
+static int ct_parse_nat_addr_range(const char *str, struct nlmsghdr *n)
+{
+	inet_prefix addr = { .family = AF_UNSPEC, };
+	char *addr1, *addr2 = 0;
+	SPRINT_BUF(buffer);
+	int attr;
+	int ret;
+
+	strncpy(buffer, str, sizeof(buffer) - 1);
+
+	addr1 = buffer;
+	addr2 = strchr(addr1, '-');
+	if (addr2) {
+		*addr2 = '\0';
+		addr2++;
+	}
+
+	ret = get_addr(&addr, addr1, AF_UNSPEC);
+	if (ret)
+		return ret;
+	attr = addr.family == AF_INET ? TCA_CT_NAT_IPV4_MIN :
+					TCA_CT_NAT_IPV6_MIN;
+	addattr_l(n, MAX_MSG, attr, addr.data, addr.bytelen);
+
+	if (addr2) {
+		ret = get_addr(&addr, addr2, addr.family);
+		if (ret)
+			return ret;
+	}
+	attr = addr.family == AF_INET ? TCA_CT_NAT_IPV4_MAX :
+					TCA_CT_NAT_IPV6_MAX;
+	addattr_l(n, MAX_MSG, attr, addr.data, addr.bytelen);
+
+	return 0;
+}
+
+static int ct_parse_nat_port_range(const char *str, struct nlmsghdr *n)
+{
+	char *port1, *port2 = 0;
+	SPRINT_BUF(buffer);
+	__be16 port;
+	int ret;
+
+	strncpy(buffer, str, sizeof(buffer) - 1);
+
+	port1 = buffer;
+	port2 = strchr(port1, '-');
+	if (port2) {
+		*port2 = '\0';
+		port2++;
+	}
+
+	ret = get_be16(&port, port1, 10);
+	if (ret)
+		return -1;
+	addattr16(n, MAX_MSG, TCA_CT_NAT_PORT_MIN, port);
+
+	if (port2) {
+		ret = get_be16(&port, port2, 10);
+		if (ret)
+			return -1;
+	}
+	addattr16(n, MAX_MSG, TCA_CT_NAT_PORT_MAX, port);
+
+	return 0;
+}
+
+
+static int ct_parse_u16(char *str, int value_type, int mask_type,
+			struct nlmsghdr *n)
+{
+	__u16 value, mask;
+	char *slash = 0;
+
+	if (mask_type != TCA_CT_UNSPEC) {
+		slash = strchr(str, '/');
+		if (slash)
+			*slash = '\0';
+	}
+
+	if (get_u16(&value, str, 0))
+		return -1;
+
+	if (slash) {
+		if (get_u16(&mask, slash + 1, 0))
+			return -1;
+	} else {
+		mask = UINT16_MAX;
+	}
+
+	addattr16(n, MAX_MSG, value_type, value);
+	if (mask_type != TCA_CT_UNSPEC)
+		addattr16(n, MAX_MSG, mask_type, mask);
+
+	return 0;
+}
+
+static int ct_parse_u32(char *str, int value_type, int mask_type,
+			struct nlmsghdr *n)
+{
+	__u32 value, mask;
+	char *slash;
+
+	slash = strchr(str, '/');
+	if (slash)
+		*slash = '\0';
+
+	if (get_u32(&value, str, 0))
+		return -1;
+
+	if (slash) {
+		if (get_u32(&mask, slash + 1, 0))
+			return -1;
+	} else {
+		mask = UINT32_MAX;
+	}
+
+	addattr32(n, MAX_MSG, value_type, value);
+	addattr32(n, MAX_MSG, mask_type, mask);
+
+	return 0;
+}
+
+static int ct_parse_mark(char *str, struct nlmsghdr *n)
+{
+	return ct_parse_u32(str, TCA_CT_MARK, TCA_CT_MARK_MASK, n);
+}
+
+static int ct_parse_labels(char *str, struct nlmsghdr *n)
+{
+#define LABELS_SIZE	16
+	uint8_t labels[LABELS_SIZE], lmask[LABELS_SIZE];
+	char *slash, *mask = NULL;
+	size_t slen, slen_mask = 0;
+
+	slash = index(str, '/');
+	if (slash) {
+		*slash = 0;
+		mask = slash+1;
+		slen_mask = strlen(mask);
+	}
+
+	slen = strlen(str);
+	if (slen > LABELS_SIZE*2 || slen_mask > LABELS_SIZE*2) {
+		char errmsg[128];
+
+		snprintf(errmsg, sizeof(errmsg),
+				"%zd Max allowed size %d",
+				slen, LABELS_SIZE*2);
+		invarg(errmsg, str);
+	}
+
+	if (hex2mem(str, labels, slen/2) < 0)
+		invarg("ct: labels must be a hex string\n", str);
+	addattr_l(n, MAX_MSG, TCA_CT_LABELS, labels, slen/2);
+
+	if (mask) {
+		if (hex2mem(mask, lmask, slen_mask/2) < 0)
+			invarg("ct: labels mask must be a hex string\n", mask);
+	} else {
+		memset(lmask, 0xff, sizeof(lmask));
+		slen_mask = sizeof(lmask)*2;
+	}
+	addattr_l(n, MAX_MSG, TCA_CT_LABELS_MASK, lmask, slen_mask/2);
+
+	return 0;
+}
+
+static int
+parse_ct(struct action_util *a, int *argc_p, char ***argv_p, int tca_id,
+		struct nlmsghdr *n)
+{
+	struct tc_ct sel = {};
+	char **argv = *argv_p;
+	struct rtattr *tail;
+	int argc = *argc_p;
+	int ct_action = 0;
+	int ret;
+
+	tail = addattr_nest(n, MAX_MSG, tca_id);
+
+	if (argc && matches(*argv, "ct") == 0)
+		NEXT_ARG_FWD();
+
+	while (argc > 0) {
+		if (matches(*argv, "zone") == 0) {
+			NEXT_ARG();
+
+			if (ct_parse_u16(*argv,
+					 TCA_CT_ZONE, TCA_CT_UNSPEC, n)) {
+				fprintf(stderr, "ct: Illegal \"zone\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "nat") == 0) {
+			ct_action |= TCA_CT_ACT_NAT;
+
+			NEXT_ARG();
+			if (matches(*argv, "src") == 0)
+				ct_action |= TCA_CT_ACT_NAT_SRC;
+			else if (matches(*argv, "dst") == 0)
+				ct_action |= TCA_CT_ACT_NAT_DST;
+			else
+				continue;
+
+			NEXT_ARG();
+			if (matches(*argv, "addr") != 0)
+				usage();
+
+			NEXT_ARG();
+			ret = ct_parse_nat_addr_range(*argv, n);
+			if (ret) {
+				fprintf(stderr, "ct: Illegal nat address range\n");
+				return -1;
+			}
+
+			NEXT_ARG_FWD();
+			if (matches(*argv, "port") != 0)
+				continue;
+
+			NEXT_ARG();
+			ret = ct_parse_nat_port_range(*argv, n);
+			if (ret) {
+				fprintf(stderr, "ct: Illegal nat port range\n");
+				return -1;
+			}
+		} else if (matches(*argv, "clear") == 0) {
+			ct_action |= TCA_CT_ACT_CLEAR;
+		} else if (matches(*argv, "commit") == 0) {
+			ct_action |= TCA_CT_ACT_COMMIT;
+		} else if (matches(*argv, "force") == 0) {
+			ct_action |= TCA_CT_ACT_FORCE;
+		} else if (matches(*argv, "index") == 0) {
+			NEXT_ARG();
+			if (get_u32(&sel.index, *argv, 10)) {
+				fprintf(stderr, "ct: Illegal \"index\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "mark") == 0) {
+			NEXT_ARG();
+
+			ret = ct_parse_mark(*argv, n);
+			if (ret) {
+				fprintf(stderr, "ct: Illegal \"mark\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "label") == 0) {
+			NEXT_ARG();
+
+			ret = ct_parse_labels(*argv, n);
+			if (ret) {
+				fprintf(stderr, "ct: Illegal \"label\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "help") == 0) {
+			usage();
+		} else {
+			break;
+		}
+		NEXT_ARG_FWD();
+	}
+
+	if (ct_action & TCA_CT_ACT_CLEAR &&
+	    ct_action & ~TCA_CT_ACT_CLEAR) {
+		fprintf(stderr, "ct: clear can only be used alone\n");
+		return -1;
+	}
+
+	if (ct_action & TCA_CT_ACT_NAT_SRC &&
+	    ct_action & TCA_CT_ACT_NAT_DST) {
+		fprintf(stderr, "ct: src and dst nat can't be used together\n");
+		return -1;
+	}
+
+	if ((ct_action & TCA_CT_ACT_COMMIT) &&
+	    (ct_action & TCA_CT_ACT_NAT) &&
+	    !(ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST))) {
+		fprintf(stderr, "ct: commit and nat must set src or dst\n");
+		return -1;
+	}
+
+	if (!(ct_action & TCA_CT_ACT_COMMIT) &&
+	    (ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST))) {
+		fprintf(stderr, "ct: src or dst is only valid if commit is set\n");
+		return -1;
+	}
+
+	parse_action_control_dflt(&argc, &argv, &sel.action, false,
+				  TC_ACT_PIPE);
+	NEXT_ARG_FWD();
+
+	addattr16(n, MAX_MSG, TCA_CT_ACTION, ct_action);
+	addattr_l(n, MAX_MSG, TCA_CT_PARMS, &sel, sizeof(sel));
+	addattr_nest_end(n, tail);
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return 0;
+}
+
+static int ct_sprint_port(char *buf, const char *prefix, struct rtattr *attr)
+{
+	if (!attr)
+		return 0;
+
+	return sprintf(buf, "%s%d", prefix, rta_getattr_be16(attr));
+}
+
+static int ct_sprint_ip_addr(char *buf, const char *prefix,
+			     struct rtattr *attr)
+{
+	int family;
+	size_t len;
+
+	if (!attr)
+		return 0;
+
+	len = RTA_PAYLOAD(attr);
+
+	if (len == 4)
+		family = AF_INET;
+	else if (len == 16)
+		family = AF_INET6;
+	else
+		return 0;
+
+	return sprintf(buf, "%s%s", prefix, rt_addr_n2a_rta(family, attr));
+}
+
+static void ct_print_nat(int ct_action, struct rtattr **tb)
+{
+	size_t done = 0;
+	char out[256] = "";
+	bool nat;
+
+	if (!(ct_action & TCA_CT_ACT_NAT))
+		return;
+
+	if (ct_action & TCA_CT_ACT_NAT_SRC) {
+		nat = true;
+		done += sprintf(out + done, "src");
+	} else if (ct_action & TCA_CT_ACT_NAT_DST) {
+		nat = true;
+		done += sprintf(out + done, "dst");
+	}
+
+	if (nat) {
+		done += ct_sprint_ip_addr(out + done, " addr ",
+					  tb[TCA_CT_NAT_IPV4_MIN]);
+		done += ct_sprint_ip_addr(out + done, " addr ",
+					  tb[TCA_CT_NAT_IPV6_MIN]);
+		if (tb[TCA_CT_NAT_IPV4_MAX] &&
+		    memcmp(RTA_DATA(tb[TCA_CT_NAT_IPV4_MIN]),
+			   RTA_DATA(tb[TCA_CT_NAT_IPV4_MAX]), 4))
+			done += ct_sprint_ip_addr(out + done, "-",
+						  tb[TCA_CT_NAT_IPV4_MAX]);
+		else if (tb[TCA_CT_NAT_IPV6_MAX] &&
+			    memcmp(RTA_DATA(tb[TCA_CT_NAT_IPV6_MIN]),
+				   RTA_DATA(tb[TCA_CT_NAT_IPV6_MAX]), 16))
+			done += ct_sprint_ip_addr(out + done, "-",
+						  tb[TCA_CT_NAT_IPV6_MAX]);
+		done += ct_sprint_port(out + done, " port ",
+				       tb[TCA_CT_NAT_PORT_MIN]);
+		if (tb[TCA_CT_NAT_PORT_MAX] &&
+		    memcmp(RTA_DATA(tb[TCA_CT_NAT_PORT_MIN]),
+			   RTA_DATA(tb[TCA_CT_NAT_PORT_MAX]), 2))
+			done += ct_sprint_port(out + done, "-",
+					       tb[TCA_CT_NAT_PORT_MAX]);
+	}
+
+	if (done)
+		print_string(PRINT_ANY, "nat", " nat %s", out);
+	else
+		print_string(PRINT_ANY, "nat", " nat", "");
+}
+
+static void ct_print_labels(struct rtattr *attr,
+			    struct rtattr *mask_attr)
+{
+	const unsigned char *str;
+	bool print_mask = false;
+	char out[256], *p;
+	int data_len, i;
+
+	if (!attr)
+		return;
+
+	data_len = RTA_PAYLOAD(attr);
+	hexstring_n2a(RTA_DATA(attr), data_len, out, sizeof(out));
+	p = out + data_len*2;
+
+	data_len = RTA_PAYLOAD(attr);
+	str = RTA_DATA(mask_attr);
+	if (data_len != 16)
+		print_mask = true;
+	for (i = 0; !print_mask && i < data_len; i++) {
+		if (str[i] != 0xff)
+			print_mask = true;
+	}
+	if (print_mask) {
+		*p++ = '/';
+		hexstring_n2a(RTA_DATA(mask_attr), data_len, p,
+			      sizeof(out)-(p-out));
+		p += data_len*2;
+	}
+	*p = '\0';
+
+	print_string(PRINT_ANY, "label", " label %s", out);
+}
+
+static int print_ct(struct action_util *au, FILE *f, struct rtattr *arg)
+{
+	struct rtattr *tb[TCA_CT_MAX + 1];
+	const char *commit;
+	struct tc_ct *p;
+	int ct_action = 0;
+
+	if (arg == NULL)
+		return -1;
+
+	parse_rtattr_nested(tb, TCA_CT_MAX, arg);
+	if (tb[TCA_CT_PARMS] == NULL) {
+		print_string(PRINT_FP, NULL, "%s", "[NULL ct parameters]");
+		return -1;
+	}
+
+	p = RTA_DATA(tb[TCA_CT_PARMS]);
+
+	print_string(PRINT_ANY, "kind", "%s", "ct");
+
+	if (tb[TCA_CT_ACTION])
+		ct_action = rta_getattr_u16(tb[TCA_CT_ACTION]);
+	if (ct_action & TCA_CT_ACT_COMMIT) {
+		commit = ct_action & TCA_CT_ACT_FORCE ?
+			 "commit force" : "commit";
+		print_string(PRINT_ANY, "action", " %s", commit);
+	} else if (ct_action & TCA_CT_ACT_CLEAR) {
+		print_string(PRINT_ANY, "action", " %s", "clear");
+	}
+
+	print_masked_u32("mark", tb[TCA_CT_MARK], tb[TCA_CT_MARK_MASK]);
+	print_masked_u16("zone", tb[TCA_CT_ZONE], NULL);
+	ct_print_labels(tb[TCA_CT_LABELS], tb[TCA_CT_LABELS_MASK]);
+	ct_print_nat(ct_action, tb);
+
+	print_action_control(f, " ", p->action, "");
+
+	print_uint(PRINT_ANY, "index", "\n\t index %u", p->index);
+	print_int(PRINT_ANY, "ref", " ref %d", p->refcnt);
+	print_int(PRINT_ANY, "bind", " bind %d", p->bindcnt);
+
+	if (show_stats) {
+		if (tb[TCA_CT_TM]) {
+			struct tcf_t *tm = RTA_DATA(tb[TCA_CT_TM]);
+
+			print_tm(f, tm);
+		}
+	}
+	print_string(PRINT_FP, NULL, "%s", "\n ");
+
+	return 0;
+}
+
+struct action_util ct_action_util = {
+	.id = "ct",
+	.parse_aopt = parse_ct,
+	.print_aopt = print_ct,
+};
diff --git a/tc/tc_util.c b/tc/tc_util.c
index 53d15e0..8e461ba 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -913,3 +913,47 @@ compat_xstats:
 	if (tb[TCA_XSTATS] && xstats)
 		*xstats = tb[TCA_XSTATS];
 }
+
+void print_masked_u32(const char *name, struct rtattr *attr,
+		      struct rtattr *mask_attr)
+{
+	__u32 value, mask;
+	SPRINT_BUF(namefrm);
+	SPRINT_BUF(out);
+	size_t done;
+
+	if (!attr)
+		return;
+
+	value = rta_getattr_u32(attr);
+	mask = mask_attr ? rta_getattr_u32(mask_attr) : UINT32_MAX;
+
+	done = sprintf(out, "%u", value);
+	if (mask != UINT32_MAX)
+		sprintf(out + done, "/0x%x", mask);
+
+	sprintf(namefrm, " %s %%s", name);
+	print_string(PRINT_ANY, name, namefrm, out);
+}
+
+void print_masked_u16(const char *name, struct rtattr *attr,
+		      struct rtattr *mask_attr)
+{
+	__u16 value, mask;
+	SPRINT_BUF(namefrm);
+	SPRINT_BUF(out);
+	size_t done;
+
+	if (!attr)
+		return;
+
+	value = rta_getattr_u16(attr);
+	mask = mask_attr ? rta_getattr_u16(mask_attr) : UINT16_MAX;
+
+	done = sprintf(out, "%u", value);
+	if (mask != UINT16_MAX)
+		sprintf(out + done, "/0x%x", mask);
+
+	sprintf(namefrm, " %s %%s", name);
+	print_string(PRINT_ANY, name, namefrm, out);
+}
diff --git a/tc/tc_util.h b/tc/tc_util.h
index eb4b60d..0c3425a 100644
--- a/tc/tc_util.h
+++ b/tc/tc_util.h
@@ -127,4 +127,8 @@ int action_a2n(char *arg, int *result, bool allow_num);
 
 bool tc_qdisc_block_exists(__u32 block_index);
 
+void print_masked_u32(const char *name, struct rtattr *attr,
+		      struct rtattr *mask_attr);
+void print_masked_u16(const char *name, struct rtattr *attr,
+		      struct rtattr *mask_attr);
 #endif
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next iproute2 v2 1/3] tc: add NLA_F_NESTED flag to all actions options nested block
From: Paul Blakey @ 2019-07-11  8:14 UTC (permalink / raw)
  To: Jiri Pirko, Paul Blakey, Roi Dayan, Yossi Kuperman, Oz Shlomo,
	Marcelo Ricardo Leitner, netdev, David Miller, Aaron Conole,
	Zhike Wang
  Cc: Rony Efraim, nst-kernel, John Hurley, Simon Horman, Justin Pettit
In-Reply-To: <1562832867-32347-1-git-send-email-paulb@mellanox.com>

Strict netlink validation now requires this flag on all nested
attributes, so add it for action options.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
---
 tc/m_action.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index ab6bc0a..2d36a69 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -214,7 +214,8 @@ done0:
 			tail = addattr_nest(n, MAX_MSG, ++prio);
 			addattr_l(n, MAX_MSG, TCA_ACT_KIND, k, strlen(k) + 1);
 
-			ret = a->parse_aopt(a, &argc, &argv, TCA_ACT_OPTIONS,
+			ret = a->parse_aopt(a, &argc, &argv,
+					    TCA_ACT_OPTIONS | NLA_F_NESTED,
 					    n);
 
 			if (ret < 0) {
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next iproute2 v2 0/3] net/sched: Introduce tc connection tracking
From: Paul Blakey @ 2019-07-11  8:14 UTC (permalink / raw)
  To: Jiri Pirko, Paul Blakey, Roi Dayan, Yossi Kuperman, Oz Shlomo,
	Marcelo Ricardo Leitner, netdev, David Miller, Aaron Conole,
	Zhike Wang
  Cc: Rony Efraim, nst-kernel, John Hurley, Simon Horman, Justin Pettit

Hi,

This patch series adds connection tracking capabilities to tc.
It does so via a new tc action, called act_ct, and new tc flower classifier matching.
The act_ct action and the relevant flower matches are still under review on the net-next mailing list.

Usage is as follows:
$ tc qdisc add dev ens1f0_0 ingress
$ tc qdisc add dev ens1f0_1 ingress

$ tc filter add dev ens1f0_0 ingress \
  prio 1 chain 0 proto ip \
  flower ip_proto tcp ct_state -trk \
  action ct zone 2 pipe \
  action goto chain 2
$ tc filter add dev ens1f0_0 ingress \
  prio 1 chain 2 proto ip \
  flower ct_state +trk+new \
  action ct zone 2 commit mark 0xbb nat src addr 5.5.5.7 pipe \
  action mirred egress redirect dev ens1f0_1
$ tc filter add dev ens1f0_0 ingress \
  prio 1 chain 2 proto ip \
  flower ct_zone 2 ct_mark 0xbb ct_state +trk+est \
  action ct nat pipe \
  action mirred egress redirect dev ens1f0_1

$ tc filter add dev ens1f0_1 ingress \
  prio 1 chain 0 proto ip \
  flower ip_proto tcp ct_state -trk \
  action ct zone 2 pipe \
  action goto chain 1
$ tc filter add dev ens1f0_1 ingress \
  prio 1 chain 1 proto ip \
  flower ct_zone 2 ct_mark 0xbb ct_state +trk+est \
  action ct nat pipe \
  action mirred egress redirect dev ens1f0_0

Changelog:
V1->V2:
	Removed pkt_cls changes (as it was merged already)

Paul Blakey (3):
  tc: add NLA_F_NESTED flag to all actions options nested block
  tc: Introduce tc ct action
  tc: flower: Add matching on conntrack info

 include/uapi/linux/tc_act/tc_ct.h |  41 ++++
 man/man8/tc-flower.8              |  35 +++
 tc/Makefile                       |   1 +
 tc/f_flower.c                     | 276 ++++++++++++++++++++-
 tc/m_action.c                     |   3 +-
 tc/m_ct.c                         | 497 ++++++++++++++++++++++++++++++++++++++
 tc/tc_util.c                      |  44 ++++
 tc/tc_util.h                      |   4 +
 8 files changed, 899 insertions(+), 2 deletions(-)
 create mode 100644 include/uapi/linux/tc_act/tc_ct.h
 create mode 100644 tc/m_ct.c

-- 
1.8.3.1


^ permalink raw reply

* Re: [PATCH net-next 1/3] net: flow_offload: remove netns parameter from flow_block_cb_alloc()
From: Jiri Pirko @ 2019-07-11  8:08 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, davem, netdev, jakub.kicinski
In-Reply-To: <20190711001235.20686-1-pablo@netfilter.org>

Thu, Jul 11, 2019 at 02:12:33AM CEST, pablo@netfilter.org wrote:
>No need to annotate the netns on the flow block callback object,
>flow_block_cb_is_busy() already checks for used blocks.
>
>Fixes: d63db30c8537 ("net: flow_offload: add flow_block_cb_alloc() and flow_block_cb_free()")
>Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

Acked-by: Jiri Pirko <jiri@mellanox.com>

^ permalink raw reply

* Re: [PATCH net-next 2/3] net: flow_offload: rename tc_setup_cb_t to flow_setup_cb_t
From: Jiri Pirko @ 2019-07-11  8:07 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, davem, netdev, jakub.kicinski
In-Reply-To: <20190711001235.20686-2-pablo@netfilter.org>

Thu, Jul 11, 2019 at 02:12:34AM CEST, pablo@netfilter.org wrote:
>Rename this type definition and adapt users.
>
>Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

Acked-by: Jiri Pirko <jiri@mellanox.com>

^ permalink raw reply

* Re: [PATCH net-next 3/3] net: flow_offload: add flow_block structure and use it
From: Jiri Pirko @ 2019-07-11  8:06 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, davem, netdev, jakub.kicinski
In-Reply-To: <20190711001235.20686-3-pablo@netfilter.org>

Thu, Jul 11, 2019 at 02:12:35AM CEST, pablo@netfilter.org wrote:
>This object stores the flow block callbacks that are attached to this
>block. This patch restores block sharing.
>
>Fixes: da3eeb904ff4 ("net: flow_offload: add list handling functions")
>Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
>---
> include/net/flow_offload.h        |  5 +++++
> include/net/netfilter/nf_tables.h |  5 +++--
> include/net/sch_generic.h         |  2 +-
> net/core/flow_offload.c           |  2 +-
> net/netfilter/nf_tables_api.c     |  2 +-
> net/netfilter/nf_tables_offload.c |  5 +++--
> net/sched/cls_api.c               | 10 +++++++---
> 7 files changed, 21 insertions(+), 10 deletions(-)
>
>diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
>index 98bf3af5c84d..e50d94736829 100644
>--- a/include/net/flow_offload.h
>+++ b/include/net/flow_offload.h
>@@ -248,6 +248,10 @@ enum flow_block_binder_type {
> 	FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS,
> };
> 
>+struct flow_block {
>+	struct list_head cb_list;
>+};
>+
> struct netlink_ext_ack;
> 
> struct flow_block_offload {
>@@ -255,6 +259,7 @@ struct flow_block_offload {
> 	enum flow_block_binder_type binder_type;
> 	bool block_shared;
> 	struct net *net;
>+	struct flow_block *block;
> 	struct list_head cb_list;
> 	struct list_head *driver_block_list;
> 	struct netlink_ext_ack *extack;
>diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
>index 35dfdd9f69b3..00658462f89b 100644
>--- a/include/net/netfilter/nf_tables.h
>+++ b/include/net/netfilter/nf_tables.h
>@@ -11,6 +11,7 @@
> #include <linux/rhashtable.h>
> #include <net/netfilter/nf_flow_table.h>
> #include <net/netlink.h>
>+#include <net/flow_offload.h>
> 
> struct module;
> 
>@@ -951,7 +952,7 @@ struct nft_stats {
>  *	@stats: per-cpu chain stats
>  *	@chain: the chain
>  *	@dev_name: device name that this base chain is attached to (if any)
>- *	@cb_list: list of flow block callbacks (for hardware offload)
>+ *	@block: flow block (for hardware offload)
>  */
> struct nft_base_chain {
> 	struct nf_hook_ops		ops;
>@@ -961,7 +962,7 @@ struct nft_base_chain {
> 	struct nft_stats __percpu	*stats;
> 	struct nft_chain		chain;
> 	char 				dev_name[IFNAMSIZ];
>-	struct list_head		cb_list;
>+	struct flow_block		block;
> };
> 
> static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain)
>diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>index 9482e060483b..58041cb0ce15 100644
>--- a/include/net/sch_generic.h
>+++ b/include/net/sch_generic.h
>@@ -399,7 +399,7 @@ struct tcf_block {
> 	refcount_t refcnt;
> 	struct net *net;
> 	struct Qdisc *q;
>-	struct list_head cb_list;
>+	struct flow_block flow;

It is not a "flow", that is confusing. It should be named "flow_block".


> 	struct list_head owner_list;
> 	bool keep_dst;
> 	unsigned int offloadcnt; /* Number of oddloaded filters */
>diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
>index a800fa78d96c..935c7f81a9ef 100644
>--- a/net/core/flow_offload.c
>+++ b/net/core/flow_offload.c
>@@ -198,7 +198,7 @@ struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *f,
> {
> 	struct flow_block_cb *block_cb;
> 
>-	list_for_each_entry(block_cb, f->driver_block_list, driver_list) {
>+	list_for_each_entry(block_cb, &f->block->cb_list, list) {

Please make struct flow_block *block an argument of cb_lookup instead
of struct flow_block_offload *f (as it was previously).


> 		if (block_cb->cb == cb &&
> 		    block_cb->cb_ident == cb_ident)
> 			return block_cb;
>diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
>index ed17a7c29b86..c565f146435b 100644
>--- a/net/netfilter/nf_tables_api.c
>+++ b/net/netfilter/nf_tables_api.c
>@@ -1662,7 +1662,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
> 
> 		chain->flags |= NFT_BASE_CHAIN | flags;
> 		basechain->policy = NF_ACCEPT;
>-		INIT_LIST_HEAD(&basechain->cb_list);
>+		INIT_LIST_HEAD(&basechain->block.cb_list);
> 	} else {
> 		chain = kzalloc(sizeof(*chain), GFP_KERNEL);
> 		if (chain == NULL)
>diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
>index 2c3302845f67..2a184277ee58 100644
>--- a/net/netfilter/nf_tables_offload.c
>+++ b/net/netfilter/nf_tables_offload.c
>@@ -116,7 +116,7 @@ static int nft_setup_cb_call(struct nft_base_chain *basechain,
> 	struct flow_block_cb *block_cb;
> 	int err;
> 
>-	list_for_each_entry(block_cb, &basechain->cb_list, list) {
>+	list_for_each_entry(block_cb, &basechain->block.cb_list, list) {
> 		err = block_cb->cb(type, type_data, block_cb->cb_priv);
> 		if (err < 0)
> 			return err;
>@@ -154,7 +154,7 @@ static int nft_flow_offload_rule(struct nft_trans *trans,
> static int nft_flow_offload_bind(struct flow_block_offload *bo,
> 				 struct nft_base_chain *basechain)
> {
>-	list_splice(&bo->cb_list, &basechain->cb_list);
>+	list_splice(&bo->cb_list, &basechain->block.cb_list);
> 	return 0;
> }
> 
>@@ -198,6 +198,7 @@ static int nft_flow_offload_chain(struct nft_trans *trans,
> 		return -EOPNOTSUPP;
> 
> 	bo.command = cmd;
>+	bo.block = &basechain->block;
> 	bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
> 	bo.extack = &extack;
> 	INIT_LIST_HEAD(&bo.cb_list);
>diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
>index 51fbe6e95a92..66181961ad6f 100644
>--- a/net/sched/cls_api.c
>+++ b/net/sched/cls_api.c
>@@ -691,6 +691,8 @@ static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
> 	if (!indr_dev->block)
> 		return;
> 
>+	bo.block = &indr_dev->block->flow;
>+
> 	indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
> 			  &bo);
> 	tcf_block_setup(indr_dev->block, &bo);
>@@ -775,6 +777,7 @@ static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
> 		.command	= command,
> 		.binder_type	= ei->binder_type,
> 		.net		= dev_net(dev),
>+		.block		= &block->flow,
> 		.block_shared	= tcf_block_shared(block),
> 		.extack		= extack,
> 	};
>@@ -810,6 +813,7 @@ static int tcf_block_offload_cmd(struct tcf_block *block,
> 	bo.net = dev_net(dev);
> 	bo.command = command;
> 	bo.binder_type = ei->binder_type;
>+	bo.block = &block->flow;
> 	bo.block_shared = tcf_block_shared(block);
> 	bo.extack = extack;
> 	INIT_LIST_HEAD(&bo.cb_list);
>@@ -988,7 +992,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
> 	}
> 	mutex_init(&block->lock);
> 	INIT_LIST_HEAD(&block->chain_list);
>-	INIT_LIST_HEAD(&block->cb_list);
>+	INIT_LIST_HEAD(&block->flow.cb_list);

With the introduction of struct flow_block, please also introduce a helper
to init this struct. It does not look right to init it from user code
(tc/nft).


> 	INIT_LIST_HEAD(&block->owner_list);
> 	INIT_LIST_HEAD(&block->chain0.filter_chain_list);
> 
>@@ -1570,7 +1574,7 @@ static int tcf_block_bind(struct tcf_block *block,
> 
> 		i++;
> 	}
>-	list_splice(&bo->cb_list, &block->cb_list);
>+	list_splice(&bo->cb_list, &block->flow.cb_list);
> 
> 	return 0;
> 
>@@ -3155,7 +3159,7 @@ int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
> 	if (block->nooffloaddevcnt && err_stop)
> 		return -EOPNOTSUPP;
> 
>-	list_for_each_entry(block_cb, &block->cb_list, list) {
>+	list_for_each_entry(block_cb, &block->flow.cb_list, list) {
> 		err = block_cb->cb(type, type_data, block_cb->cb_priv);
> 		if (err) {
> 			if (err_stop)
>-- 
>2.11.0
>
>

^ permalink raw reply

* [PATCH net-next] netfilter: nf_table_offload: Fix zero prio of flow_cls_common_offload
From: wenxu @ 2019-07-11  8:03 UTC (permalink / raw)
  To: pablo, davem; +Cc: netfilter-devel, netdev

From: wenxu <wenxu@ucloud.cn>

The flow_cls_common_offload prio should not be zero.

A zero prio leads to an invalid table prio in hw.

# nft add table netdev firewall
# nft add chain netdev firewall acl { type filter hook ingress device mlx_pf0vf0 priority - 300 \; }
# nft add rule netdev firewall acl ip daddr 1.1.1.7 drop
Error: Could not process rule: Invalid argument

kernel log
mlx5_core 0000:81:00.0: E-Switch: Failed to create FDB Table err -22 (table prio: 65535, level: 0, size: 4194304)

Fixes: c9626a2cbdb2 ("netfilter: nf_tables: add hardware offload support")
Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 net/netfilter/nf_tables_offload.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index 2c33028..01d8133 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -7,6 +7,8 @@
 #include <net/netfilter/nf_tables_offload.h>
 #include <net/pkt_cls.h>
 
+#define FLOW_OFFLOAD_DEFAUT_PRIO 1U
+
 static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions)
 {
 	struct nft_flow_rule *flow;
@@ -107,6 +109,7 @@ static void nft_flow_offload_common_init(struct flow_cls_common_offload *common,
 					struct netlink_ext_ack *extack)
 {
 	common->protocol = proto;
+	common->prio = TC_H_MAKE(FLOW_OFFLOAD_DEFAUT_PRIO << 16, 0);
 	common->extack = extack;
 }
 
-- 
1.8.3.1


^ permalink raw reply related

* Re:  Re: linux-next: build failure after merge of the net-next tree
From: Bernard Metzler @ 2019-07-11  8:00 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Leon Romanovsky, Stephen Rothwell, Doug Ledford, David Miller,
	Networking, Linux Next Mailing List, Linux Kernel Mailing List
In-Reply-To: <20190710175212.GM2887@mellanox.com>

-----"Jason Gunthorpe" <jgg@mellanox.com> wrote: -----

>To: "Leon Romanovsky" <leon@kernel.org>, "Bernard Metzler"
><bmt@zurich.ibm.com>
>From: "Jason Gunthorpe" <jgg@mellanox.com>
>Date: 07/10/2019 07:52PM
>Cc: "Stephen Rothwell" <sfr@canb.auug.org.au>, "Doug Ledford"
><dledford@redhat.com>, "David Miller" <davem@davemloft.net>,
>"Networking" <netdev@vger.kernel.org>, "Linux Next Mailing List"
><linux-next@vger.kernel.org>, "Linux Kernel Mailing List"
><linux-kernel@vger.kernel.org>
>Subject: [EXTERNAL] Re: linux-next: build failure after merge of the
>net-next tree
>
>On Tue, Jul 09, 2019 at 09:43:46AM +0300, Leon Romanovsky wrote:
>> On Tue, Jul 09, 2019 at 01:56:36PM +1000, Stephen Rothwell wrote:
>> > Hi all,
>> >
>> > After merging the net-next tree, today's linux-next build (x86_64
>> > allmodconfig) failed like this:
>> >
>> > drivers/infiniband/sw/siw/siw_cm.c: In function
>'siw_create_listen':
>> > drivers/infiniband/sw/siw/siw_cm.c:1978:3: error: implicit
>declaration of function 'for_ifa'; did you mean 'fork_idle'?
>[-Werror=implicit-function-declaration]
>> >    for_ifa(in_dev)
>> >    ^~~~~~~
>> >    fork_idle
>> > drivers/infiniband/sw/siw/siw_cm.c:1978:18: error: expected ';'
>before '{' token
>> >    for_ifa(in_dev)
>> >                   ^
>> >                   ;
>> >    {
>> >    ~
>> >
>> > Caused by commit
>> >
>> >   6c52fdc244b5 ("rdma/siw: connection management")
>> >
>> > from the rdma tree.  I don't know why this didn't fail after I
>merged
>> > that tree.
>> 
>> I had the same question, because I have this fix for a couple of
>days already.
>> 
>> From 56c9e15ec670af580daa8c3ffde9503af3042d67 Mon Sep 17 00:00:00
>2001
>> From: Leon Romanovsky <leonro@mellanox.com>
>> Date: Sun, 7 Jul 2019 10:43:42 +0300
>> Subject: [PATCH] Fixup to build SIW issue
>> 
>> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
>>  drivers/infiniband/sw/siw/siw_cm.c | 5 ++---
>>  1 file changed, 2 insertions(+), 3 deletions(-)
>> 
>> diff --git a/drivers/infiniband/sw/siw/siw_cm.c
>b/drivers/infiniband/sw/siw/siw_cm.c
>> index 8e618cb7261f..c883bf514341 100644
>> +++ b/drivers/infiniband/sw/siw/siw_cm.c
>> @@ -1954,6 +1954,7 @@ static void siw_drop_listeners(struct
>iw_cm_id *id)
>>  int siw_create_listen(struct iw_cm_id *id, int backlog)
>>  {
>>  	struct net_device *dev = to_siw_dev(id->device)->netdev;
>> +	const struct in_ifaddr *ifa;
>>  	int rv = 0, listeners = 0;
>> 
>>  	siw_dbg(id->device, "id 0x%p: backlog %d\n", id, backlog);
>> @@ -1975,8 +1976,7 @@ int siw_create_listen(struct iw_cm_id *id,
>int backlog)
>>  			id, &s_laddr.sin_addr, ntohs(s_laddr.sin_port),
>>  			&s_raddr->sin_addr, ntohs(s_raddr->sin_port));
>> 
>> -		for_ifa(in_dev)
>> -		{
>> +		in_dev_for_each_ifa_rcu(ifa, in_dev) {
>>  			if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) ||
>
>Hum. There is no rcu lock held here and we can't use RCU anyhow as
>siw_listen_address will sleep.
>
>I think this needs to use rtnl, as below. Bernard, please urgently
>confirm. Thanks
>

Hi Jason,

That listen will not sleep. The socket is just marked
listening. Accepting a new connection is handled asynchronously
by a work handler, kicked by a socket callback
(siw_cm_llp_state_change).

But, I think you are correct, we are missing the
rcu_read_lock/unlock around that iteration. Could we please add
that (see below)?

Thanks very much!
Bernard.

diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index c883bf514341..c5c060103993 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -1976,6 +1976,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog)
                        id, &s_laddr.sin_addr, ntohs(s_laddr.sin_port),
                        &s_raddr->sin_addr, ntohs(s_raddr->sin_port));
 
+               rcu_read_lock();
                in_dev_for_each_ifa_rcu(ifa, in_dev) {
                        if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) ||
                            s_laddr.sin_addr.s_addr == ifa->ifa_address) {
@@ -1988,6 +1989,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog)
                                        listeners++;
                        }
                }
+               rcu_read_unlock();
                in_dev_put(in_dev);
        } else if (id->local_addr.ss_family == AF_INET6) {
                struct inet6_dev *in6_dev = in6_dev_get(dev);



>diff --git a/drivers/infiniband/sw/siw/siw_cm.c
>b/drivers/infiniband/sw/siw/siw_cm.c
>index 8e618cb7261f62..ee98e96a5bfaba 100644
>--- a/drivers/infiniband/sw/siw/siw_cm.c
>+++ b/drivers/infiniband/sw/siw/siw_cm.c
>@@ -1965,6 +1965,7 @@ int siw_create_listen(struct iw_cm_id *id, int
>backlog)
> 	 */
> 	if (id->local_addr.ss_family == AF_INET) {
> 		struct in_device *in_dev = in_dev_get(dev);
>+		const struct in_ifaddr *ifa;
> 		struct sockaddr_in s_laddr, *s_raddr;
> 
> 		memcpy(&s_laddr, &id->local_addr, sizeof(s_laddr));
>@@ -1975,8 +1976,8 @@ int siw_create_listen(struct iw_cm_id *id, int
>backlog)
> 			id, &s_laddr.sin_addr, ntohs(s_laddr.sin_port),
> 			&s_raddr->sin_addr, ntohs(s_raddr->sin_port));
> 
>-		for_ifa(in_dev)
>-		{
>+		rtnl_lock();
>+		in_dev_for_each_ifa_rtnl(ifa, in_dev) {
> 			if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) ||
> 			    s_laddr.sin_addr.s_addr == ifa->ifa_address) {
> 				s_laddr.sin_addr.s_addr = ifa->ifa_address;
>@@ -1988,7 +1989,7 @@ int siw_create_listen(struct iw_cm_id *id, int
>backlog)
> 					listeners++;
> 			}
> 		}
>-		endfor_ifa(in_dev);
>+		rtnl_unlock();
> 		in_dev_put(in_dev);
> 	} else if (id->local_addr.ss_family == AF_INET6) {
> 		struct inet6_dev *in6_dev = in6_dev_get(dev);
>
>


^ permalink raw reply related

* Re: NEIGH: BUG, double timer add, state is 8
From: Lorenzo Bianconi @ 2019-07-11  7:54 UTC (permalink / raw)
  To: David Ahern; +Cc: Marek Majkowski, David Miller, netdev, kernel-team
In-Reply-To: <c383ea93-4257-f31c-e259-f71169f7baef@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 439 bytes --]

> On 7/5/19 11:30 AM, Lorenzo Bianconi wrote:
> > looking at the reproducer it seems to me the issue is due to the use of
> > 'NTF_USE' from userspace.
> > Should we unschedule the neigh timer if we are in IN_TIMER receiving this
> > flag from userspace? (taking appropriate locking)
> 
> I think you are right. Do you want to send a patch?

Hi David,

thx for the feedback. Sure, I will post a patch soon.

Regards,
Lorenzo

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* RE: Question about linux kernel commit: "net/ipv6: move metrics from dst to rt6_info"
From: Jan Szewczyk @ 2019-07-11  7:41 UTC (permalink / raw)
  To: David Ahern, Stefano Brivio
  Cc: davem@davemloft.net, netdev@vger.kernel.org, Wei Wang,
	Martin KaFai Lau, Eric Dumazet
In-Reply-To: <5981b8d0-cfdf-d230-fa22-cfcfaa5ee4b9@gmail.com>

Hi guys!

Yes, that's exactly it! Thank you very much, so now I know what is happening 😊.

Thanks again for your help!

BR,
Jan Szewczyk

-----Original Message-----
From: David Ahern <dsahern@gmail.com> 
Sent: Wednesday, July 10, 2019 21:13
To: Stefano Brivio <sbrivio@redhat.com>; Jan Szewczyk <jan.szewczyk@ericsson.com>
Cc: davem@davemloft.net; netdev@vger.kernel.org; Wei Wang <weiwan@google.com>; Martin KaFai Lau <kafai@fb.com>; Eric Dumazet <edumazet@google.com>
Subject: Re: Question about linux kernel commit: "net/ipv6: move metrics from dst to rt6_info"

On 7/10/19 1:09 PM, Stefano Brivio wrote:
> Jan,
> 
> On Wed, 10 Jul 2019 12:59:41 +0000
> Jan Szewczyk <jan.szewczyk@ericsson.com> wrote:
> 
>> Hi!
>> I digged up a little further and maybe it's not a problem with MTU 
>> itself. I checked every entry I get from RTM_GETROUTE netlink message 
>> and after triggering "too big packet" by pinging ipv6address I get 
>> exactly the same messages on 4.12 and 4.18, except that the one with 
>> that pinged ipv6address is missing on 4.18 at all. What is weird - 
>> it's visible when running "ip route get to ipv6address". Do you know 
>> why there is a mismatch there?
> 
> If I understand you correctly, an implementation equivalent to 'ip -6 
> route list show' (using the NLM_F_DUMP flag) won't show the so-called 
> route exception, while 'ip -6 route get' shows it.
> 
> If that's the case: that was broken by commit 2b760fcf5cfb ("ipv6: 
> hook up exception table to store dst cache") that landed in 4.15, and 
> fixed by net-next commit 1e47b4837f3b ("ipv6: Dump route exceptions if 
> requested"). For more details, see the log of this commit itself.
> 

ah, good point. My mind locked on RTM_GETROUTE as a specific route request not a dump.

^ permalink raw reply

* Re: [RFC] virtio-net: share receive_*() and add_recvbuf_*() with virtio-vsock
From: Jason Wang @ 2019-07-11  7:37 UTC (permalink / raw)
  To: Stefano Garzarella, Michael S. Tsirkin, Stefan Hajnoczi
  Cc: virtualization, netdev
In-Reply-To: <20190710153707.twmzgmwqqw3pstos@steredhat>

On 2019/7/10 下午11:37, Stefano Garzarella wrote:
> Hi,
> as Jason suggested some months ago, I looked better at the virtio-net driver to
> understand if we can reuse some parts also in the virtio-vsock driver, since we
> have similar challenges (mergeable buffers, page allocation, small
> packets, etc.).
>
> Initially, I would add the skbuff in the virtio-vsock in order to re-use
> receive_*() functions.

Yes, that will be a good step.

> Then I would move receive_[small, big, mergeable]() and
> add_recvbuf_[small, big, mergeable]() outside of virtio-net driver, in order to
> call them also from virtio-vsock. I need to do some refactoring (e.g. leave the
> XDP part on the virtio-net driver), but I think it is feasible.
>
> The idea is to create a virtio-skb.[h,c] where put these functions and a new
> object where stores some attributes needed (e.g. hdr_len ) and status (e.g.
> some fields of struct receive_queue).

My understanding is we could be more ambitious here. Do you see any 
blocker for reusing virtio-net directly? It's better to reuse not only 
the functions but also the logic like NAPI to avoid re-inventing 
something buggy and duplicated.

> This is an idea of virtio-skb.h that
> I have in mind:
>      struct virtskb;

What fields do you want to store in virtskb? It looks like the existing
sk_buff is flexible enough for us?

>
>      struct sk_buff *virtskb_receive_small(struct virtskb *vs, ...);
>      struct sk_buff *virtskb_receive_big(struct virtskb *vs, ...);
>      struct sk_buff *virtskb_receive_mergeable(struct virtskb *vs, ...);
>
>      int virtskb_add_recvbuf_small(struct virtskb*vs, ...);
>      int virtskb_add_recvbuf_big(struct virtskb *vs, ...);
>      int virtskb_add_recvbuf_mergeable(struct virtskb *vs, ...);
>
> For the Guest->Host path it should be easier, so maybe I can add a
> "virtskb_send(struct virtskb *vs, struct sk_buff *skb)" with a part of the code
> of xmit_skb().

I may be missing something, but I don't see anything that prevents us from
using xmit_skb() directly.

>
> Let me know if you have in mind better names or if I should put these function
> in another place.
>
> I would like to leave the control part completely separate, so, for example,
> the two drivers will negotiate the features independently and they will call
> the right virtskb_receive_*() function based on the negotiation.

If it's only an issue of negotiation, we can simply change
virtnet_probe() to deal with different devices.

>
> I already started to work on it, but before to do more steps and send an RFC
> patch, I would like to hear your opinion.
> Do you think that makes sense?
> Do you see any issue or a better solution?

I still think we need to seek a way of adding some code to virtio-net.c
directly if there's no huge difference in the processing of TX/RX. That
would save us a lot of time.

Thanks

>
> Thanks in advance,
> Stefano

^ permalink raw reply

* Re: [PATCH net 2/4] tcp: tcp_fragment() should apply sane memory limits
From: Christoph Paasch @ 2019-07-11  7:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Prout, Andrew - LLSC - MITLL, David Miller, netdev,
	Greg Kroah-Hartman, Jonathan Looney, Neal Cardwell, Tyler Hicks,
	Yuchung Cheng, Bruce Curtis, Jonathan Lemon, Dustin Marquess
In-Reply-To: <b1dfd327-a784-6609-3c83-dab42c3c7eda@gmail.com>



> On Jul 10, 2019, at 9:26 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> 
> 
> On 7/10/19 8:53 PM, Prout, Andrew - LLSC - MITLL wrote:
>> 
>> Our initial rollout was v4.14.130, but I reproduced it with v4.14.132 as well, reliably for the samba test and once (not reliably) with synthetic test I was trying. A patched v4.14.132 with this patch partially reverted (just the four lines from tcp_fragment deleted) passed the samba test.
>> 
>> The synthetic test was a pair of simple send/recv test programs under the following conditions:
>> -The send socket was non-blocking
>> -SO_SNDBUF set to 128KiB
>> -The receiver NIC was being flooded with traffic from multiple hosts (to induce packet loss/retransmits)
>> -Load was on both systems: a while(1) program spinning on each CPU core
>> -The receiver was on an older unaffected kernel
>> 
> 
> SO_SNDBUF to 128KB does not permit to recover from heavy losses,
> since skbs needs to be allocated for retransmits.

Would it make sense to always allow the alloc in tcp_fragment when coming from __tcp_retransmit_skb() through the retransmit-timer ?

AFAICS, the crasher was when an attacker sends "fake" SACK-blocks. Thus, we would still be protected from too much fragmentation, but at least would always allow the retransmission to go out.


Christoph

> 
> The bug we fixed allowed remote attackers to crash all linux hosts,
> 
> I am afraid we have to enforce the real SO_SNDBUF limit, finally.
> 
> Even a cushion of 128KB per socket is dangerous, for servers with millions of TCP sockets.
> 
> You will either have to set SO_SNDBUF to higher values, or let autotuning in place.
> Or revert the patches and allow attackers hit you badly.
> 


^ permalink raw reply

* RE: [PATCH v6 rdma-next 0/6] RDMA/qedr: Use the doorbell overflow recovery mechanism for RDMA
From: Michal Kalderon @ 2019-07-11  7:23 UTC (permalink / raw)
  To: Gal Pressman, Ariel Elior, jgg@ziepe.ca, dledford@redhat.com
  Cc: linux-rdma@vger.kernel.org, davem@davemloft.net,
	netdev@vger.kernel.org, sleybo@amazon.com
In-Reply-To: <7b2f2205-6b5d-c9e7-2d59-296367e517ac@amazon.com>

> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> owner@vger.kernel.org> On Behalf Of Gal Pressman
> 
> On 09/07/2019 17:17, Michal Kalderon wrote:
> > This patch series uses the doorbell overflow recovery mechanism
> > introduced in commit 36907cd5cd72 ("qed: Add doorbell overflow
> > recovery mechanism") for rdma ( RoCE and iWARP )
> >
> > The first three patches modify the core code to contain helper
> > functions for managing mmap_xa inserting, getting and freeing entries.
> > The code was taken almost as is from the efa driver.
> > There is still an open discussion on whether we should take this even
> > further and make the entire mmap generic. Until a decision is made, I
> > only created the database API and modified the efa and qedr driver to
> > use it. The doorbell recovery code will be based on the common code.
> >
> > Efa driver was compile tested only.
> 
> For the whole series:
> Tested-by: Gal Pressman <galpress@amazon.com>

Thanks Gal!


^ permalink raw reply

* Re: [PATCH net-next iproute2 2/3] tc: Introduce tc ct action
From: Paul Blakey @ 2019-07-11  7:21 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: Jiri Pirko, Roi Dayan, Yossi Kuperman, Oz Shlomo,
	netdev@vger.kernel.org, David Miller, Aaron Conole, Zhike Wang,
	Justin Pettit, John Hurley, Rony Efraim, nst-kernel@redhat.com,
	Simon Horman
In-Reply-To: <20190709153657.GF3390@localhost.localdomain>


On 7/9/2019 6:36 PM, Marcelo Ricardo Leitner wrote:
> On Tue, Jul 09, 2019 at 06:58:36AM +0000, Paul Blakey wrote:
>> On 7/8/2019 8:54 PM, Marcelo Ricardo Leitner wrote:
>>> On Sun, Jul 07, 2019 at 11:53:47AM +0300, Paul Blakey wrote:
>>>> New tc action to send packets to conntrack module, commit
>>>> them, and set a zone, labels, mark, and nat on the connection.
>>>>
>>>> It can also clear the packet's conntrack state by using clear.
>>>>
>>>> Usage:
>>>>      ct clear
>>>>      ct commit [force] [zone] [mark] [label] [nat]
>>> Isn't the 'commit' also optional? More like
>>>       ct [commit [force]] [zone] [mark] [label] [nat]
>>>
>>>>      ct [nat] [zone]
>>>>
>>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
>>>> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
>>>> Signed-off-by: Yossi Kuperman <yossiku@mellanox.com>
>>>> Acked-by: Jiri Pirko <jiri@mellanox.com>
>>>> Acked-by: Roi Dayan <roid@mellanox.com>
>>>> ---
>>> ...
>>>> +static void
>>>> +usage(void)
>>>> +{
>>>> +	fprintf(stderr,
>>>> +		"Usage: ct clear\n"
>>>> +		"	ct commit [force] [zone ZONE] [mark MASKED_MARK] [label MASKED_LABEL] [nat NAT_SPEC]\n"
>>> Ditto here then.
>>
>> In commit msg and here, it means there is multiple modes of operation. I
>> think it's easier to split those.
> Yep, that is good.
> More below.
>
>> "ct clear" to clear it , not other options can be added here.
>>
>> "ct commit  [force].... " sends to conntrack and commit a connection,
>> and only for commit can you specify force mark  label, and nat with
>> nat_spec....
>>
>> and the last one, "ct [nat] [zone ZONE]" is to just send the packet to
>> conntrack on some zone [optional], restore nat [optional].
>>
>>
>>>> +		"	ct [nat] [zone ZONE]\n"
>>>> +		"Where: ZONE is the conntrack zone table number\n"
>>>> +		"	NAT_SPEC is {src|dst} addr addr1[-addr2] [port port1[-port2]]\n"
>>>> +		"\n");
>>>> +	exit(-1);
>>>> +}
>>> ...
>>>
>>> The validation below doesn't enforce that commit must be there for
>>> such case.
>> which case? commit is optional. the above are the three valid patterns.
> That's the point. But the 2nd example is saying 'commit' word is
> mandatory in that mode. It is written as it is a command that was
> selected.
>
> One may use just:
>      ct [zone]
> And not
>      ct commit [zone]
> Right?

It is optional in the overall syntax.


But I split it into modes:

clear, commit, and "restore" (I unofficially call it that, because it is
usually used to get the +est state on the packet and can restore nat; it
doesn't actually restore anything for the first packet on the -trk rule)

It is mandatory in the second mode (commit). If you don't specify commit
or clear, you can only use the third form - "restore", which is to send
to ct on some optional zone, and optionally restore nat (so we get
ct [zone] [nat]).

I think this syntax is easy, maybe I can label them as the modes of 
operation above (then I'll need to name the restore one better :)).

If there is a different syntax you think might be easier I'll change to 
that.


Thanks,

Paul.







^ permalink raw reply

* Re: [PATCH net-next v6 0/4] net/sched: Introduce tc connection tracking
From: Paul Blakey @ 2019-07-11  7:12 UTC (permalink / raw)
  To: David Miller
  Cc: Jiri Pirko, Roi Dayan, Yossi Kuperman, Oz Shlomo,
	marcelo.leitner@gmail.com, netdev@vger.kernel.org,
	aconole@redhat.com, wangzhike@jd.com, Rony Efraim,
	nst-kernel@redhat.com, john.hurley@netronome.com,
	simon.horman@netronome.com, jpettit@ovn.org
In-Reply-To: <20190709.121402.1804664264408465946.davem@davemloft.net>


On 7/9/2019 10:14 PM, David Miller wrote:
> From: Paul Blakey <paulb@mellanox.com>
> Date: Tue,  9 Jul 2019 10:30:47 +0300
>
>> This patch series add connection tracking capabilities in tc sw datapath.
>> It does so via a new tc action, called act_ct, and new tc flower classifier matching
>> on conntrack state, mark and label.
>   ...
>
> Ok, I applied this, but two things:
>
> 1) You owe Cong Wang an explanation, a real detailed one, about the L2
>     vs L3 design of this feature.  I did not see you address his feedback,
>     but if you did I apologize.
>
> 2) Because the MPLS changes went in first, TCA_ID_CT ended up in a
>     different spot in the enumeration and therefore the value is
>     different.
>
> Thanks.



Thanks!

Re 1, I provided one in "Re: [PATCH net-next v2 0/4] net/sched: 
Introduce tc connection tracking", hope that's enough.


^ permalink raw reply

* [PATCH v2 bpf-next 2/3] selftests/bpf: add trickier size resolution tests
From: Andrii Nakryiko @ 2019-07-11  6:53 UTC (permalink / raw)
  To: bpf, netdev, ast, daniel, yhs
  Cc: andrii.nakryiko, kernel-team, Andrii Nakryiko
In-Reply-To: <20190711065307.2425636-1-andriin@fb.com>

Add more BTF tests, validating that size resolution logic is correct in
a few trickier cases.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 tools/testing/selftests/bpf/test_btf.c | 88 ++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index 8351cb5f4a20..3d617e806054 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -3417,6 +3417,94 @@ static struct btf_raw_test raw_tests[] = {
 	.value_type_id = 1,
 	.max_entries = 4,
 },
+/*
+ * typedef int arr_t[16];
+ * struct s {
+ *	arr_t *a;
+ * };
+ */
+{
+	.descr = "struct->ptr->typedef->array->int size resolution",
+	.raw_types = {
+		BTF_STRUCT_ENC(NAME_TBD, 1, 8),			/* [1] */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+		BTF_PTR_ENC(3),					/* [2] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 4),			/* [3] */
+		BTF_TYPE_ARRAY_ENC(5, 5, 16),			/* [4] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [5] */
+		BTF_END_RAW,
+	},
+	BTF_STR_SEC("\0s\0a\0arr_t"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "ptr_mod_chain_size_resolve_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int) * 16,
+	.key_type_id = 5 /* int */,
+	.value_type_id = 3 /* arr_t */,
+	.max_entries = 4,
+},
+/*
+ * typedef int arr_t[16][8][4];
+ * struct s {
+ *	arr_t *a;
+ * };
+ */
+{
+	.descr = "struct->ptr->typedef->multi-array->int size resolution",
+	.raw_types = {
+		BTF_STRUCT_ENC(NAME_TBD, 1, 8),			/* [1] */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+		BTF_PTR_ENC(3),					/* [2] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 4),			/* [3] */
+		BTF_TYPE_ARRAY_ENC(5, 7, 16),			/* [4] */
+		BTF_TYPE_ARRAY_ENC(6, 7, 8),			/* [5] */
+		BTF_TYPE_ARRAY_ENC(7, 7, 4),			/* [6] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [7] */
+		BTF_END_RAW,
+	},
+	BTF_STR_SEC("\0s\0a\0arr_t"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "multi_arr_size_resolve_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int) * 16 * 8 * 4,
+	.key_type_id = 7 /* int */,
+	.value_type_id = 3 /* arr_t */,
+	.max_entries = 4,
+},
+/*
+ * typedef int int_t;
+ * typedef int_t arr3_t[4];
+ * typedef arr3_t arr2_t[8];
+ * typedef arr2_t arr1_t[16];
+ * struct s {
+ *	arr1_t *a;
+ * };
+ */
+{
+	.descr = "typedef/multi-arr mix size resolution",
+	.raw_types = {
+		BTF_STRUCT_ENC(NAME_TBD, 1, 8),			/* [1] */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+		BTF_PTR_ENC(3),					/* [2] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 4),			/* [3] */
+		BTF_TYPE_ARRAY_ENC(5, 10, 16),			/* [4] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 6),			/* [5] */
+		BTF_TYPE_ARRAY_ENC(7, 10, 8),			/* [6] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 8),			/* [7] */
+		BTF_TYPE_ARRAY_ENC(9, 10, 4),			/* [8] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 10),			/* [9] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [10] */
+		BTF_END_RAW,
+	},
+	BTF_STR_SEC("\0s\0a\0arr1_t\0arr2_t\0arr3_t\0int_t"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "typedef_arra_mix_size_resolve_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int) * 16 * 8 * 4,
+	.key_type_id = 10 /* int */,
+	.value_type_id = 3 /* arr1_t */,
+	.max_entries = 4,
+},
 
 }; /* struct btf_raw_test raw_tests[] */
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH v2 bpf-next 3/3] selftests/bpf: use typedef'ed arrays as map values
From: Andrii Nakryiko @ 2019-07-11  6:53 UTC (permalink / raw)
  To: bpf, netdev, ast, daniel, yhs
  Cc: andrii.nakryiko, kernel-team, Andrii Nakryiko
In-Reply-To: <20190711065307.2425636-1-andriin@fb.com>

Convert a few tests that couldn't use typedef'ed arrays due to a kernel bug.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c     | 3 ++-
 tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c | 3 +--
 tools/testing/selftests/bpf/progs/test_stacktrace_map.c      | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c
index d06b47a09097..33254b771384 100644
--- a/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c
+++ b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c
@@ -47,11 +47,12 @@ struct {
  * issue and avoid complicated C programming massaging.
  * This is an acceptable workaround since there is one entry here.
  */
+typedef __u64 raw_stack_trace_t[2 * MAX_STACK_RAWTP];
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 	__uint(max_entries, 1);
 	__type(key, __u32);
-	__u64 (*value)[2 * MAX_STACK_RAWTP];
+	__type(value, raw_stack_trace_t);
 } rawdata_map SEC(".maps");
 
 SEC("tracepoint/raw_syscalls/sys_enter")
diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c
index bbfc8337b6f0..f5638e26865d 100644
--- a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c
+++ b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c
@@ -36,8 +36,7 @@ struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
 	__uint(max_entries, 128);
 	__type(key, __u32);
-	/* there seems to be a bug in kernel not handling typedef properly */
-	struct bpf_stack_build_id (*value)[PERF_MAX_STACK_DEPTH];
+	__type(value, stack_trace_t);
 } stack_amap SEC(".maps");
 
 /* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */
diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c
index 803c15dc109d..fa0be3e10a10 100644
--- a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c
+++ b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c
@@ -35,7 +35,7 @@ struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
 	__uint(max_entries, 16384);
 	__type(key, __u32);
-	__u64 (*value)[PERF_MAX_STACK_DEPTH];
+	__type(value, stack_trace_t);
 } stack_amap SEC(".maps");
 
 /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
-- 
2.17.1


^ permalink raw reply related

* [PATCH v2 bpf-next 0/3] fix BTF verification size resolution
From: Andrii Nakryiko @ 2019-07-11  6:53 UTC (permalink / raw)
  To: bpf, netdev, ast, daniel, yhs
  Cc: andrii.nakryiko, kernel-team, Andrii Nakryiko

BTF size resolution logic isn't always resolving type size correctly, leading
to erroneous map creation failures due to value size mismatch.

This patch set:
1. fixes the issue (patch #1);
2. adds tests for trickier cases (patch #2);
3. and converts a few test cases utilizing BTF-defined maps, that previously
   couldn't use typedef'ed arrays due to a kernel bug (patch #3).

Patch #1 can be applied against bpf tree, but selftest ones (#2 and #3) have
to go against bpf-next for now.

Andrii Nakryiko (3):
  bpf: fix BTF verifier size resolution logic
  selftests/bpf: add trickier size resolution tests
  selftests/bpf: use typedef'ed arrays as map values

 kernel/bpf/btf.c                              | 14 ++-
 .../bpf/progs/test_get_stack_rawtp.c          |  3 +-
 .../bpf/progs/test_stacktrace_build_id.c      |  3 +-
 .../selftests/bpf/progs/test_stacktrace_map.c |  2 +-
 tools/testing/selftests/bpf/test_btf.c        | 88 +++++++++++++++++++
 5 files changed, 102 insertions(+), 8 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [PATCH v2 bpf-next 1/3] bpf: fix BTF verifier size resolution logic
From: Andrii Nakryiko @ 2019-07-11  6:53 UTC (permalink / raw)
  To: bpf, netdev, ast, daniel, yhs
  Cc: andrii.nakryiko, kernel-team, Andrii Nakryiko, Martin KaFai Lau
In-Reply-To: <20190711065307.2425636-1-andriin@fb.com>

BTF verifier has a size resolution bug which in some circumstances leads to
invalid size resolution for, e.g., TYPEDEF modifier.  This happens if we have
[1] PTR -> [2] TYPEDEF -> [3] ARRAY, in which case due to being in pointer
context ARRAY size won't be resolved (because for pointer it doesn't matter, so
it's a sink in pointer context), but it will be permanently remembered as zero
for TYPEDEF and TYPEDEF will be marked as RESOLVED. Eventually ARRAY size will
be resolved correctly, but TYPEDEF resolved_size won't be updated anymore.
This, subsequently, will lead to erroneous map creation failure, if that
TYPEDEF is specified as either key or value, as key_size/value_size won't
correspond to resolved size of TYPEDEF (kernel will believe it's zero).

Note, that if BTF was ordered as [1] ARRAY <- [2] TYPEDEF <- [3] PTR, this
won't be a problem, as by the time we get to TYPEDEF, ARRAY's size is already
calculated and stored.

This bug manifests itself in rejecting BTF-defined maps that use array
typedef as a value type:

typedef int array_t[16];

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __type(value, array_t); /* i.e., array_t *value; */
} test_map SEC(".maps");

The fix consists of not relying on modifier's resolved_size and instead using
modifier's resolved_id (type ID for "concrete" type to which modifier
eventually resolves) and doing size determination for that resolved type. This
allows preserving the existing "early DFS termination" logic for PTR or
STRUCT_OR_ARRAY contexts, while still doing correct size determination for
modifier types.

Fixes: eb3f595dab40 ("bpf: btf: Validate type reference")
Cc: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 kernel/bpf/btf.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index cad09858a5f2..22fe8b155e51 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1073,11 +1073,18 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
 				 !btf_type_is_var(size_type)))
 			return NULL;

-		size = btf->resolved_sizes[size_type_id];
 		size_type_id = btf->resolved_ids[size_type_id];
 		size_type = btf_type_by_id(btf, size_type_id);
 		if (btf_type_nosize_or_null(size_type))
 			return NULL;
+		else if (btf_type_has_size(size_type))
+			size = size_type->size;
+		else if (btf_type_is_array(size_type))
+			size = btf->resolved_sizes[size_type_id];
+		else if (btf_type_is_ptr(size_type))
+			size = sizeof(void *);
+		else
+			return NULL;
 	}

 	*type_id = size_type_id;
@@ -1602,7 +1609,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
 	const struct btf_type *next_type;
 	u32 next_type_id = t->type;
 	struct btf *btf = env->btf;
-	u32 next_type_size = 0;

 	next_type = btf_type_by_id(btf, next_type_id);
 	if (!next_type || btf_type_is_resolve_source_only(next_type)) {
@@ -1620,7 +1626,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
 	 * save us a few type-following when we use it later (e.g. in
 	 * pretty print).
 	 */
-	if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
+	if (!btf_type_id_size(btf, &next_type_id, NULL)) {
 		if (env_type_is_resolved(env, next_type_id))
 			next_type = btf_type_id_resolve(btf, &next_type_id);

@@ -1633,7 +1639,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
 		}
 	}

-	env_stack_pop_resolved(env, next_type_id, next_type_size);
+	env_stack_pop_resolved(env, next_type_id, 0);

 	return 0;
 }
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH] net/mlx5e: Move priv variable into case statement in mlx5e_setup_tc
From: Nathan Chancellor @ 2019-07-11  6:09 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller,
	Linux Netdev List, RDMA mailing list, linux-kernel,
	clang-built-linux
In-Reply-To: <CALzJLG9Aw=sVPDiewHr+4Jiuaod_1q=10vzMzCUVg-rCCXD6cQ@mail.gmail.com>

On Wed, Jul 10, 2019 at 11:02:00PM -0700, Saeed Mahameed wrote:
> On Wed, Jul 10, 2019 at 12:05 PM Nathan Chancellor
> <natechancellor@gmail.com> wrote:
> >
> > There is an unused variable warning on arm64 defconfig when
> > CONFIG_MLX5_ESWITCH is unset:
> >
> > drivers/net/ethernet/mellanox/mlx5/core/en_main.c:3467:21: warning:
> > unused variable 'priv' [-Wunused-variable]
> >         struct mlx5e_priv *priv = netdev_priv(dev);
> >                            ^
> > 1 warning generated.
> >
> > Move it down into the case statement where it is used.
> >
> > Fixes: 4e95bc268b91 ("net: flow_offload: add flow_block_cb_setup_simple()")
> > Link: https://github.com/ClangBuiltLinux/linux/issues/597
> > Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
> > ---
> >  drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 7 ++++---
> >  1 file changed, 4 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> > index 6d0ae87c8ded..651eb714eb5b 100644
> > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> > @@ -3464,15 +3464,16 @@ static LIST_HEAD(mlx5e_block_cb_list);
> >  static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
> >                           void *type_data)
> >  {
> > -       struct mlx5e_priv *priv = netdev_priv(dev);
> > -
> >         switch (type) {
> >  #ifdef CONFIG_MLX5_ESWITCH
> > -       case TC_SETUP_BLOCK:
> > +       case TC_SETUP_BLOCK: {
> > +               struct mlx5e_priv *priv = netdev_priv(dev);
> > +
> >                 return flow_block_cb_setup_simple(type_data,
> >                                                   &mlx5e_block_cb_list,
> >                                                   mlx5e_setup_tc_block_cb,
> >                                                   priv, priv, true);
> > +       }
> 
> Hi Nathan,
> 
> We have another patch internally that fixes this, and it is already
> queued up in my queue.
> It works differently, as we want to pass priv instead of netdev to
> mlx5e_setup_tc_mqprio below,
> which will also resolve the warning.
> 
> So I would like to submit that patch, if that is OK with you?

Hi Saeed,

Whatever works best for you, I just care that the warning gets fixed,
not how it is done :) I wouldn't mind being put on CC so I can pick it
up for my local tests.

Thanks for the follow up!
Nathan

^ permalink raw reply

* Re: [PATCH] net/mlx5e: Move priv variable into case statement in mlx5e_setup_tc
From: Saeed Mahameed @ 2019-07-11  6:02 UTC (permalink / raw)
  To: Nathan Chancellor
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller,
	Linux Netdev List, RDMA mailing list, linux-kernel,
	clang-built-linux
In-Reply-To: <20190710190502.104010-1-natechancellor@gmail.com>

On Wed, Jul 10, 2019 at 12:05 PM Nathan Chancellor
<natechancellor@gmail.com> wrote:
>
> There is an unused variable warning on arm64 defconfig when
> CONFIG_MLX5_ESWITCH is unset:
>
> drivers/net/ethernet/mellanox/mlx5/core/en_main.c:3467:21: warning:
> unused variable 'priv' [-Wunused-variable]
>         struct mlx5e_priv *priv = netdev_priv(dev);
>                            ^
> 1 warning generated.
>
> Move it down into the case statement where it is used.
>
> Fixes: 4e95bc268b91 ("net: flow_offload: add flow_block_cb_setup_simple()")
> Link: https://github.com/ClangBuiltLinux/linux/issues/597
> Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index 6d0ae87c8ded..651eb714eb5b 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -3464,15 +3464,16 @@ static LIST_HEAD(mlx5e_block_cb_list);
>  static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
>                           void *type_data)
>  {
> -       struct mlx5e_priv *priv = netdev_priv(dev);
> -
>         switch (type) {
>  #ifdef CONFIG_MLX5_ESWITCH
> -       case TC_SETUP_BLOCK:
> +       case TC_SETUP_BLOCK: {
> +               struct mlx5e_priv *priv = netdev_priv(dev);
> +
>                 return flow_block_cb_setup_simple(type_data,
>                                                   &mlx5e_block_cb_list,
>                                                   mlx5e_setup_tc_block_cb,
>                                                   priv, priv, true);
> +       }

Hi Nathan,

We have another patch internally that fixes this, and it is already
queued up in my queue.
It works differently, as we want to pass priv instead of netdev to
mlx5e_setup_tc_mqprio below,
which will also resolve the warning.

So I would like to submit that patch, if that is OK with you?

>  #endif
>         case TC_SETUP_QDISC_MQPRIO:
>                 return mlx5e_setup_tc_mqprio(dev, type_data);
> --
> 2.22.0
>

^ permalink raw reply

* Re: Fw: [Bug 204099] New: systemd-networkd fails on 5.2 - same version works on 5.1.16
From: Leon Romanovsky @ 2019-07-11  5:54 UTC (permalink / raw)
  To: David Ahern; +Cc: Stephen Hemminger, netdev, Maxim Mikityanskiy
In-Reply-To: <37ee2993-f81b-6265-87b0-1179162f1a2d@gmail.com>

On Wed, Jul 10, 2019 at 04:43:18PM -0600, David Ahern wrote:
> On 7/9/19 8:43 AM, Stephen Hemminger wrote:
> > Looks like the stricter netlink validation broke userspace.
> > This is bad.

Actually, the initial bug is in systemd, and that is where it should be fixed.

>
> I believe other reports have traced this to
>
> commit 7dc2bccab0ee37ac28096b8fcdc390a679a15841
> Author: Maxim Mikityanskiy <maximmi@mellanox.com>
> Date:   Tue May 21 06:40:04 2019 +0000
>
>     Validate required parameters in inet6_validate_link_af

^ permalink raw reply

* Re: linux-next: build failure after merge of the net-next tree
From: Leon Romanovsky @ 2019-07-11  5:40 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Jason Gunthorpe, Bernard Metzler, Doug Ledford, David Miller,
	Networking, Linux Next Mailing List, Linux Kernel Mailing List
In-Reply-To: <20190711131603.6b11b831@canb.auug.org.au>

On Thu, Jul 11, 2019 at 01:16:03PM +1000, Stephen Rothwell wrote:
> Hi all,
>
> On Thu, 11 Jul 2019 13:13:44 +1000 Stephen Rothwell <sfr@canb.auug.org.au> wrote:
> >
> > On Thu, 11 Jul 2019 02:26:27 +0000 Jason Gunthorpe <jgg@mellanox.com> wrote:
> > >
> > > On Thu, Jul 11, 2019 at 11:50:54AM +1000, Stephen Rothwell wrote:
> > >
> > > > So today this failed to build after I merged the rdma tree (previously
> > > > > it didn't until after the net-next tree was merged; I assume a
> > > > dependency changed).  It failed because in_dev_for_each_ifa_rcu (and
> > > > in_dev_for_each_ifa_rtnl) is only defined in a commit in the net-next
> > > > tree :-(
> > >
> > > ? I'm confused..
> > >
> > > rdma.git builds fine stand alone (I hope!)
> >
> > I have "Fixup to build SIW issue" from Leon (which switches to using
> > in_dev_for_each_ifa_rcu) included in the rdma tree merge commit because
> > without that the rdma tree would not build for me.  Are you saying that
> > I don't need that at all, now?
>
> Actually, I get it now, "Fixup to build SIW issue" is really just a
> fixup for the net-next and rdma trees merge ... OK, I will fix that up
> tomorrow.  Sorry for my confusion.

Yes, it was for build only.

>
> --
> Cheers,
> Stephen Rothwell



^ permalink raw reply

* Re: [PATCH v4 2/2] rtw88: pci: Use DMA sync instead of remapping in RX ISR
From: Jian-Hong Pan @ 2019-07-11  5:30 UTC (permalink / raw)
  To: Yan-Hsuan Chuang, Kalle Valo, David S . Miller, Larry Finger,
	David Laight, Christoph Hellwig
  Cc: linux-wireless, Linux Netdev List, Linux Kernel,
	Linux Upstreaming Team, Daniel Drake, stable
In-Reply-To: <20190711052427.5582-2-jian-hong@endlessm.com>

Jian-Hong Pan <jian-hong@endlessm.com> 於 2019年7月11日 週四 下午1:25寫道：
>
> Since each skb in RX ring is reused instead of new allocation, we can
> treat the DMA in a more efficient way by DMA synchronization.
>
> Signed-off-by: Jian-Hong Pan <jian-hong@endlessm.com>
> Cc: <stable@vger.kernel.org>
> ---

Sorry, I also forgot to include the version changelog here.

v2:
 - New patch by following [PATCH v3 1/2] rtw88: pci: Rearrange the
   memory usage for skb in RX ISR.

v3:
 - Remove rtw_pci_sync_rx_desc_cpu and call dma_sync_single_for_cpu in
   rtw_pci_rx_isr directly.
 - Remove the return value of rtw_pci_sync_rx_desc_device.
 - Use DMA_FROM_DEVICE instead of PCI_DMA_FROMDEVICE.

v4:
 - Same as v3.

>  drivers/net/wireless/realtek/rtw88/pci.c | 24 +++++++++++++++++++++---
>  1 file changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c
> index c415f5e94fed..68fae52151dd 100644
> --- a/drivers/net/wireless/realtek/rtw88/pci.c
> +++ b/drivers/net/wireless/realtek/rtw88/pci.c
> @@ -206,6 +206,23 @@ static int rtw_pci_reset_rx_desc(struct rtw_dev *rtwdev, struct sk_buff *skb,
>         return 0;
>  }
>
> +static void rtw_pci_sync_rx_desc_device(struct rtw_dev *rtwdev, dma_addr_t dma,
> +                                       struct rtw_pci_rx_ring *rx_ring,
> +                                       u32 idx, u32 desc_sz)
> +{
> +       struct device *dev = rtwdev->dev;
> +       struct rtw_pci_rx_buffer_desc *buf_desc;
> +       int buf_sz = RTK_PCI_RX_BUF_SIZE;
> +
> +       dma_sync_single_for_device(dev, dma, buf_sz, DMA_FROM_DEVICE);
> +
> +       buf_desc = (struct rtw_pci_rx_buffer_desc *)(rx_ring->r.head +
> +                                                    idx * desc_sz);
> +       memset(buf_desc, 0, sizeof(*buf_desc));
> +       buf_desc->buf_size = cpu_to_le16(RTK_PCI_RX_BUF_SIZE);
> +       buf_desc->dma = cpu_to_le32(dma);
> +}
> +
>  static int rtw_pci_init_rx_ring(struct rtw_dev *rtwdev,
>                                 struct rtw_pci_rx_ring *rx_ring,
>                                 u8 desc_size, u32 len)
> @@ -782,8 +799,8 @@ static void rtw_pci_rx_isr(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci,
>                 rtw_pci_dma_check(rtwdev, ring, cur_rp);
>                 skb = ring->buf[cur_rp];
>                 dma = *((dma_addr_t *)skb->cb);
> -               pci_unmap_single(rtwpci->pdev, dma, RTK_PCI_RX_BUF_SIZE,
> -                                PCI_DMA_FROMDEVICE);
> +               dma_sync_single_for_cpu(rtwdev->dev, dma, RTK_PCI_RX_BUF_SIZE,
> +                                       DMA_FROM_DEVICE);
>                 rx_desc = skb->data;
>                 chip->ops->query_rx_desc(rtwdev, rx_desc, &pkt_stat, &rx_status);
>
> @@ -818,7 +835,8 @@ static void rtw_pci_rx_isr(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci,
>
>  next_rp:
>                 /* new skb delivered to mac80211, re-enable original skb DMA */
> -               rtw_pci_reset_rx_desc(rtwdev, skb, ring, cur_rp, buf_desc_sz);
> +               rtw_pci_sync_rx_desc_device(rtwdev, dma, ring, cur_rp,
> +                                           buf_desc_sz);
>
>                 /* host read next element in ring */
>                 if (++cur_rp >= ring->r.len)
> --
> 2.22.0
>

^ permalink raw reply

* Re: [PATCH v4 1/2] rtw88: pci: Rearrange the memory usage for skb in RX ISR
From: Jian-Hong Pan @ 2019-07-11  5:28 UTC (permalink / raw)
  To: Yan-Hsuan Chuang, Kalle Valo, David S . Miller, Larry Finger,
	David Laight, Christoph Hellwig
  Cc: linux-wireless, Linux Netdev List, Linux Kernel,
	Linux Upstreaming Team, Daniel Drake, stable
In-Reply-To: <20190711052427.5582-1-jian-hong@endlessm.com>

Jian-Hong Pan <jian-hong@endlessm.com> 於 2019年7月11日 週四 下午1:25寫道：
>
> Testing with RTL8822BE hardware, when available memory is low, we
> frequently see a kernel panic and system freeze.
>
> First, rtw_pci_rx_isr encounters a memory allocation failure (trimmed):
>
> rx routine starvation
> WARNING: CPU: 7 PID: 9871 at drivers/net/wireless/realtek/rtw88/pci.c:822 rtw_pci_rx_isr.constprop.25+0x35a/0x370 [rtwpci]
> [ 2356.580313] RIP: 0010:rtw_pci_rx_isr.constprop.25+0x35a/0x370 [rtwpci]
>
> Then we see a variety of different error conditions and kernel panics,
> such as this one (trimmed):
>
> rtw_pci 0000:02:00.0: pci bus timeout, check dma status
> skbuff: skb_over_panic: text:00000000091b6e66 len:415 put:415 head:00000000d2880c6f data:000000007a02b1ea tail:0x1df end:0xc0 dev:<NULL>
> ------------[ cut here ]------------
> kernel BUG at net/core/skbuff.c:105!
> invalid opcode: 0000 [#1] SMP NOPTI
> RIP: 0010:skb_panic+0x43/0x45
>
> When skb allocation fails and the "rx routine starvation" is hit, the
> function returns immediately without updating the RX ring. At this
> point, the RX ring may continue referencing an old skb which was already
> handed off to ieee80211_rx_irqsafe(). When it comes to be used again,
> bad things happen.
>
> This patch allocates a new, data-sized skb first in RX ISR. After
> copying the data in, we pass it to the upper layers. However, if skb
> allocation fails, we effectively drop the frame. In both cases, the
> original, full size ring skb is reused.
>
> In addition to fixing the kernel crash, the RX routine should now
> generally behave better under low memory conditions.
>
> Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=204053
> Signed-off-by: Jian-Hong Pan <jian-hong@endlessm.com>
> Cc: <stable@vger.kernel.org>
> ---

Sorry, I forgot to include the version changelog here.

v2:
 - Allocate new data-sized skb and put data into it, then pass it to
   mac80211. Reuse the original skb in RX ring by DMA sync.
 - Modify the commit message.
 - Introduce following [PATCH v3 2/2] rtw88: pci: Use DMA sync instead
   of remapping in RX ISR.

v3:
 - Same as v2.

v4:
 - Fix comment: allocate a new skb for this frame, discard the frame
if none available

>  drivers/net/wireless/realtek/rtw88/pci.c | 49 +++++++++++-------------
>  1 file changed, 22 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c
> index cfe05ba7280d..c415f5e94fed 100644
> --- a/drivers/net/wireless/realtek/rtw88/pci.c
> +++ b/drivers/net/wireless/realtek/rtw88/pci.c
> @@ -763,6 +763,7 @@ static void rtw_pci_rx_isr(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci,
>         u32 pkt_offset;
>         u32 pkt_desc_sz = chip->rx_pkt_desc_sz;
>         u32 buf_desc_sz = chip->rx_buf_desc_sz;
> +       u32 new_len;
>         u8 *rx_desc;
>         dma_addr_t dma;
>
> @@ -790,40 +791,34 @@ static void rtw_pci_rx_isr(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci,
>                 pkt_offset = pkt_desc_sz + pkt_stat.drv_info_sz +
>                              pkt_stat.shift;
>
> -               if (pkt_stat.is_c2h) {
> -                       /* keep rx_desc, halmac needs it */
> -                       skb_put(skb, pkt_stat.pkt_len + pkt_offset);
> +               /* allocate a new skb for this frame,
> +                * discard the frame if none available
> +                */
> +               new_len = pkt_stat.pkt_len + pkt_offset;
> +               new = dev_alloc_skb(new_len);
> +               if (WARN_ONCE(!new, "rx routine starvation\n"))
> +                       goto next_rp;
> +
> +               /* put the DMA data including rx_desc from phy to new skb */
> +               skb_put_data(new, skb->data, new_len);
>
> -                       /* pass offset for further operation */
> -                       *((u32 *)skb->cb) = pkt_offset;
> -                       skb_queue_tail(&rtwdev->c2h_queue, skb);
> +               if (pkt_stat.is_c2h) {
> +                        /* pass rx_desc & offset for further operation */
> +                       *((u32 *)new->cb) = pkt_offset;
> +                       skb_queue_tail(&rtwdev->c2h_queue, new);
>                         ieee80211_queue_work(rtwdev->hw, &rtwdev->c2h_work);
>                 } else {
> -                       /* remove rx_desc, maybe use skb_pull? */
> -                       skb_put(skb, pkt_stat.pkt_len);
> -                       skb_reserve(skb, pkt_offset);
> -
> -                       /* alloc a smaller skb to mac80211 */
> -                       new = dev_alloc_skb(pkt_stat.pkt_len);
> -                       if (!new) {
> -                               new = skb;
> -                       } else {
> -                               skb_put_data(new, skb->data, skb->len);
> -                               dev_kfree_skb_any(skb);
> -                       }
> -                       /* TODO: merge into rx.c */
> -                       rtw_rx_stats(rtwdev, pkt_stat.vif, skb);
> +                       /* remove rx_desc */
> +                       skb_pull(new, pkt_offset);
> +
> +                       rtw_rx_stats(rtwdev, pkt_stat.vif, new);
>                         memcpy(new->cb, &rx_status, sizeof(rx_status));
>                         ieee80211_rx_irqsafe(rtwdev->hw, new);
>                 }
>
> -               /* skb delivered to mac80211, alloc a new one in rx ring */
> -               new = dev_alloc_skb(RTK_PCI_RX_BUF_SIZE);
> -               if (WARN(!new, "rx routine starvation\n"))
> -                       return;
> -
> -               ring->buf[cur_rp] = new;
> -               rtw_pci_reset_rx_desc(rtwdev, new, ring, cur_rp, buf_desc_sz);
> +next_rp:
> +               /* new skb delivered to mac80211, re-enable original skb DMA */
> +               rtw_pci_reset_rx_desc(rtwdev, skb, ring, cur_rp, buf_desc_sz);
>
>                 /* host read next element in ring */
>                 if (++cur_rp >= ring->r.len)
> --
> 2.22.0
>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox