Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next iproute2 v2 1/2] devlink: Update kernel header to commit
From: Parav Pandit @ 2019-07-10 12:39 UTC (permalink / raw)
  To: netdev; +Cc: stephen, dsahern, jiri, Parav Pandit
In-Reply-To: <20190701183017.25407-1-parav@mellanox.com>

Update kernel header to commit:
e41b6bf3cdd4 ("devlink: Introduce PCI VF port flavour and port attribute")

Signed-off-by: Parav Pandit <parav@mellanox.com>
---
 include/uapi/linux/devlink.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 6544824a..fc195cbd 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -169,6 +169,14 @@ enum devlink_port_flavour {
 	DEVLINK_PORT_FLAVOUR_DSA, /* Distributed switch architecture
 				   * interconnect port.
 				   */
+	DEVLINK_PORT_FLAVOUR_PCI_PF, /* Represents eswitch port for
+				      * the PCI PF. It is an internal
+				      * port that faces the PCI PF.
+				      */
+	DEVLINK_PORT_FLAVOUR_PCI_VF, /* Represents eswitch port
+				      * for the PCI VF. It is an internal
+				      * port that faces the PCI VF.
+				      */
 };
 
 enum devlink_param_cmode {
@@ -337,6 +345,9 @@ enum devlink_attr {
 	DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,	/* u64 */
 	DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,	/* u64 */
 
+	DEVLINK_ATTR_PORT_PCI_PF_NUMBER,	/* u16 */
+	DEVLINK_ATTR_PORT_PCI_VF_NUMBER,	/* u16 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
-- 
2.19.2


^ permalink raw reply related

* [PATCH net-next iproute2 v2 2/2] devlink: Introduce PCI PF and VF port flavour and attribute
From: Parav Pandit @ 2019-07-10 12:39 UTC (permalink / raw)
  To: netdev; +Cc: stephen, dsahern, jiri, Parav Pandit
In-Reply-To: <20190710123952.6877-1-parav@mellanox.com>

Introduce PCI PF and VF port flavour and port attributes such as PF
number and VF number.

$ devlink port show
pci/0000:05:00.0/0: type eth netdev eth0 flavour pcipf pfnum 0
pci/0000:05:00.0/1: type eth netdev eth1 flavour pcivf pfnum 0 vfnum 0
pci/0000:05:00.0/2: type eth netdev eth2 flavour pcivf pfnum 0 vfnum 1

Signed-off-by: Parav Pandit <parav@mellanox.com>
---
Changelog:
v1->v2:
 - Instead of if-else using switch-case.
 - Split patch to two patches to have kernel header update in dedicated
   patch.
---
 devlink/devlink.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index ac8c0fb1..d8197ea3 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -2794,11 +2794,29 @@ static const char *port_flavour_name(uint16_t flavour)
 		return "cpu";
 	case DEVLINK_PORT_FLAVOUR_DSA:
 		return "dsa";
+	case DEVLINK_PORT_FLAVOUR_PCI_PF:
+		return "pcipf";
+	case DEVLINK_PORT_FLAVOUR_PCI_VF:
+		return "pcivf";
 	default:
 		return "<unknown flavour>";
 	}
 }
 
+static void pr_out_port_pfvf_num(struct dl *dl, struct nlattr **tb)
+{
+	uint16_t fn_num;
+
+	if (tb[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) {
+		fn_num = mnl_attr_get_u16(tb[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]);
+		pr_out_uint(dl, "pfnum", fn_num);
+	}
+	if (tb[DEVLINK_ATTR_PORT_PCI_VF_NUMBER]) {
+		fn_num = mnl_attr_get_u16(tb[DEVLINK_ATTR_PORT_PCI_VF_NUMBER]);
+		pr_out_uint(dl, "vfnum", fn_num);
+	}
+}
+
 static void pr_out_port(struct dl *dl, struct nlattr **tb)
 {
 	struct nlattr *pt_attr = tb[DEVLINK_ATTR_PORT_TYPE];
@@ -2828,6 +2846,15 @@ static void pr_out_port(struct dl *dl, struct nlattr **tb)
 				mnl_attr_get_u16(tb[DEVLINK_ATTR_PORT_FLAVOUR]);
 
 		pr_out_str(dl, "flavour", port_flavour_name(port_flavour));
+
+		switch (port_flavour) {
+		case DEVLINK_PORT_FLAVOUR_PCI_PF:
+		case DEVLINK_PORT_FLAVOUR_PCI_VF:
+			pr_out_port_pfvf_num(dl, tb);
+			break;
+		default:
+			break;
+		}
 	}
 	if (tb[DEVLINK_ATTR_PORT_NUMBER]) {
 		uint32_t port_number;
-- 
2.19.2


^ permalink raw reply related

* [PATCH iproute2-next v2 0/3] add interface to TC MPLS actions
From: John Hurley @ 2019-07-10 12:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, jiri, xiyou.wangcong, dsahern, willemdebruijn.kernel,
	stephen, simon.horman, jakub.kicinski, oss-drivers, John Hurley

Recent kernel additions to TC allows the manipulation of MPLS headers as
filter actions.

The following patchset creates an iproute2 interface to the new actions
and includes documentation on how to use it.

v1->v2:
- change error from print_string() to fprintf(strerr,) (Stephen Hemminger)
- split long line in explain() message (David Ahern)
- use _SL_ instead of /n in print message (David Ahern)

John Hurley (3):
  lib: add mpls_uc and mpls_mc as link layer protocol names
  tc: add mpls actions
  man: update man pages for TC MPLS actions

 lib/ll_proto.c     |   2 +
 man/man8/tc-mpls.8 | 156 ++++++++++++++++++++++++++++++
 tc/Makefile        |   1 +
 tc/m_mpls.c        | 276 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 435 insertions(+)
 create mode 100644 man/man8/tc-mpls.8
 create mode 100644 tc/m_mpls.c

-- 
2.7.4


^ permalink raw reply

* [PATCH iproute2-next v2 1/3] lib: add mpls_uc and mpls_mc as link layer protocol names
From: John Hurley @ 2019-07-10 12:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, jiri, xiyou.wangcong, dsahern, willemdebruijn.kernel,
	stephen, simon.horman, jakub.kicinski, oss-drivers, John Hurley
In-Reply-To: <1562762440-25656-1-git-send-email-john.hurley@netronome.com>

Update the llproto_names array to allow users to reference the mpls
protocol ids with the names 'mpls_uc' for unicast MPLS and 'mpls_mc' for
multicast.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 lib/ll_proto.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/ll_proto.c b/lib/ll_proto.c
index 78c3961..2a0c1cb 100644
--- a/lib/ll_proto.c
+++ b/lib/ll_proto.c
@@ -78,6 +78,8 @@ __PF(TIPC,tipc)
 __PF(AOE,aoe)
 __PF(8021Q,802.1Q)
 __PF(8021AD,802.1ad)
+__PF(MPLS_UC,mpls_uc)
+__PF(MPLS_MC,mpls_mc)
 
 { 0x8100, "802.1Q" },
 { 0x88cc, "LLDP" },
-- 
2.7.4


^ permalink raw reply related

* [PATCH iproute2-next v2 2/3] tc: add mpls actions
From: John Hurley @ 2019-07-10 12:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, jiri, xiyou.wangcong, dsahern, willemdebruijn.kernel,
	stephen, simon.horman, jakub.kicinski, oss-drivers, John Hurley
In-Reply-To: <1562762440-25656-1-git-send-email-john.hurley@netronome.com>

Create a new action type for TC that allows the pushing, popping, and
modifying of MPLS headers.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 tc/Makefile |   1 +
 tc/m_mpls.c | 276 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 277 insertions(+)
 create mode 100644 tc/m_mpls.c

diff --git a/tc/Makefile b/tc/Makefile
index 60abdde..09ff369 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -39,6 +39,7 @@ TCMODULES += q_drr.o
 TCMODULES += q_qfq.o
 TCMODULES += m_gact.o
 TCMODULES += m_mirred.o
+TCMODULES += m_mpls.o
 TCMODULES += m_nat.o
 TCMODULES += m_pedit.o
 TCMODULES += m_ife.o
diff --git a/tc/m_mpls.c b/tc/m_mpls.c
new file mode 100644
index 0000000..0ae6d62
--- /dev/null
+++ b/tc/m_mpls.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#include <linux/if_ether.h>
+#include <linux/tc_act/tc_mpls.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "utils.h"
+#include "rt_names.h"
+#include "tc_util.h"
+
+static const char * const action_names[] = {
+	[TCA_MPLS_ACT_POP] = "pop",
+	[TCA_MPLS_ACT_PUSH] = "push",
+	[TCA_MPLS_ACT_MODIFY] = "modify",
+	[TCA_MPLS_ACT_DEC_TTL] = "dec_ttl",
+};
+
+static void explain(void)
+{
+	fprintf(stderr,
+		"Usage: mpls pop [ protocol MPLS_PROTO ]\n"
+		"       mpls push [ protocol MPLS_PROTO ] [ label MPLS_LABEL ] [ tc MPLS_TC ]\n"
+		"                 [ ttl MPLS_TTL ] [ bos MPLS_BOS ] [CONTROL]\n"
+		"       mpls modify [ label MPLS_LABEL ] [ tc MPLS_TC ] [ ttl MPLS_TTL ] [CONTROL]\n"
+		"           for pop MPLS_PROTO is next header of packet - e.g. ip or mpls_uc\n"
+		"           for push MPLS_PROTO is one of mpls_uc or mpls_mc\n"
+		"               with default: mpls_uc\n"
+		"       CONTROL := reclassify | pipe | drop | continue | pass |\n"
+		"                  goto chain <CHAIN_INDEX>\n");
+}
+
+static void usage(void)
+{
+	explain();
+	exit(-1);
+}
+
+static bool can_modify_mpls_fields(unsigned int action)
+{
+	return action == TCA_MPLS_ACT_PUSH || action == TCA_MPLS_ACT_MODIFY;
+}
+
+static bool can_modify_ethtype(unsigned int action)
+{
+	return action == TCA_MPLS_ACT_PUSH || action == TCA_MPLS_ACT_POP;
+}
+
+static bool is_valid_label(__u32 label)
+{
+	return label <= 0xfffff;
+}
+
+static bool check_double_action(unsigned int action, const char *arg)
+{
+	if (!action)
+		return false;
+
+	fprintf(stderr,
+		"Error: got \"%s\" but action already set to \"%s\"\n",
+		arg, action_names[action]);
+	explain();
+	return true;
+}
+
+static int parse_mpls(struct action_util *a, int *argc_p, char ***argv_p,
+		      int tca_id, struct nlmsghdr *n)
+{
+	struct tc_mpls parm = {};
+	__u32 label = 0xffffffff;
+	unsigned int action = 0;
+	char **argv = *argv_p;
+	struct rtattr *tail;
+	int argc = *argc_p;
+	__u16 proto = 0;
+	__u8 bos = 0xff;
+	__u8 tc = 0xff;
+	__u8 ttl = 0;
+
+	if (matches(*argv, "mpls") != 0)
+		return -1;
+
+	NEXT_ARG();
+
+	while (argc > 0) {
+		if (matches(*argv, "pop") == 0) {
+			if (check_double_action(action, *argv))
+				return -1;
+			action = TCA_MPLS_ACT_POP;
+		} else if (matches(*argv, "push") == 0) {
+			if (check_double_action(action, *argv))
+				return -1;
+			action = TCA_MPLS_ACT_PUSH;
+		} else if (matches(*argv, "modify") == 0) {
+			if (check_double_action(action, *argv))
+				return -1;
+			action = TCA_MPLS_ACT_MODIFY;
+		} else if (matches(*argv, "dec_ttl") == 0) {
+			if (check_double_action(action, *argv))
+				return -1;
+			action = TCA_MPLS_ACT_DEC_TTL;
+		} else if (matches(*argv, "label") == 0) {
+			if (!can_modify_mpls_fields(action))
+				invarg("only valid for push/modify", *argv);
+			NEXT_ARG();
+			if (get_u32(&label, *argv, 0) || !is_valid_label(label))
+				invarg("label must be <=0xFFFFF", *argv);
+		} else if (matches(*argv, "tc") == 0) {
+			if (!can_modify_mpls_fields(action))
+				invarg("only valid for push/modify", *argv);
+			NEXT_ARG();
+			if (get_u8(&tc, *argv, 0) || (tc & ~0x7))
+				invarg("tc field is 3 bits max", *argv);
+		} else if (matches(*argv, "ttl") == 0) {
+			if (!can_modify_mpls_fields(action))
+				invarg("only valid for push/modify", *argv);
+			NEXT_ARG();
+			if (get_u8(&ttl, *argv, 0) || !ttl)
+				invarg("ttl must be >0 and <=255", *argv);
+		} else if (matches(*argv, "bos") == 0) {
+			if (!can_modify_mpls_fields(action))
+				invarg("only valid for push/modify", *argv);
+			NEXT_ARG();
+			if (get_u8(&bos, *argv, 0) || (bos & ~0x1))
+				invarg("bos must be 0 or 1", *argv);
+		} else if (matches(*argv, "protocol") == 0) {
+			if (!can_modify_ethtype(action))
+				invarg("only valid for push/pop", *argv);
+			NEXT_ARG();
+			if (ll_proto_a2n(&proto, *argv))
+				invarg("protocol is invalid", *argv);
+		} else if (matches(*argv, "help") == 0) {
+			usage();
+		} else {
+			break;
+		}
+
+		NEXT_ARG_FWD();
+	}
+
+	if (!action)
+		incomplete_command();
+
+	parse_action_control_dflt(&argc, &argv, &parm.action,
+				  false, TC_ACT_PIPE);
+
+	if (argc) {
+		if (matches(*argv, "index") == 0) {
+			NEXT_ARG();
+			if (get_u32(&parm.index, *argv, 10))
+				invarg("illegal index", *argv);
+			NEXT_ARG_FWD();
+		}
+	}
+
+	if (action == TCA_MPLS_ACT_PUSH && !label)
+		missarg("label");
+
+	if (action == TCA_MPLS_ACT_PUSH && proto &&
+	    proto != htons(ETH_P_MPLS_UC) && proto != htons(ETH_P_MPLS_MC)) {
+		fprintf(stderr,
+			"invalid push protocol \"0x%04x\" - use mpls_(uc|mc)\n",
+			ntohs(proto));
+		return -1;
+	}
+
+	if (action == TCA_MPLS_ACT_POP && !proto)
+		missarg("protocol");
+
+	parm.m_action = action;
+	tail = addattr_nest(n, MAX_MSG, tca_id | NLA_F_NESTED);
+	addattr_l(n, MAX_MSG, TCA_MPLS_PARMS, &parm, sizeof(parm));
+	if (label != 0xffffffff)
+		addattr_l(n, MAX_MSG, TCA_MPLS_LABEL, &label, sizeof(label));
+	if (proto)
+		addattr_l(n, MAX_MSG, TCA_MPLS_PROTO, &proto, sizeof(proto));
+	if (tc != 0xff)
+		addattr8(n, MAX_MSG, TCA_MPLS_TC, tc);
+	if (ttl)
+		addattr8(n, MAX_MSG, TCA_MPLS_TTL, ttl);
+	if (bos != 0xff)
+		addattr8(n, MAX_MSG, TCA_MPLS_BOS, bos);
+	addattr_nest_end(n, tail);
+
+	*argc_p = argc;
+	*argv_p = argv;
+	return 0;
+}
+
+static int print_mpls(struct action_util *au, FILE *f, struct rtattr *arg)
+{
+	struct rtattr *tb[TCA_MPLS_MAX + 1];
+	struct tc_mpls *parm;
+	SPRINT_BUF(b1);
+	__u32 val;
+
+	if (!arg)
+		return -1;
+
+	parse_rtattr_nested(tb, TCA_MPLS_MAX, arg);
+
+	if (!tb[TCA_MPLS_PARMS]) {
+		fprintf(stderr, "[NULL mpls parameters]\n");
+		return -1;
+	}
+	parm = RTA_DATA(tb[TCA_MPLS_PARMS]);
+
+	print_string(PRINT_ANY, "kind", "%s ", "mpls");
+	print_string(PRINT_ANY, "mpls_action", " %s",
+		     action_names[parm->m_action]);
+
+	switch (parm->m_action) {
+	case TCA_MPLS_ACT_POP:
+		if (tb[TCA_MPLS_PROTO]) {
+			__u16 proto;
+
+			proto = rta_getattr_u16(tb[TCA_MPLS_PROTO]);
+			print_string(PRINT_ANY, "protocol", " protocol %s",
+				     ll_proto_n2a(proto, b1, sizeof(b1)));
+		}
+		break;
+	case TCA_MPLS_ACT_PUSH:
+		if (tb[TCA_MPLS_PROTO]) {
+			__u16 proto;
+
+			proto = rta_getattr_u16(tb[TCA_MPLS_PROTO]);
+			print_string(PRINT_ANY, "protocol", " protocol %s",
+				     ll_proto_n2a(proto, b1, sizeof(b1)));
+		}
+		/* Fallthrough */
+	case TCA_MPLS_ACT_MODIFY:
+		if (tb[TCA_MPLS_LABEL]) {
+			val = rta_getattr_u32(tb[TCA_MPLS_LABEL]);
+			print_uint(PRINT_ANY, "label", " label %u", val);
+		}
+		if (tb[TCA_MPLS_TC]) {
+			val = rta_getattr_u8(tb[TCA_MPLS_TC]);
+			print_uint(PRINT_ANY, "tc", " tc %u", val);
+		}
+		if (tb[TCA_MPLS_BOS]) {
+			val = rta_getattr_u8(tb[TCA_MPLS_BOS]);
+			print_uint(PRINT_ANY, "bos", " bos %u", val);
+		}
+		if (tb[TCA_MPLS_TTL]) {
+			val = rta_getattr_u8(tb[TCA_MPLS_TTL]);
+			print_uint(PRINT_ANY, "ttl", " ttl %u", val);
+		}
+		break;
+	}
+	print_action_control(f, " ", parm->action, "");
+
+	print_uint(PRINT_ANY, "index", "\n\t index %u", parm->index);
+	print_int(PRINT_ANY, "ref", " ref %d", parm->refcnt);
+	print_int(PRINT_ANY, "bind", " bind %d", parm->bindcnt);
+
+	if (show_stats) {
+		if (tb[TCA_MPLS_TM]) {
+			struct tcf_t *tm = RTA_DATA(tb[TCA_MPLS_TM]);
+
+			print_tm(f, tm);
+		}
+	}
+
+	print_string(PRINT_FP, NULL, "%s", _SL_);
+
+	return 0;
+}
+
+struct action_util mpls_action_util = {
+	.id = "mpls",
+	.parse_aopt = parse_mpls,
+	.print_aopt = print_mpls,
+};
-- 
2.7.4


^ permalink raw reply related

* [PATCH iproute2-next v2 3/3] man: update man pages for TC MPLS actions
From: John Hurley @ 2019-07-10 12:40 UTC (permalink / raw)
  To: netdev
  Cc: davem, jiri, xiyou.wangcong, dsahern, willemdebruijn.kernel,
	stephen, simon.horman, jakub.kicinski, oss-drivers, John Hurley
In-Reply-To: <1562762440-25656-1-git-send-email-john.hurley@netronome.com>

Add a man page describing the newly added TC mpls manipulation actions.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 man/man8/tc-mpls.8 | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 man/man8/tc-mpls.8

diff --git a/man/man8/tc-mpls.8 b/man/man8/tc-mpls.8
new file mode 100644
index 0000000..84ef2ef
--- /dev/null
+++ b/man/man8/tc-mpls.8
@@ -0,0 +1,156 @@
+.TH "MPLS manipulation action in tc" 8 "22 May 2019" "iproute2" "Linux"
+
+.SH NAME
+mpls - mpls manipulation module
+.SH SYNOPSIS
+.in +8
+.ti -8
+.BR tc " ... " "action mpls" " { "
+.IR POP " | " PUSH " | " MODIFY " | "
+.BR dec_ttl " } [ "
+.IR CONTROL " ]"
+
+.ti -8
+.IR POP " := "
+.BR pop " " protocol
+.IR MPLS_PROTO
+
+.ti -8
+.IR PUSH " := "
+.BR push " [ " protocol
+.IR MPLS_PROTO " ]"
+.RB " [ " tc
+.IR MPLS_TC " ] "
+.RB " [ " ttl
+.IR MPLS_TTL " ] "
+.RB " [ " bos
+.IR MPLS_BOS " ] "
+.BI label " MPLS_LABEL"
+
+.ti -8
+.IR MODIFY " := "
+.BR modify " [ " label
+.IR MPLS_LABEL " ]"
+.RB " [ " tc
+.IR MPLS_TC " ] "
+.RB " [ " ttl
+.IR MPLS_TTL " ] "
+
+.ti -8
+.IR CONTROL " := { "
+.BR reclassify " | " pipe " | " drop " | " continue " | " pass " | " goto " " chain " " CHAIN_INDEX " }"
+.SH DESCRIPTION
+The
+.B mpls
+action performs mpls encapsulation or decapsulation on a packet, reflected by the
+operation modes
+.IR POP ", " PUSH ", " MODIFY " and " DEC_TTL .
+The
+.I POP
+mode requires the ethertype of the header that follows the MPLS header (e.g.
+IPv4 or another MPLS). It will remove the outer MPLS header and replace the
+ethertype in the MAC header with that passed. The
+.IR PUSH " and " MODIFY
+modes update the current MPLS header information or add a new header.
+.IR PUSH
+requires at least an
+.IR MPLS_LABEL ". "
+.I DEC_TTL
+requires no arguments and simply subtracts 1 from the MPLS header TTL field.
+
+.SH OPTIONS
+.TP
+.B pop
+Decapsulation mode. Requires the protocol of the next header.
+.TP
+.B push
+Encapsulation mode. Requires at least the
+.B label
+option.
+.TP
+.B modify
+Replace mode. Existing MPLS tag is replaced.
+.BR label ", "
+.BR tc ", "
+and
+.B ttl
+are all optional.
+.TP
+.B dec_ttl
+Decrement the TTL field on the outer most MPLS header.
+.TP
+.BI label " MPLS_LABEL"
+Specify the MPLS LABEL for the outer MPLS header.
+.I MPLS_LABEL
+is an unsigned 20bit integer, the format is detected automatically (e.g. prefix
+with
+.RB ' 0x '
+for hexadecimal interpretation, etc.).
+.TP
+.BI protocol " MPLS_PROTO"
+Choose the protocol to use. For push actions this must be
+.BR mpls_uc " or " mpls_mc " (" mpls_uc
+is the default). For pop actions it should be the protocol of the next header.
+This option cannot be used with modify.
+.TP
+.BI tc " MPLS_TC"
+Choose the TC value for the outer MPLS header. Decimal number in range of 0-7.
+Defaults to 0.
+.TP
+.BI ttl " MPLS_TTL"
+Choose the TTL value for the outer MPLS header. Number in range of 0-255. A
+non-zero default value will be selected if this is not explicitly set.
+.TP
+.BI bos " MPLS_BOS"
+Manually configure the bottom of stack bit for an MPLS header push. The default
+is for TC to automatically set (or unset) the bit based on the next header of
+the packet.
+.TP
+.I CONTROL
+How to continue after executing this action.
+.RS
+.TP
+.B reclassify
+Restarts classification by jumping back to the first filter attached to this
+action's parent.
+.TP
+.B pipe
+Continue with the next action, this is the default.
+.TP
+.B drop
+Packet will be dropped without running further actions.
+.TP
+.B continue
+Continue classification with next filter in line.
+.TP
+.B pass
+Return to calling qdisc for packet processing. This ends the classification
+process.
+.RE
+.SH EXAMPLES
+The following example encapsulates incoming IP packets on eth0 into MPLS with
+a label 123 and sends them out eth1:
+
+.RS
+.EX
+#tc qdisc add dev eth0 handle ffff: ingress
+#tc filter add dev eth0 protocol ip parent ffff: flower \\
+	action mpls push protocol mpls_uc label 123  \\
+	action mirred egress redirect dev eth1
+.EE
+.RE
+
+In this example, incoming MPLS unicast packets on eth0 are decapsulated and to
+ip packets and output to eth1:
+
+.RS
+.EX
+#tc qdisc add dev eth0 handle ffff: ingress
+#tc filter add dev eth0 protocol mpls_uc parent ffff: flower \\
+	action mpls pop protocol ipv4  \\
+	action mirred egress redirect dev eth0
+.EE
+.RE
+
+.SH SEE ALSO
+.BR tc (8)
-- 
2.7.4


^ permalink raw reply related

* RE: Question about linux kernel commit: "net/ipv6: move metrics from dst to rt6_info"
From: Jan Szewczyk @ 2019-07-10 12:59 UTC (permalink / raw)
  To: David Ahern, davem@davemloft.net; +Cc: netdev@vger.kernel.org
In-Reply-To: <cb0674df-8593-f14b-f680-ce278042c88c@gmail.com>

Hi!
I digged up a little further and maybe it's not a problem with MTU itself. I checked every entry I get from RTM_GETROUTE netlink message and after triggering "too big packet" by pinging ipv6address I get exactly the same messages on 4.12 and 4.18, except that the one with that pinged ipv6address is missing on 4.18 at all. What is weird - it's visible when running "ip route get to ipv6address". Do you know why there is a mismatch there?

It's not easy for me to check this behavior on 4.19, because we have a pretty complex system here, but maybe I could try to reproduce it locally on some virtual box and check if it behaves the same.

Thanks for the tip about the testing tools, I'll try to use them.

BR,
Jan Szewczyk

-----Original Message-----
From: David Ahern <dsahern@gmail.com> 
Sent: Wednesday, July 10, 2019 14:28
To: Jan Szewczyk <jan.szewczyk@ericsson.com>; davem@davemloft.net
Cc: netdev@vger.kernel.org
Subject: Re: Question about linux kernel commit: "net/ipv6: move metrics from dst to rt6_info"

[ adding netdev so others can chime in ]

On 7/10/19 2:28 AM, Jan Szewczyk wrote:
> Hi guys!
> 
> We can see different behavior of one of our commands that supposed to 
> show pmtu information.
> 
> It's using netlink message RTM_GETROUTE to get the information and in 
> Linux kernel version 4.12 after sending big packet (and triggering 
> "packet too big") there is an entry with PMTU and expiration time.
> 
> In the version 4.18 unfortunately the entry looks different and there 
> is no PMTU information.

Can you try with 4.19.58 (latest stable release for 4.19)? Perhaps there was a bugfix that is missing from 4.18.

The kernel has 2 commands under tools/testing/selftests/net -- pmtu.sh and icmp_redirect.sh -- that verify exceptions are created and use 'ip ro get' to verify the mtu.

> 
> I can see that in your commit
> https://protect2.fireeye.com/url?k=5be21a17-07361dbb-5be25a8c-8667c4af
> e13e-f99413291ecbed59&q=1&u=https%3A%2F%2Fgithub.com%2Ftorvalds%2Flinu
> x%2Fcommit%2Fd4ead6b34b67fd711639324b6465a050bcb197d4,
> these lines disappeared from route.c:
> 
>  
> 
>      if (rt->rt6i_pmtu)
> 
>            metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
> 
>  
> 
> I'm very beginner in linux kernel code, can you help me and tell me if 
> that could cause this different behavior?
> 
>  
> 
>  
> 
> BR,
> 
> Jan Szewczyk
> 

^ permalink raw reply

* Re: [PATCH net-next v6 06/15] ethtool: netlink bitset handling
From: Jiri Pirko @ 2019-07-10 12:59 UTC (permalink / raw)
  To: Michal Kubecek
  Cc: netdev, David Miller, Jakub Kicinski, Andrew Lunn,
	Florian Fainelli, John Linville, Stephen Hemminger, Johannes Berg,
	linux-kernel
In-Reply-To: <20190710123803.GB5700@unicorn.suse.cz>

Wed, Jul 10, 2019 at 02:38:03PM CEST, mkubecek@suse.cz wrote:
>On Tue, Jul 09, 2019 at 04:18:17PM +0200, Jiri Pirko wrote:
>> 
>> I understand. So how about avoid the bitfield all together and just
>> have array of either bits of strings or combinations?
>> 
>> ETHTOOL_CMD_SETTINGS_SET (U->K)
>>     ETHTOOL_A_HEADER
>>         ETHTOOL_A_DEV_NAME = "eth3"
>>     ETHTOOL_A_SETTINGS_PRIV_FLAGS
>>        ETHTOOL_A_SETTINGS_PRIV_FLAG
>>            ETHTOOL_A_FLAG_NAME = "legacy-rx"
>> 	   ETHTOOL_A_FLAG_VALUE   (NLA_FLAG)
>> 
>> or the same with index instead of string
>> 
>> ETHTOOL_CMD_SETTINGS_SET (U->K)
>>     ETHTOOL_A_HEADER
>>         ETHTOOL_A_DEV_NAME = "eth3"
>>     ETHTOOL_A_SETTINGS_PRIV_FLAGS
>>         ETHTOOL_A_SETTINGS_PRIV_FLAG
>>             ETHTOOL_A_FLAG_INDEX = 0
>>  	    ETHTOOL_A_FLAG_VALUE   (NLA_FLAG)
>> 
>> 
>> For set you can combine both when you want to set multiple bits:
>> 
>> ETHTOOL_CMD_SETTINGS_SET (U->K)
>>     ETHTOOL_A_HEADER
>>         ETHTOOL_A_DEV_NAME = "eth3"
>>     ETHTOOL_A_SETTINGS_PRIV_FLAGS
>>         ETHTOOL_A_SETTINGS_PRIV_FLAG
>>             ETHTOOL_A_FLAG_INDEX = 2
>>  	    ETHTOOL_A_FLAG_VALUE   (NLA_FLAG)
>>         ETHTOOL_A_SETTINGS_PRIV_FLAG
>>             ETHTOOL_A_FLAG_INDEX = 8
>>  	    ETHTOOL_A_FLAG_VALUE   (NLA_FLAG)
>>         ETHTOOL_A_SETTINGS_PRIV_FLAG
>>             ETHTOOL_A_FLAG_NAME = "legacy-rx"
>>  	    ETHTOOL_A_FLAG_VALUE   (NLA_FLAG)
>> 
>> 
>> For get this might be a bit bigger message:
>> 
>> ETHTOOL_CMD_SETTINGS_GET_REPLY (K->U)
>>     ETHTOOL_A_HEADER
>>         ETHTOOL_A_DEV_NAME = "eth3"
>>     ETHTOOL_A_SETTINGS_PRIV_FLAGS
>>         ETHTOOL_A_SETTINGS_PRIV_FLAG
>>             ETHTOOL_A_FLAG_INDEX = 0
>>             ETHTOOL_A_FLAG_NAME = "legacy-rx"
>>  	    ETHTOOL_A_FLAG_VALUE   (NLA_FLAG)
>>         ETHTOOL_A_SETTINGS_PRIV_FLAG
>>             ETHTOOL_A_FLAG_INDEX = 1
>>             ETHTOOL_A_FLAG_NAME = "vf-ipsec"
>>  	    ETHTOOL_A_FLAG_VALUE   (NLA_FLAG)
>>         ETHTOOL_A_SETTINGS_PRIV_FLAG
>>             ETHTOOL_A_FLAG_INDEX = 8
>>             ETHTOOL_A_FLAG_NAME = "something-else"
>>  	    ETHTOOL_A_FLAG_VALUE   (NLA_FLAG)
>
>This is perfect for "one shot" applications but not so much for long
>running ones, either "ethtool --monitor" or management or monitoring
>daemons. Repeating the names in every notification message would be
>a waste, it's much more convenient to load the strings only once and

Yeah, for those aplications, the ETHTOOL_A_FLAG_NAME could be omitted


>cache them. Even if we omit the names in notifications (and possibly the
>GET replies if client opts for it), this format still takes 12-16 bytes
>per bit.
>
>So the problem I'm trying to address is that there are two types of
>clients with very different mode of work and different preferences.
>
>Looking at the bitset.c, I would rather say that most of the complexity
>and ugliness comes from dealing with both unsigned long based bitmaps
>and u32 based ones. Originally, there were functions working with
>unsigned long based bitmaps and the variants with "32" suffix were
>wrappers around them which converted u32 bitmaps to unsigned long ones
>and back. This became a problem when kernel started issuing warnings
>about variable length arrays as getting rid of them meant two kmalloc()
>and two kfree() for each u32 bitmap operation, even if most of the
>bitmaps are in rather short in practice.
>
>Maybe the wrapper could do something like
>
>int ethnl_put_bitset32(const u32 *value, const u32 *mask,
>		       unsigned int size,  ...)
>{
>	unsigned long fixed_value[2], fixed_mask[2];
>	unsigned long *tmp_value = fixed_value;
>	unsigned long *tmp_mask = fixed_mask;
>
>	if (size > sizeof(fixed_value) * BITS_PER_BYTE) {
>		tmp_value = bitmap_alloc(size);
>		if (!tmp_value)
>			return -ENOMEM;
>		tmp_mask = bitmap_alloc(size);
>		if (!tmp_mask) {
>			kfree(tmp_value);
>			return -ENOMEM;
>		}
>	}
>
>	bitmap_from_arr32(tmp_value, value, size);
>	bitmap_from_arr32(tmp_mask, mask, size);
>	ret = ethnl_put_bitset(tmp_value, tmp_mask, size, ...);
>}
>
>This way we would make bitset.c code cleaner while avoiding allocating
>short bitmaps (which is the most common case). 

I'm primarily concerned about the uapi. Plus if the uapi approach is united
for both index and string, we can omit this whole bitset abomination...


>
>Michal

^ permalink raw reply

* [PATCH] [net-next] net/mlx5e: avoid uninitialized variable use
From: Arnd Bergmann @ 2019-07-10 13:06 UTC (permalink / raw)
  To: Saeed Mahameed, Leon Romanovsky, David S. Miller
  Cc: Arnd Bergmann, Tariq Toukan, Eran Ben Elisha, Boris Pismenny,
	netdev, linux-rdma, linux-kernel, clang-built-linux

clang points to a variable being used in an unexpected
code path:

drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c:251:2: warning: variable 'rec_seq_sz' is used uninitialized whenever switch default is taken [-Wsometimes-uninitialized]
        default:
        ^~~~~~~
drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c:255:46: note: uninitialized use occurs here
        skip_static_post = !memcmp(rec_seq, &rn_be, rec_seq_sz);
                                                    ^~~~~~~~~~

From looking at the function logic, it seems that there is no
sensible way to continue here, so just return early and hope
for the best.

Fixes: d2ead1f360e8 ("net/mlx5e: Add kTLS TX HW offload support")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
index 3f5f4317a22b..5c08891806f0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
@@ -250,6 +250,7 @@ tx_post_resync_params(struct mlx5e_txqsq *sq,
 	}
 	default:
 		WARN_ON(1);
+		return;
 	}
 
 	skip_static_post = !memcmp(rec_seq, &rn_be, rec_seq_sz);
-- 
2.20.0


^ permalink raw reply related

* RE: [PATCH] tipc: ensure skb->lock is initialised
From: Jon Maloy @ 2019-07-10 13:10 UTC (permalink / raw)
  To: Eric Dumazet, Chris Packham, ying.xue@windriver.com,
	davem@davemloft.net
  Cc: netdev@vger.kernel.org, tipc-discussion@lists.sourceforge.net,
	linux-kernel@vger.kernel.org
In-Reply-To: <e7606e76-8a0a-dab7-4561-f44f98d90164@gmail.com>



> -----Original Message-----
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Sent: 10-Jul-19 04:00
> To: Jon Maloy <jon.maloy@ericsson.com>; Eric Dumazet
> <eric.dumazet@gmail.com>; Chris Packham
> <Chris.Packham@alliedtelesis.co.nz>; ying.xue@windriver.com;
> davem@davemloft.net
> Cc: netdev@vger.kernel.org; tipc-discussion@lists.sourceforge.net; linux-
> kernel@vger.kernel.org
> Subject: Re: [PATCH] tipc: ensure skb->lock is initialised
> 
> 
> 
> On 7/9/19 10:15 PM, Jon Maloy wrote:
> >
> > It is not only for lockdep purposes, -it is essential.  But please provide details
> about where you see that more fixes are needed.
> >
> 
> Simple fact that you detect a problem only when skb_queue_purge() is called
> should talk by itself.
> 
> As I stated, there are many places where the list is manipulated _without_ its
> spinlock being held.

Yes, and that is the way it should be on the send path.

> 
> You want consistency, then
> 
> - grab the spinlock all the time.
> - Or do not ever use it.

That is exactly what we are doing. 
- The send path doesn't need the spinlock, and never grabs it.
- The receive path does need it, and always grabs it.

However, since we don't know from the beginning which path a created message will follow, we initialize the queue spinlock "just in case" when it is created, even though it may never be used later.
You can see this as a violation of the principle you are stating above, but it is a prize that is worth paying, given savings in code volume, complexity and performance.

> 
> Do not initialize the spinlock just in case a path will use skb_queue_purge()
> (instead of using __skb_queue_purge())

I am ok with that. I think we can agree that Chris goes for that solution, so we can get this bug fixed.

///jon



^ permalink raw reply

* [PATCH ipsec] xfrm interface: fix list corruption for x-netns
From: Nicolas Dichtel @ 2019-07-10 13:11 UTC (permalink / raw)
  To: steffen.klassert, davem; +Cc: netdev, Nicolas Dichtel, Julien Floret

dev_net(dev) is the netns of the device and xi->net is the link netns,
where the device has been linked.
changelink() must operate in the link netns to avoid a corruption of
the xfrm lists.

Note that xi->net and dev_net(xi->physdev) are always the same.

Before the patch, the xfrmi lists may be corrupted and can later trigger a
kernel panic.

Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces")
Reported-by: Julien Floret <julien.floret@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Tested-by: Julien Floret <julien.floret@6wind.com>
---
 net/xfrm/xfrm_interface.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index a60d391f7ebe..9c5bc8dcf608 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -503,7 +503,7 @@ static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p)
 
 static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p)
 {
-	struct net *net = dev_net(xi->dev);
+	struct net *net = xi->net;
 	struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
 	int err;
 
@@ -663,9 +663,9 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[],
 			   struct nlattr *data[],
 			   struct netlink_ext_ack *extack)
 {
-	struct net *net = dev_net(dev);
+	struct xfrm_if *xi = netdev_priv(dev);
+	struct net *net = xi->net;
 	struct xfrm_if_parms p;
-	struct xfrm_if *xi;
 
 	xfrmi_netlink_parms(data, &p);
 	xi = xfrmi_locate(net, &p);
@@ -707,7 +707,7 @@ static struct net *xfrmi_get_link_net(const struct net_device *dev)
 {
 	struct xfrm_if *xi = netdev_priv(dev);
 
-	return dev_net(xi->phydev);
+	return xi->net;
 }
 
 static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = {
-- 
2.21.0


^ permalink raw reply related

* [PATCH] ipv6: Use ipv6_authlen for len
From: yangxingwu @ 2019-07-10 13:14 UTC (permalink / raw)
  To: davem
  Cc: kuznet, yoshfuji, netdev, linux-kernel, pablo, kadlec, fw,
	netfilter-devel, coreteam, yangxingwu

The length of AH header is computed manually as (hp->hdrlen+2)<<2.
However, in include/linux/ipv6.h, a macro named ipv6_authlen is
already defined for exactly the same job. This commit replaces
the manual computation code with the macro.

Signed-off-by: yangxingwu <xingwu.yang@gmail.com>
---
 net/ipv6/ah6.c                          | 4 ++--
 net/ipv6/exthdrs_core.c                 | 2 +-
 net/ipv6/ip6_tunnel.c                   | 2 +-
 net/ipv6/netfilter/ip6t_ah.c            | 2 +-
 net/ipv6/netfilter/ip6t_ipv6header.c    | 2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +-
 net/ipv6/netfilter/nf_log_ipv6.c        | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 68b9e92..626c64b 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -464,7 +464,7 @@ static void ah6_input_done(struct crypto_async_request *base, int err)
 	struct ah_data *ahp = x->data;
 	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
 	int hdr_len = skb_network_header_len(skb);
-	int ah_hlen = (ah->hdrlen + 2) << 2;
+	int ah_hlen = ipv6_authlen(ah);
 
 	if (err)
 		goto out;
@@ -546,7 +546,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
 	ahash = ahp->ahash;
 
 	nexthdr = ah->nexthdr;
-	ah_hlen = (ah->hdrlen + 2) << 2;
+	ah_hlen = ipv6_authlen(ah);
 
 	if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
 	    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 11a43ee..b358f1a 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -266,7 +266,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
 		} else if (nexthdr == NEXTHDR_AUTH) {
 			if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0))
 				break;
-			hdrlen = (hp->hdrlen + 2) << 2;
+			hdrlen = ipv6_authlen(hp);
 		} else
 			hdrlen = ipv6_optlen(hp);
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index b80fde1..3134fbb 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -416,7 +416,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
 				break;
 			optlen = 8;
 		} else if (nexthdr == NEXTHDR_AUTH) {
-			optlen = (hdr->hdrlen + 2) << 2;
+			optlen = ipv6_authlen(hdr);
 		} else {
 			optlen = ipv6_optlen(hdr);
 		}
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index 0228ff3..4e15a14 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -55,7 +55,7 @@ static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par)
 		return false;
 	}
 
-	hdrlen = (ah->hdrlen + 2) << 2;
+	hdrlen = ipv6_authlen(ah);
 
 	pr_debug("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen);
 	pr_debug("RES %04X ", ah->reserved);
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index fd439f8..0fc6326 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -71,7 +71,7 @@
 		if (nexthdr == NEXTHDR_FRAGMENT)
 			hdrlen = 8;
 		else if (nexthdr == NEXTHDR_AUTH)
-			hdrlen = (hp->hdrlen + 2) << 2;
+			hdrlen = ipv6_authlen(hp);
 		else
 			hdrlen = ipv6_optlen(hp);
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 84322ce..16de015 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -421,7 +421,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
 		if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
 			BUG();
 		if (nexthdr == NEXTHDR_AUTH)
-			hdrlen = (hdr.hdrlen+2)<<2;
+			hdrlen = ipv6_authlen(&hdr);
 		else
 			hdrlen = ipv6_optlen(&hdr);
 
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 549c511..f53bd8f 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -155,7 +155,7 @@ static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
 
 			}
 
-			hdrlen = (hp->hdrlen+2)<<2;
+			hdrlen = ipv6_authlen(hp);
 			break;
 		case IPPROTO_ESP:
 			if (logflags & NF_LOG_IPOPT) {
-- 
1.8.3.1


^ permalink raw reply related

* Re: [rdma 14/16] RDMA/irdma: Add ABI definitions
From: Jason Gunthorpe @ 2019-07-10 13:32 UTC (permalink / raw)
  To: Henry Orosco
  Cc: Saleem, Shiraz, Leon Romanovsky, Kirsher, Jeffrey T,
	dledford@redhat.com, davem@davemloft.net, Ismail, Mustafa,
	linux-rdma@vger.kernel.org, netdev@vger.kernel.org,
	nhorman@redhat.com, sassmann@redhat.com, poswald@suse.com,
	Ertman, David M
In-Reply-To: <20190709205613.GA7440@horosco-MOBL2.amr.corp.intel.com>

On Tue, Jul 09, 2019 at 03:56:13PM -0500, Henry Orosco wrote:
> On Mon, Jul 08, 2019 at 02:13:39PM +0000, Jason Gunthorpe wrote:
> > On Sat, Jul 06, 2019 at 04:15:20PM +0000, Saleem, Shiraz wrote:
> > > > Subject: Re: [rdma 14/16] RDMA/irdma: Add ABI definitions
> > > > 
> > > > On Fri, Jul 05, 2019 at 04:42:19PM +0000, Saleem, Shiraz wrote:
> > > > > > Subject: Re: [rdma 14/16] RDMA/irdma: Add ABI definitions
> > > > > >
> > > > > > On Thu, Jul 04, 2019 at 10:40:21AM +0300, Leon Romanovsky wrote:
> > > > > > > On Wed, Jul 03, 2019 at 07:12:57PM -0700, Jeff Kirsher wrote:
> > > > > > > > From: Mustafa Ismail <mustafa.ismail@intel.com>
> > > > > > > >
> > > > > > > > Add ABI definitions for irdma.
> > > > > > > >
> > > > > > > > Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
> > > > > > > > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> > > > > > > > include/uapi/rdma/irdma-abi.h | 130
> > > > > > > > ++++++++++++++++++++++++++++++++++
> > > > > > > >  1 file changed, 130 insertions(+)  create mode 100644
> > > > > > > > include/uapi/rdma/irdma-abi.h
> > > > > > > >
> > > > > > > > diff --git a/include/uapi/rdma/irdma-abi.h
> > > > > > > > b/include/uapi/rdma/irdma-abi.h new file mode 100644 index
> > > > > > > > 000000000000..bdfbda4c829e
> > > > > > > > +++ b/include/uapi/rdma/irdma-abi.h
> > > > > > > > @@ -0,0 +1,130 @@
> > > > > > > > +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
> > > > > > > > +/* Copyright (c) 2006 - 2019 Intel Corporation.  All rights reserved.
> > > > > > > > + * Copyright (c) 2005 Topspin Communications.  All rights reserved.
> > > > > > > > + * Copyright (c) 2005 Cisco Systems.  All rights reserved.
> > > > > > > > + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
> > > > > > > > + */
> > > > > > > > +
> > > > > > > > +#ifndef IRDMA_ABI_H
> > > > > > > > +#define IRDMA_ABI_H
> > > > > > > > +
> > > > > > > > +#include <linux/types.h>
> > > > > > > > +
> > > > > > > > +/* irdma must support legacy GEN_1 i40iw kernel
> > > > > > > > + * and user-space whose last ABI ver is 5  */ #define
> > > > > > > > +IRDMA_ABI_VER
> > > > > > > > +6
> > > > > > >
> > > > > > > Can you please elaborate about it more?
> > > > > > > There is no irdma code in RDMA yet, so it makes me wonder why new
> > > > > > > define shouldn't start from 1.
> > > > > >
> > > > > > It is because they are ABI compatible with the current user space,
> > > > > > which raises the question why we even have this confusing header file..
> > > > >
> > > > > It is because we need to support current providers/i40iw user-space.
> > > > > Our user-space patch series will introduce a new provider (irdma)
> > > > > whose ABI ver. is also 6 (capable of supporting X722 and which will
> > > > > work with i40iw driver on older kernels) and removes providers/i40iw from rdma-
> > > > core.
> > > > 
> > > > Why on earth would we do that?
> > > > 
> > > A unified library providers/irdma to go in hand with the driver irdma and uses the ABI header.
> > > It can support the new network device e810 and existing x722 iWARP device. It obsoletes
> > > providers/i40iw and extends its ABI. So why keep providers/i40iw around in rdma-core?
> > 
> > Why rewrite a perfectly good userspace that is compatible with the
> > future and past kernels?
> > 
> > Is there something so wrong with the userspace provider to need this?
> >
> 
> Yes, the issue is that providers/i40iw was never designed to work with a unified driver
> which supports multiple hardware generations.

But Shiraz said it works fine with the new kernel driver.. So what is
actually the problem?

Jason

^ permalink raw reply

* [PATCH] libertas: Add missing sentinel at end of if_usb.c fw_table
From: Kevin Easton @ 2019-07-10 13:31 UTC (permalink / raw)
  To: linux-wireless
  Cc: andreyknvl, davem, kvalo, libertas-dev, linux-kernel, syzbot,
	netdev, syzkaller-bugs

This sentinel tells the firmware loading process when to stop.

Reported-and-tested-by: syzbot+98156c174c5a2cad9f8f@syzkaller.appspotmail.com
Signed-off-by: Kevin Easton <kevin@guarana.org>
---
 drivers/net/wireless/marvell/libertas/if_usb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/marvell/libertas/if_usb.c b/drivers/net/wireless/marvell/libertas/if_usb.c
index f1622f0ff8c9..fe3142d85d1e 100644
--- a/drivers/net/wireless/marvell/libertas/if_usb.c
+++ b/drivers/net/wireless/marvell/libertas/if_usb.c
@@ -50,7 +50,8 @@ static const struct lbs_fw_table fw_table[] = {
 	{ MODEL_8388, "libertas/usb8388_v5.bin", NULL },
 	{ MODEL_8388, "libertas/usb8388.bin", NULL },
 	{ MODEL_8388, "usb8388.bin", NULL },
-	{ MODEL_8682, "libertas/usb8682.bin", NULL }
+	{ MODEL_8682, "libertas/usb8682.bin", NULL },
+	{ 0, NULL, NULL }
 };
 
 static const struct usb_device_id if_usb_table[] = {
-- 
2.11.0
 

^ permalink raw reply related

* [PATCH net] ipv6: tcp: fix flowlabels reflection for RST packets
From: Eric Dumazet @ 2019-07-10 13:40 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet, Marek Majkowski

In 323a53c41292 ("ipv6: tcp: enable flowlabel reflection in some RST packets")
and 50a8accf1062 ("ipv6: tcp: send consistent flowlabel in TIME_WAIT state")
we took care of IPv6 flowlabel reflections for two cases.

This patch takes care of the remaining case, when the RST packet
is sent on behalf of a 'full' socket.

In Marek use case, this was a socket in TCP_CLOSE state.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Marek Majkowski <marek@cloudflare.com>
Tested-by: Marek Majkowski <marek@cloudflare.com>
---
 net/ipv6/tcp_ipv6.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d56a9019a0feb5a34312ec353c555f44b8c09b3d..5da069e91cacca4e84a3e41dae4746c9d38fcc46 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -984,8 +984,13 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)

 	if (sk) {
 		oif = sk->sk_bound_dev_if;
-		if (sk_fullsock(sk))
+		if (sk_fullsock(sk)) {
+			const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
+
 			trace_tcp_send_reset(sk, skb);
+			if (np->repflow)
+				label = ip6_flowlabel(ipv6h);
+		}
 		if (sk->sk_state == TCP_TIME_WAIT)
 			label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
 	} else {
-- 
2.22.0.410.gd8fdbe21b5-goog

^ permalink raw reply related

* [PATCH net] ipv6: fix potential crash in ip6_datagram_dst_update()
From: Eric Dumazet @ 2019-07-10 13:40 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Eric Dumazet, Eric Dumazet, Willem de Bruijn, syzbot
In-Reply-To: <20190710134011.221210-1-edumazet@google.com>

Willem forgot to change one of the calls to fl6_sock_lookup(),
which can now return an error or NULL.

syzbot reported :

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 31763 Comm: syz-executor.0 Not tainted 5.2.0-rc6+ #63
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:ip6_datagram_dst_update+0x559/0xc30 net/ipv6/datagram.c:83
Code: 00 00 e8 ea 29 3f fb 4d 85 f6 0f 84 96 04 00 00 e8 dc 29 3f fb 49 8d 7e 20 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 16 06 00 00 4d 8b 6e 20 e8 b4 29 3f fb 4c 89 ee
RSP: 0018:ffff88809ba97ae0 EFLAGS: 00010207
RAX: dffffc0000000000 RBX: ffff8880a81254b0 RCX: ffffc90008118000
RDX: 0000000000000003 RSI: ffffffff86319a84 RDI: 000000000000001e
RBP: ffff88809ba97c10 R08: ffff888065e9e700 R09: ffffed1015d26c80
R10: ffffed1015d26c7f R11: ffff8880ae9363fb R12: ffff8880a8124f40
R13: 0000000000000001 R14: fffffffffffffffe R15: ffff88809ba97b40
FS:  00007f38e606a700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000202c0140 CR3: 00000000a026a000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 __ip6_datagram_connect+0x5e9/0x1390 net/ipv6/datagram.c:246
 ip6_datagram_connect+0x30/0x50 net/ipv6/datagram.c:269
 ip6_datagram_connect_v6_only+0x69/0x90 net/ipv6/datagram.c:281
 inet_dgram_connect+0x14a/0x2d0 net/ipv4/af_inet.c:571
 __sys_connect+0x264/0x330 net/socket.c:1824
 __do_sys_connect net/socket.c:1835 [inline]
 __se_sys_connect net/socket.c:1832 [inline]
 __x64_sys_connect+0x73/0xb0 net/socket.c:1832
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4597c9
Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f38e6069c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002a
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004597c9
RDX: 000000000000001c RSI: 0000000020000040 RDI: 0000000000000003
RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f38e606a6d4
R13: 00000000004bfd07 R14: 00000000004d1838 R15: 00000000ffffffff
Modules linked in:
RIP: 0010:ip6_datagram_dst_update+0x559/0xc30 net/ipv6/datagram.c:83
Code: 00 00 e8 ea 29 3f fb 4d 85 f6 0f 84 96 04 00 00 e8 dc 29 3f fb 49 8d 7e 20 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 16 06 00 00 4d 8b 6e 20 e8 b4 29 3f fb 4c 89 ee

Fixes: 59c820b2317f ("ipv6: elide flowlabel check if no exclusive leases exist")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
 net/ipv6/datagram.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 9d78c907b918a98cbb9e80154a038e31b6bddd11..9ab897ded4df52d882cda1414ef0159f3eb1765a 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -74,7 +74,7 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
 
 	if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) {
 		flowlabel = fl6_sock_lookup(sk, np->flow_label);
-		if (!flowlabel)
+		if (IS_ERR(flowlabel))
 			return -EINVAL;
 	}
 	ip6_datagram_flow_key_init(&fl6, sk);
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related

* [PATCH net] ipv6: fix static key imbalance in fl_create()
From: Eric Dumazet @ 2019-07-10 13:40 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Eric Dumazet, Eric Dumazet, Willem de Bruijn, syzbot
In-Reply-To: <20190710134011.221210-1-edumazet@google.com>

fl_create() should call static_branch_deferred_inc() only in
case of success.

Also we should not call fl_free() in error path, as this could
cause a static key imbalance.

jump label: negative count!
WARNING: CPU: 0 PID: 15907 at kernel/jump_label.c:221 static_key_slow_try_dec kernel/jump_label.c:221 [inline]
WARNING: CPU: 0 PID: 15907 at kernel/jump_label.c:221 static_key_slow_try_dec+0x1ab/0x1d0 kernel/jump_label.c:206
Kernel panic - not syncing: panic_on_warn set ...
CPU: 0 PID: 15907 Comm: syz-executor.2 Not tainted 5.2.0-rc6+ #62
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 panic+0x2cb/0x744 kernel/panic.c:219
 __warn.cold+0x20/0x4d kernel/panic.c:576
 report_bug+0x263/0x2b0 lib/bug.c:186
 fixup_bug arch/x86/kernel/traps.c:179 [inline]
 fixup_bug arch/x86/kernel/traps.c:174 [inline]
 do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:272
 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:291
 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:986
RIP: 0010:static_key_slow_try_dec kernel/jump_label.c:221 [inline]
RIP: 0010:static_key_slow_try_dec+0x1ab/0x1d0 kernel/jump_label.c:206
Code: c0 e8 e9 3e e5 ff 83 fb 01 0f 85 32 ff ff ff e8 5b 3d e5 ff 45 31 ff eb a0 e8 51 3d e5 ff 48 c7 c7 40 99 92 87 e8 13 75 b7 ff <0f> 0b eb 8b 4c 89 e7 e8 a9 c0 1e 00 e9 de fe ff ff e8 bf 6d b7 ff
RSP: 0018:ffff88805f9c7450 EFLAGS: 00010286
RAX: 0000000000000000 RBX: 00000000ffffffff RCX: 0000000000000000
RDX: 000000000000e3e1 RSI: ffffffff815adb06 RDI: ffffed100bf38e7c
RBP: ffff88805f9c74e0 R08: ffff88806acf0700 R09: ffffed1015d060a9
R10: ffffed1015d060a8 R11: ffff8880ae830547 R12: ffffffff89832ce0
R13: ffff88805f9c74b8 R14: 1ffff1100bf38e8b R15: 00000000ffffff01
 __static_key_slow_dec_deferred+0x65/0x110 kernel/jump_label.c:272
 fl_free+0xa9/0xe0 net/ipv6/ip6_flowlabel.c:121
 fl_create+0x6af/0x9f0 net/ipv6/ip6_flowlabel.c:457
 ipv6_flowlabel_opt+0x80e/0x2730 net/ipv6/ip6_flowlabel.c:624
 do_ipv6_setsockopt.isra.0+0x2119/0x4100 net/ipv6/ipv6_sockglue.c:825
 ipv6_setsockopt+0xf6/0x170 net/ipv6/ipv6_sockglue.c:944
 tcp_setsockopt net/ipv4/tcp.c:3131 [inline]
 tcp_setsockopt+0x8f/0xe0 net/ipv4/tcp.c:3125
 sock_common_setsockopt+0x94/0xd0 net/core/sock.c:3130
 __sys_setsockopt+0x253/0x4b0 net/socket.c:2080
 __do_sys_setsockopt net/socket.c:2096 [inline]
 __se_sys_setsockopt net/socket.c:2093 [inline]
 __x64_sys_setsockopt+0xbe/0x150 net/socket.c:2093
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4597c9
Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f2670556c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00000000004597c9
RDX: 0000000000000020 RSI: 0000000000000029 RDI: 0000000000000003
RBP: 000000000075bfc8 R08: 000000000000fdf7 R09: 0000000000000000
R10: 0000000020000000 R11: 0000000000000246 R12: 00007f26705576d4
R13: 00000000004cec00 R14: 00000000004dd520 R15: 00000000ffffffff
Kernel Offset: disabled
Rebooting in 86400 seconds..

Fixes: 59c820b2317f ("ipv6: elide flowlabel check if no exclusive leases exist")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
 net/ipv6/ip6_flowlabel.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index ad284b1fd308a646f27f715f35d9759fd50c5902..d64b83e856428195c1ecc963a263155c8b4528d0 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -435,8 +435,6 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
 	}
 	fl->dst = freq->flr_dst;
 	atomic_set(&fl->users, 1);
-	if (fl_shared_exclusive(fl) || fl->opt)
-		static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
 	switch (fl->share) {
 	case IPV6_FL_S_EXCL:
 	case IPV6_FL_S_ANY:
@@ -451,10 +449,15 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
 		err = -EINVAL;
 		goto done;
 	}
+	if (fl_shared_exclusive(fl) || fl->opt)
+		static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
 	return fl;
 
 done:
-	fl_free(fl);
+	if (fl) {
+		kfree(fl->opt);
+		kfree(fl);
+	}
 	*err_p = err;
 	return NULL;
 }
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related

* [PATCH net-next] net/sched: Fix kernel NULL pointer dereference
From: wenxu @ 2019-07-10 13:45 UTC (permalink / raw)
  To: pablo, davem; +Cc: netfilter-devel, netdev

From: wenxu <wenxu@ucloud.cn>

[  697.665184] BUG: kernel NULL pointer dereference, address: 0000000000000030
[  697.665550] #PF: supervisor read access in kernel mode
[  697.665906] #PF: error_code(0x0000) - not-present page
[  697.666297] PGD 800000104e636067 P4D 800000104e636067 PUD ff4b02067 PMD 0
[  697.666710] Oops: 0000 [#1] SMP PTI
[  697.667115] CPU: 31 PID: 24466 Comm: modprobe Kdump: loaded Tainted: G           O      5.2.0-rc6+ #1
[  697.667867] Hardware name: Huawei Technologies Co., Ltd. RH1288 V3/BC11HGSC0, BIOS 3.57 02/26/2017
[  697.668620] RIP: 0010:tc_indr_block_ing_cmd.isra.52+0x4c/0xb0
[  697.669029] Code: 83 ec 40 65 48 8b 04 25 28 00 00 00 48 89 45 e8 31 c0 f3 48 ab 48 8b 06 49 8b b3 e8 04 00 00 44 89 45 b0 c7 45 b4 01 00 00 00 <8b> 48 30 48 89 75 c0 85 c9 48 8d 4d b0 0f 95 45 b8 48 85 c0 4c 8d
[  697.670132] RSP: 0018:ffffc90007bf7958 EFLAGS: 00010246
[  697.670537] RAX: 0000000000000000 RBX: ffff88905e2cbae8 RCX: 0000000000000000
[  697.670938] RDX: ffff88905e2cbcd8 RSI: ffffffff823a8480 RDI: ffffc90007bf7990
[  697.671352] RBP: ffffc90007bf79a8 R08: 0000000000000000 R09: ffff88905e2cbcc0
[  697.671761] R10: ffff888107c07780 R11: ffff88902c249000 R12: ffff88905e2cbcd0
[  697.672173] R13: ffff88905e2cbac0 R14: ffff88885596bc00 R15: ffff88905e2cbcc0
[  697.672582] FS:  00007fe0b4095740(0000) GS:ffff88905fbc0000(0000) knlGS:0000000000000000
[  697.673335] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  697.673746] CR2: 0000000000000030 CR3: 0000000ff46b4005 CR4: 00000000001606e0
[  697.674156] Call Trace:
[  697.674563]  __tc_indr_block_cb_register+0x11e/0x3c0
[  697.674998]  mlx5e_nic_rep_netdevice_event+0x9e/0x110 [mlx5_core]
[  697.675411]  notifier_call_chain+0x53/0xa0
[  697.675812]  raw_notifier_call_chain+0x16/0x20
[  697.676223]  call_netdevice_notifiers_info+0x2d/0x60
[  697.676633]  register_netdevice+0x3fa/0x500

get indr_dev->block after check it.

Fixes: 955bcb6ea0df ("drivers: net: use flow block API")
Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 net/sched/cls_api.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 638c1bc..be899f7 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -684,13 +684,14 @@ static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
 		.command	= command,
 		.binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
 		.net		= dev_net(indr_dev->dev),
-		.block_shared	= tcf_block_shared(indr_dev->block),
 	};
 	INIT_LIST_HEAD(&bo.cb_list);
 
 	if (!indr_dev->block)
 		return;
 
+	bo.block_shared	= tcf_block_shared(indr_dev->block);
+
 	indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
 			  &bo);
 	tcf_block_setup(indr_dev->block, &bo);
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net] net: fix use-after-free in __netif_receive_skb_core
From: Sabrina Dubroca @ 2019-07-10 13:52 UTC (permalink / raw)
  To: netdev; +Cc: Sabrina Dubroca, Edward Cree, Andreas Steinmetz

When __netif_receive_skb_core handles a shared skb, it can be
reallocated in a few different places:
 - the device's rx_handler
 - vlan_do_receive
 - skb_vlan_untag

To deal with that, rx_handlers and vlan_do_receive get passed a
reference to the skb, and skb_vlan_untag just returns the new
skb. This was not a problem until commit 88eb1944e18c ("net: core:
propagate SKB lists through packet_type lookup"), which moved the
final handling of the skb via pt_prev out of
__netif_receive_skb_core. After this commit, when the skb is
reallocated by __netif_receive_skb_core, KASAN reports a
use-after-free on the old skb:

BUG: KASAN: use-after-free in __netif_receive_skb_one_core+0x15c/0x180
Call Trace:
 <IRQ>
 __netif_receive_skb_one_core+0x15c/0x180
 process_backlog+0x1b5/0x630
 ? net_rx_action+0x247/0xd00
 net_rx_action+0x3fa/0xd00
 ? napi_complete_done+0x360/0x360
 __do_softirq+0x257/0xa0b
 do_softirq_own_stack+0x2a/0x40
 </IRQ>
 ? __dev_queue_xmit+0x12ba/0x3120
 do_softirq+0x5d/0x60
 [...]

Allocated by task 505:
 __kasan_kmalloc.constprop.0+0xd6/0x140
 kmem_cache_alloc+0xd4/0x2e0
 skb_clone+0x106/0x300
 deliver_clone+0x3f/0xa0
 maybe_deliver+0x1c0/0x2b0
 br_flood+0xd4/0x320
 br_dev_xmit+0xbc0/0x1080
 dev_hard_start_xmit+0x139/0x750
 __dev_queue_xmit+0x24eb/0x3120
 packet_sendmsg+0x1bfa/0x50e0
 [...]

Freed by task 505:
 __kasan_slab_free+0x138/0x1e0
 kmem_cache_free+0xa2/0x2e0
 macsec_handle_frame+0xa24/0x2e60
 __netif_receive_skb_core+0xe2a/0x2c90
 __netif_receive_skb_one_core+0x96/0x180
 process_backlog+0x1b5/0x630
 net_rx_action+0x3fa/0xd00
 __do_softirq+0x257/0xa0b

The solution is to pass a reference to the skb to
__netif_receive_skb_core, as we already do with the rx_handlers, so
that its callers use the new skb.

Fixes: 88eb1944e18c ("net: core: propagate SKB lists through packet_type lookup")
Reported-by: Andreas Steinmetz <ast@domdv.de>
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
---
 net/core/dev.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index d6edd218babd..0bbf6d2a9c32 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4809,11 +4809,12 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 	return 0;
 }
 
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 				    struct packet_type **ppt_prev)
 {
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
+	struct sk_buff *skb = *pskb;
 	struct net_device *orig_dev;
 	bool deliver_exact = false;
 	int ret = NET_RX_DROP;
@@ -4852,6 +4853,7 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 		skb = skb_vlan_untag(skb);
+		*pskb = skb;
 		if (unlikely(!skb))
 			goto out;
 	}
@@ -4878,6 +4880,7 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
 #ifdef CONFIG_NET_INGRESS
 	if (static_branch_unlikely(&ingress_needed_key)) {
 		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+		*pskb = skb;
 		if (!skb)
 			goto out;
 
@@ -4891,11 +4894,14 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
 		goto drop;
 
 	if (skb_vlan_tag_present(skb)) {
+		bool ret2;
 		if (pt_prev) {
 			ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = NULL;
 		}
-		if (vlan_do_receive(&skb))
+		ret2 = vlan_do_receive(pskb);
+		skb = *pskb;
+		if (ret2)
 			goto another_round;
 		else if (unlikely(!skb))
 			goto out;
@@ -4903,11 +4909,14 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
 
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (rx_handler) {
+		rx_handler_result_t res;
 		if (pt_prev) {
 			ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = NULL;
 		}
-		switch (rx_handler(&skb)) {
+		res = rx_handler(pskb);
+		skb = *pskb;
+		switch (res) {
 		case RX_HANDLER_CONSUMED:
 			ret = NET_RX_SUCCESS;
 			goto out;
@@ -4931,15 +4940,20 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
 			skb->pkt_type = PACKET_OTHERHOST;
 		} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 			   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+			bool ret2;
+
 			/* Outer header is 802.1P with vlan 0, inner header is
 			 * 802.1Q or 802.1AD and vlan_do_receive() above could
 			 * not find vlan dev for vlan id 0.
 			 */
 			__vlan_hwaccel_clear_tag(skb);
 			skb = skb_vlan_untag(skb);
+			*pskb = skb;
 			if (unlikely(!skb))
 				goto out;
-			if (vlan_do_receive(&skb))
+			ret2 = vlan_do_receive(pskb);
+			skb = *pskb;
+			if (ret2)
 				/* After stripping off 802.1P header with vlan 0
 				 * vlan dev is found for inner header.
 				 */
@@ -5004,7 +5018,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 	struct packet_type *pt_prev = NULL;
 	int ret;
 
-	ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 	if (pt_prev)
 		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 					 skb->dev, pt_prev, orig_dev);
@@ -5082,7 +5096,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
 		struct packet_type *pt_prev = NULL;
 
 		skb_list_del_init(skb);
-		__netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 		if (!pt_prev)
 			continue;
 		if (pt_curr != pt_prev || od_curr != orig_dev) {
-- 
2.22.0


^ permalink raw reply related

* Re: [PATCH net-next iproute2 v2 2/2] devlink: Introduce PCI PF and VF port flavour and attribute
From: Jiri Pirko @ 2019-07-10 13:57 UTC (permalink / raw)
  To: Parav Pandit; +Cc: netdev, stephen, dsahern, jiri
In-Reply-To: <20190710123952.6877-2-parav@mellanox.com>

Wed, Jul 10, 2019 at 02:39:52PM CEST, parav@mellanox.com wrote:
>Introduce PCI PF and VF port flavour and port attributes such as PF
>number and VF number.
>
>$ devlink port show
>pci/0000:05:00.0/0: type eth netdev eth0 flavour pcipf pfnum 0
>pci/0000:05:00.0/1: type eth netdev eth1 flavour pcivf pfnum 0 vfnum 0
>pci/0000:05:00.0/2: type eth netdev eth2 flavour pcivf pfnum 0 vfnum 1
>
>Signed-off-by: Parav Pandit <parav@mellanox.com>

Acked-by: Jiri Pirko <jiri@mellanox.com>

^ permalink raw reply

* [PATCH RFC 0/4] Add support to directly attach BPF program to ftrace
From: Joel Fernandes (Google) @ 2019-07-10 14:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joel Fernandes (Google), Adrian Ratiu, Alexei Starovoitov, bpf,
	Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
	jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
	Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
	Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
	Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
	primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
	Steven Rostedt, Tamir Carmeli, Yonghong Song

Hi,
These patches make it possible to attach BPF programs directly to tracepoints
using ftrace (/sys/kernel/debug/tracing) without needing the process doing the
attach to be alive. This has the following benefits:

1. Simplified Security: In Android, we have finer-grained security controls to
specific ftrace trace events using SELinux labels. We control precisely who is
allowed to enable an ftrace event already. By adding a node to ftrace for
attaching BPF programs, we can use the same mechanism to further control who is
allowed to attach to a trace event.

2. Process lifetime: In Android we are adding usecases where a tracing program
needs to be attached all the time to a tracepoint, for the full life time of
the system. Such as to gather statistics where there no need for a detach for
the full system lifetime. With perf or bpf(2)'s BPF_RAW_TRACEPOINT_OPEN, this
means keeping a process alive all the time.  However, in Android our BPF loader
currently (for hardeneded security) involves just starting a process at boot
time, doing the BPF program loading, and then pinning them to /sys/fs/bpf.  We
don't keep this process alive all the time. It is more suitable to do a
one-shot attach of the program using ftrace and not need to have a process
alive all the time anymore for this. Such process also needs elevated
privileges since tracepoint program loading currently requires CAP_SYS_ADMIN
anyway so by design Android's bpfloader runs once at init and exits.

This series add a new bpf file to /sys/kernel/debug/tracing/events/X/Y/bpf
The following commands can be written into it:
attach:<fd>     Attaches BPF prog fd to tracepoint
detach:<fd>     Detaches BPF prog fd to tracepoint

Reading the bpf file will show all the attached programs to the tracepoint.

Joel Fernandes (Google) (4):
Move bpf_raw_tracepoint functionality into bpf_trace.c
trace/bpf: Add support for attach/detach of ftrace events to BPF
lib/bpf: Add support for ftrace event attach and detach
selftests/bpf: Add test for ftrace-based BPF attach/detach

include/linux/bpf_trace.h                     |  16 ++
include/linux/trace_events.h                  |   1 +
kernel/bpf/syscall.c                          |  69 +-----
kernel/trace/bpf_trace.c                      | 225 ++++++++++++++++++
kernel/trace/trace.h                          |   1 +
kernel/trace/trace_events.c                   |   8 +
tools/lib/bpf/bpf.c                           |  53 +++++
tools/lib/bpf/bpf.h                           |   4 +
tools/lib/bpf/libbpf.map                      |   2 +
.../raw_tp_writable_test_ftrace_run.c         |  89 +++++++
10 files changed, 410 insertions(+), 58 deletions(-)
create mode 100644 tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c

--
2.22.0.410.gd8fdbe21b5-goog

^ permalink raw reply

* [PATCH RFC 1/4] Move bpf_raw_tracepoint functionality into bpf_trace.c
From: Joel Fernandes (Google) @ 2019-07-10 14:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joel Fernandes (Google), Adrian Ratiu, Alexei Starovoitov, bpf,
	Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
	jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
	Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
	Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
	Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
	primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
	Steven Rostedt, Tamir Carmeli, Yonghong Song
In-Reply-To: <20190710141548.132193-1-joel@joelfernandes.org>

In preparation to use raw tracepoints for BPF directly from ftrace, move
the bpf_raw_tracepoint functionality into bpf_trace.c

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 include/linux/bpf_trace.h   | 10 ++++++
 kernel/bpf/syscall.c        | 69 ++++++-------------------------------
 kernel/trace/bpf_trace.c    | 56 ++++++++++++++++++++++++++++++
 kernel/trace/trace_events.c |  3 ++
 4 files changed, 80 insertions(+), 58 deletions(-)

diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h
index ddf896abcfb6..4a593827fd87 100644
--- a/include/linux/bpf_trace.h
+++ b/include/linux/bpf_trace.h
@@ -4,4 +4,14 @@
 
 #include <trace/events/xdp.h>
 
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+
+struct bpf_raw_tracepoint {
+	struct bpf_raw_event_map *btp;
+	struct bpf_prog *prog;
+};
+
+struct bpf_raw_tracepoint *bpf_raw_tracepoint_open(char *tp_name, int prog_fd);
+void bpf_raw_tracepoint_close(struct bpf_raw_tracepoint *tp);
+
 #endif /* __LINUX_BPF_TRACE_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 42d17f730780..2001949b33f1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1737,21 +1737,11 @@ static int bpf_obj_get(const union bpf_attr *attr)
 				attr->file_flags);
 }
 
-struct bpf_raw_tracepoint {
-	struct bpf_raw_event_map *btp;
-	struct bpf_prog *prog;
-};
-
 static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp)
 {
 	struct bpf_raw_tracepoint *raw_tp = filp->private_data;
 
-	if (raw_tp->prog) {
-		bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
-		bpf_prog_put(raw_tp->prog);
-	}
-	bpf_put_raw_tracepoint(raw_tp->btp);
-	kfree(raw_tp);
+	bpf_raw_tracepoint_close(raw_tp);
 	return 0;
 }
 
@@ -1761,64 +1751,27 @@ static const struct file_operations bpf_raw_tp_fops = {
 	.write		= bpf_dummy_write,
 };
 
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
-
-static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+static int bpf_raw_tracepoint_open_syscall(const union bpf_attr *attr)
 {
-	struct bpf_raw_tracepoint *raw_tp;
-	struct bpf_raw_event_map *btp;
-	struct bpf_prog *prog;
+	int tp_fd;
 	char tp_name[128];
-	int tp_fd, err;
+	struct bpf_raw_tracepoint *raw_tp;
 
 	if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
 			      sizeof(tp_name) - 1) < 0)
 		return -EFAULT;
 	tp_name[sizeof(tp_name) - 1] = 0;
 
-	btp = bpf_get_raw_tracepoint(tp_name);
-	if (!btp)
-		return -ENOENT;
-
-	raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
-	if (!raw_tp) {
-		err = -ENOMEM;
-		goto out_put_btp;
-	}
-	raw_tp->btp = btp;
-
-	prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
-	if (IS_ERR(prog)) {
-		err = PTR_ERR(prog);
-		goto out_free_tp;
-	}
-	if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
-	    prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
-		err = -EINVAL;
-		goto out_put_prog;
-	}
-
-	err = bpf_probe_register(raw_tp->btp, prog);
-	if (err)
-		goto out_put_prog;
+	raw_tp = bpf_raw_tracepoint_open(tp_name, attr->raw_tracepoint.prog_fd);
+	if (IS_ERR(raw_tp))
+		return PTR_ERR(raw_tp);
 
-	raw_tp->prog = prog;
 	tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp,
 				 O_CLOEXEC);
-	if (tp_fd < 0) {
-		bpf_probe_unregister(raw_tp->btp, prog);
-		err = tp_fd;
-		goto out_put_prog;
-	}
-	return tp_fd;
+	if (tp_fd < 0)
+		bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
 
-out_put_prog:
-	bpf_prog_put(prog);
-out_free_tp:
-	kfree(raw_tp);
-out_put_btp:
-	bpf_put_raw_tracepoint(btp);
-	return err;
+	return tp_fd;
 }
 
 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
@@ -2848,7 +2801,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 		err = bpf_obj_get_info_by_fd(&attr, uattr);
 		break;
 	case BPF_RAW_TRACEPOINT_OPEN:
-		err = bpf_raw_tracepoint_open(&attr);
+		err = bpf_raw_tracepoint_open_syscall(&attr);
 		break;
 	case BPF_BTF_LOAD:
 		err = bpf_btf_load(&attr);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1c9a4745e596..c4b543bc617f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -7,6 +7,7 @@
 #include <linux/slab.h>
 #include <linux/bpf.h>
 #include <linux/bpf_perf_event.h>
+#include <linux/bpf_trace.h>
 #include <linux/filter.h>
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
@@ -1413,3 +1414,58 @@ static int __init bpf_event_init(void)
 
 fs_initcall(bpf_event_init);
 #endif /* CONFIG_MODULES */
+
+void bpf_raw_tracepoint_close(struct bpf_raw_tracepoint *raw_tp)
+{
+	if (raw_tp->prog) {
+		bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
+		bpf_prog_put(raw_tp->prog);
+	}
+	bpf_put_raw_tracepoint(raw_tp->btp);
+	kfree(raw_tp);
+}
+
+struct bpf_raw_tracepoint *bpf_raw_tracepoint_open(char *tp_name, int prog_fd)
+{
+	struct bpf_raw_tracepoint *raw_tp;
+	struct bpf_raw_event_map *btp;
+	struct bpf_prog *prog;
+	int err;
+
+	btp = bpf_get_raw_tracepoint(tp_name);
+	if (!btp)
+		return ERR_PTR(-ENOENT);
+
+	raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
+	if (!raw_tp) {
+		err = -ENOMEM;
+		goto out_put_btp;
+	}
+	raw_tp->btp = btp;
+
+	prog = bpf_prog_get(prog_fd);
+	if (IS_ERR(prog)) {
+		err = PTR_ERR(prog);
+		goto out_free_tp;
+	}
+	if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
+	    prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
+		err = -EINVAL;
+		goto out_put_prog;
+	}
+
+	err = bpf_probe_register(raw_tp->btp, prog);
+	if (err)
+		goto out_put_prog;
+
+	raw_tp->prog = prog;
+	return raw_tp;
+
+out_put_prog:
+	bpf_prog_put(prog);
+out_free_tp:
+	kfree(raw_tp);
+out_put_btp:
+	bpf_put_raw_tracepoint(btp);
+	return ERR_PTR(err);
+}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0ce3db67f556..67851fb66b6b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2017,6 +2017,9 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 
 		trace_create_file("trigger", 0644, file->dir, file,
 				  &event_trigger_fops);
+
+		trace_create_file("bpf_attach", 0644, file->dir, file,
+				  &bpf_attach_trigger_fops);
 	}
 
 #ifdef CONFIG_HIST_TRIGGERS
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related

* [PATCH RFC 2/4] trace/bpf: Add support for attach/detach of ftrace events to BPF
From: Joel Fernandes (Google) @ 2019-07-10 14:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joel Fernandes (Google), Adrian Ratiu, Alexei Starovoitov, bpf,
	Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
	jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
	Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
	Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
	Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
	primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
	Steven Rostedt, Tamir Carmeli, Yonghong Song
In-Reply-To: <20190710141548.132193-1-joel@joelfernandes.org>

Add a new bpf file to each trace event. The following commands can be
written into it:
attach:<fd>	Attaches BPF prog fd to tracepoint
detach:<fd>	Detaches BPF prog fd to tracepoint

Reading the bpf file will show all the attached programs to the
tracepoint.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 include/linux/bpf_trace.h    |   6 ++
 include/linux/trace_events.h |   1 +
 kernel/trace/bpf_trace.c     | 169 +++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h         |   1 +
 kernel/trace/trace_events.c  |   9 +-
 5 files changed, 184 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h
index 4a593827fd87..1fe73501809c 100644
--- a/include/linux/bpf_trace.h
+++ b/include/linux/bpf_trace.h
@@ -9,6 +9,12 @@
 struct bpf_raw_tracepoint {
 	struct bpf_raw_event_map *btp;
 	struct bpf_prog *prog;
+	/*
+	 * Multiple programs can be attached to a tracepoint,
+	 * All of these are linked to each other and can be reached
+	 * from the event's bpf_attach file in tracefs.
+	 */
+	struct list_head event_attached;
 };
 
 struct bpf_raw_tracepoint *bpf_raw_tracepoint_open(char *tp_name, int prog_fd);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8a62731673f7..525f2ac44aa3 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -371,6 +371,7 @@ struct trace_event_file {
 	struct trace_array		*tr;
 	struct trace_subsystem_dir	*system;
 	struct list_head		triggers;
+	struct list_head		bpf_attached;
 
 	/*
 	 * 32 bit flags:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c4b543bc617f..28621ad88c12 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1469,3 +1469,172 @@ struct bpf_raw_tracepoint *bpf_raw_tracepoint_open(char *tp_name, int prog_fd)
 	bpf_put_raw_tracepoint(btp);
 	return ERR_PTR(err);
 }
+
+enum event_bpf_cmd { BPF_ATTACH, BPF_DETACH };
+#define BPF_CMD_BUF_LEN 32
+
+static ssize_t
+event_bpf_attach_write(struct file *filp, const char __user *ubuf,
+		    size_t cnt, loff_t *ppos)
+{
+	int err, prog_fd, cmd_num, len;
+	struct trace_event_call *call;
+	struct trace_event_file *file;
+	struct bpf_raw_tracepoint *raw_tp, *next;
+	char buf[BPF_CMD_BUF_LEN], *end, *tok;
+	enum event_bpf_cmd cmd;
+	struct bpf_prog *prog;
+	bool prog_put = true;
+
+	len = min((int)cnt, BPF_CMD_BUF_LEN - 1);
+
+	err = copy_from_user(buf, ubuf, len);
+	if (err)
+		return err;
+	buf[len] = 0;
+
+	/* Parse 2 arguments of format: <cmd>:<fd> */
+	end = &buf[0];
+	cmd_num = 1;
+	while (cmd_num < 3) {
+		tok = strsep(&end, ":");
+		if (!tok)
+			return -EINVAL;
+
+		switch (cmd_num) {
+		case 1:
+			if (!strncmp(tok, "attach", 6))
+				cmd = BPF_ATTACH;
+			else if (!strncmp(tok, "detach", 6))
+				cmd = BPF_DETACH;
+			else
+				return -EINVAL;
+			break;
+		case 2:
+			err = kstrtoint(tok, 10, &prog_fd);
+			if (err)
+				return err;
+			break;
+		}
+		cmd_num++;
+	}
+	if (cmd_num != 3)
+		return -EINVAL;
+
+	file = event_file_data(filp);
+	/* Command is to attach fd to tracepoint */
+	if (cmd == BPF_ATTACH) {
+		mutex_lock(&event_mutex);
+		call = file->event_call;
+
+		raw_tp = bpf_raw_tracepoint_open((char *)call->tp->name,
+						 prog_fd);
+		if (IS_ERR(raw_tp)) {
+			mutex_unlock(&event_mutex);
+			return PTR_ERR(raw_tp);
+		}
+
+		list_add(&raw_tp->event_attached, &file->bpf_attached);
+		mutex_unlock(&event_mutex);
+		*ppos += cnt;
+		return cnt;
+	}
+
+	/* Command is to detach fd from tracepoint */
+	prog = bpf_prog_get(prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry_safe(raw_tp, next, &file->bpf_attached,
+				 event_attached) {
+		if (raw_tp->prog == prog) {
+			list_del(&raw_tp->event_attached);
+			bpf_raw_tracepoint_close(raw_tp);
+			prog_put = false;
+			break;
+		}
+	}
+	mutex_unlock(&event_mutex);
+
+	if (prog_put)
+		bpf_prog_put(prog);
+	*ppos += cnt;
+	return cnt;
+}
+
+static void *event_bpf_attach_next(struct seq_file *m, void *t, loff_t *pos)
+{
+	struct trace_event_file *file = event_file_data(m->private);
+
+	return seq_list_next(t, &file->bpf_attached, pos);
+}
+
+static void *event_bpf_attach_start(struct seq_file *m, loff_t *pos)
+{
+	struct trace_event_file *event_file;
+
+	/* ->stop() is called even if ->start() fails */
+	mutex_lock(&event_mutex);
+	event_file = event_file_data(m->private);
+	if (unlikely(!event_file))
+		return ERR_PTR(-ENODEV);
+
+	if (list_empty(&event_file->bpf_attached))
+		return NULL;
+
+	return seq_list_start(&event_file->bpf_attached, *pos);
+}
+
+static void event_bpf_attach_stop(struct seq_file *m, void *t)
+{
+	mutex_unlock(&event_mutex);
+}
+
+static int event_bpf_attach_show(struct seq_file *m, void *v)
+{
+	struct bpf_raw_tracepoint *raw_tp;
+
+	raw_tp = list_entry(v, struct bpf_raw_tracepoint, event_attached);
+	seq_printf(m, "prog id: %u\n", raw_tp->prog->aux->id);
+	return 0;
+}
+
+static const struct seq_operations event_bpf_attach_seq_ops = {
+	.start = event_bpf_attach_start,
+	.next = event_bpf_attach_next,
+	.stop = event_bpf_attach_stop,
+	.show = event_bpf_attach_show,
+};
+
+static int event_bpf_attach_open(struct inode *inode, struct file *file)
+{
+	int ret = 0;
+
+	mutex_lock(&event_mutex);
+
+	if (unlikely(!event_file_data(file))) {
+		mutex_unlock(&event_mutex);
+		return -ENODEV;
+	}
+
+	if (file->f_mode & FMODE_READ) {
+		ret = seq_open(file, &event_bpf_attach_seq_ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+
+			m->private = file;
+		}
+	}
+
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
+const struct file_operations event_bpf_attach_fops = {
+	.open = event_bpf_attach_open,
+	.read = seq_read,
+	.write = event_bpf_attach_write,
+	.llseek = default_llseek,
+};
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 005f08629b8b..e33828d24eb2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1582,6 +1582,7 @@ extern struct list_head ftrace_events;
 
 extern const struct file_operations event_trigger_fops;
 extern const struct file_operations event_hist_fops;
+extern const struct file_operations event_bpf_attach_fops;
 
 #ifdef CONFIG_HIST_TRIGGERS
 extern int register_trigger_hist_cmd(void);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 67851fb66b6b..79420d5efaef 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2018,8 +2018,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 		trace_create_file("trigger", 0644, file->dir, file,
 				  &event_trigger_fops);
 
-		trace_create_file("bpf_attach", 0644, file->dir, file,
-				  &bpf_attach_trigger_fops);
+#ifdef CONFIG_BPF_EVENTS
+		trace_create_file("bpf", 0644, file->dir, file,
+				  &event_bpf_attach_fops);
+#endif
 	}
 
 #ifdef CONFIG_HIST_TRIGGERS
@@ -2267,6 +2269,9 @@ trace_create_new_event(struct trace_event_call *call,
 	atomic_set(&file->sm_ref, 0);
 	atomic_set(&file->tm_ref, 0);
 	INIT_LIST_HEAD(&file->triggers);
+#ifdef CONFIG_BPF_EVENTS
+	INIT_LIST_HEAD(&file->bpf_attached);
+#endif
 	list_add(&file->list, &tr->events);
 
 	return file;
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related

* [PATCH RFC 3/4] lib/bpf: Add support for ftrace event attach and detach
From: Joel Fernandes (Google) @ 2019-07-10 14:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joel Fernandes (Google), Adrian Ratiu, Alexei Starovoitov, bpf,
	Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
	jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
	Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
	Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
	Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
	primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
	Steven Rostedt, Tamir Carmeli, Yonghong Song
In-Reply-To: <20190710141548.132193-1-joel@joelfernandes.org>

Add the needed library support in this commit.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 tools/lib/bpf/bpf.c      | 53 ++++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/bpf.h      |  4 +++
 tools/lib/bpf/libbpf.map |  2 ++
 3 files changed, 59 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index c4a48086dc9a..28c5a7d00d14 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -24,6 +24,9 @@
 #include <stdlib.h>
 #include <string.h>
 #include <memory.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
 #include <unistd.h>
 #include <asm/unistd.h>
 #include <linux/bpf.h>
@@ -57,6 +60,8 @@
 #define min(x, y) ((x) < (y) ? (x) : (y))
 #endif
 
+#define TRACEFS "/sys/kernel/debug/tracing"
+
 static inline __u64 ptr_to_u64(const void *ptr)
 {
 	return (__u64) (unsigned long) ptr;
@@ -658,6 +663,54 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd)
 	return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
 }
 
+int bpf_raw_tracepoint_ftrace_attach(const char *subsys, const char *name,
+				     int prog_fd)
+{
+	char buf[256];
+	int len, ret, tfd;
+
+	sprintf(buf, "%s/events/%s/%s/bpf", TRACEFS, subsys, name);
+	tfd = open(buf, O_WRONLY);
+	if (tfd < 0)
+		return tfd;
+
+	sprintf(buf, "attach:%d", prog_fd);
+	len = strlen(buf);
+	ret = write(tfd, buf, len);
+
+	if (ret < 0)
+		goto err;
+	if (ret != len)
+		ret = -1;
+err:
+	close(tfd);
+	return ret;
+}
+
+int bpf_raw_tracepoint_ftrace_detach(const char *subsys, const char *name,
+				     int prog_fd)
+{
+	char buf[256];
+	int len, ret, tfd;
+
+	sprintf(buf, "%s/events/%s/%s/bpf", TRACEFS, subsys, name);
+	tfd = open(buf, O_WRONLY);
+	if (tfd < 0)
+		return tfd;
+
+	sprintf(buf, "detach:%d", prog_fd);
+	len = strlen(buf);
+	ret = write(tfd, buf, len);
+
+	if (ret < 0)
+		goto err;
+	if (ret != len)
+		ret = -1;
+err:
+	close(tfd);
+	return ret;
+}
+
 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 		 bool do_log)
 {
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 9593fec75652..5b9c44658037 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -163,6 +163,10 @@ LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type,
 			      __u32 query_flags, __u32 *attach_flags,
 			      __u32 *prog_ids, __u32 *prog_cnt);
 LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd);
+LIBBPF_API int bpf_raw_tracepoint_ftrace_attach(const char *subsys,
+						const char *name, int prog_fd);
+LIBBPF_API int bpf_raw_tracepoint_ftrace_detach(const char *subsys,
+						const char *name, int prog_fd);
 LIBBPF_API int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf,
 			    __u32 log_buf_size, bool do_log);
 LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 673001787cba..fca377b688c2 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -163,4 +163,6 @@ LIBBPF_0.0.3 {
 		bpf_map__is_internal;
 		bpf_map_freeze;
 		btf__finalize_data;
+		bpf_raw_tracepoint_ftrace_attach;
+		bpf_raw_tracepoint_ftrace_detach;
 } LIBBPF_0.0.2;
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related

* [PATCH RFC 4/4] selftests/bpf: Add test for ftrace-based BPF attach/detach
From: Joel Fernandes (Google) @ 2019-07-10 14:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joel Fernandes (Google), Adrian Ratiu, Alexei Starovoitov, bpf,
	Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
	jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
	Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
	Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
	Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
	primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
	Steven Rostedt, Tamir Carmeli, Yonghong Song
In-Reply-To: <20190710141548.132193-1-joel@joelfernandes.org>

Here we add support for testing the attach and detach of a BPF program
to a tracepoint through tracefs.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 .../raw_tp_writable_test_ftrace_run.c         | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c

diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c
new file mode 100644
index 000000000000..7b42e3a69b71
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include <linux/nbd.h>
+
+void test_raw_tp_writable_test_ftrace_run(void)
+{
+	__u32 duration = 0;
+	char error[4096];
+	int ret;
+
+	const struct bpf_insn trace_program[] = {
+		BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+		BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 42),
+		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+
+	struct bpf_load_program_attr load_attr = {
+		.prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+		.license = "GPL v2",
+		.insns = trace_program,
+		.insns_cnt = sizeof(trace_program) / sizeof(struct bpf_insn),
+		.log_level = 2,
+	};
+
+	int bpf_fd = bpf_load_program_xattr(&load_attr, error, sizeof(error));
+
+	if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable loaded",
+		  "failed: %d errno %d\n", bpf_fd, errno))
+		return;
+
+	const struct bpf_insn skb_program[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+
+	struct bpf_load_program_attr skb_load_attr = {
+		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+		.license = "GPL v2",
+		.insns = skb_program,
+		.insns_cnt = sizeof(skb_program) / sizeof(struct bpf_insn),
+	};
+
+	int filter_fd =
+		bpf_load_program_xattr(&skb_load_attr, error, sizeof(error));
+	if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n",
+		  filter_fd, errno))
+		goto out_bpffd;
+
+	ret = bpf_raw_tracepoint_ftrace_attach("bpf_test_run",
+					       "bpf_test_finish",
+					       bpf_fd);
+	if (CHECK(ret < 0, "bpf_raw_tracepoint_ftrace_attach",
+		  "failed: %d errno %d\n", ret, errno))
+		goto out_filterfd;
+
+	char test_skb[128] = {
+		0,
+	};
+
+	__u32 prog_ret;
+	int err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0,
+				    0, &prog_ret, 0);
+	CHECK(err != 42, "test_run",
+	      "tracepoint did not modify return value\n");
+	CHECK(prog_ret != 0, "test_run_ret",
+	      "socket_filter did not return 0\n");
+
+	ret = bpf_raw_tracepoint_ftrace_detach("bpf_test_run",
+					       "bpf_test_finish",
+					       bpf_fd);
+	if (CHECK(ret < 0, "bpf_raw_tracepoint_ftrace_detach",
+		  "failed: %d errno %d\n", ret, errno))
+		goto out_filterfd;
+
+	err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0, 0,
+				&prog_ret, 0);
+	CHECK(err != 0, "test_run_notrace",
+	      "test_run failed with %d errno %d\n", err, errno);
+	CHECK(prog_ret != 0, "test_run_ret_notrace",
+	      "socket_filter did not return 0\n");
+
+out_filterfd:
+	close(filter_fd);
+out_bpffd:
+	close(bpf_fd);
+}
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox