Netdev List

Netdev List
 help / color / mirror / Atom feed

* [iproute2 net-next 8/8] Introduce ip vrf command
From: David Ahern @ 2016-12-10 20:32 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481401934-4026-1-git-send-email-dsa@cumulusnetworks.com>

'ip vrf' follows the user semnatics established by 'ip netns'.

The 'ip vrf' subcommand supports 3 usages:

1. Run a command against a given vrf:
       ip vrf exec NAME CMD

   Uses the recently committed cgroup/sock BPF option. vrf directory
   is added to cgroup2 mount. Individual vrfs are created under it. BPF
   filter attached to vrf/NAME cgroup2 to set sk_bound_dev_if to the VRF
   device index. From there the current process (ip's pid) is addded to
   the cgroups.proc file and the given command is exected. In doing so
   all AF_INET/AF_INET6 (ipv4/ipv6) sockets are automatically bound to
   the VRF domain.

   The association is inherited parent to child allowing the command to
   be a shell from which other commands are run relative to the VRF.

2. Show the VRF a process is bound to:
       ip vrf id
   This command essentially looks at /proc/pid/cgroup for a "::/vrf/"
   entry with the VRF name following.

3. Show process ids bound to a VRF
       ip vrf pids NAME
   This command dumps the file MNT/vrf/NAME/cgroup.procs since that file
   shows the process ids in the particular vrf cgroup.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 ip/Makefile       |   3 +-
 ip/ip.c           |   4 +-
 ip/ip_common.h    |   2 +
 ip/ipvrf.c        | 289 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 man/man8/ip-vrf.8 |  88 +++++++++++++++++
 5 files changed, 384 insertions(+), 2 deletions(-)
 create mode 100644 ip/ipvrf.c
 create mode 100644 man/man8/ip-vrf.8

diff --git a/ip/Makefile b/ip/Makefile
index c8e6c6172741..1928489e7f90 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -7,7 +7,8 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o \
     link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \
     iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o \
-    iplink_geneve.o iplink_vrf.o iproute_lwtunnel.o ipmacsec.o ipila.o
+    iplink_geneve.o iplink_vrf.o iproute_lwtunnel.o ipmacsec.o ipila.o \
+    ipvrf.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/ip.c b/ip/ip.c
index cb3adcb3f57d..07050b07592a 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -51,7 +51,8 @@ static void usage(void)
 "       ip [ -force ] -batch filename\n"
 "where  OBJECT := { link | address | addrlabel | route | rule | neigh | ntable |\n"
 "                   tunnel | tuntap | maddress | mroute | mrule | monitor | xfrm |\n"
-"                   netns | l2tp | fou | macsec | tcp_metrics | token | netconf | ila }\n"
+"                   netns | l2tp | fou | macsec | tcp_metrics | token | netconf | ila |\n"
+"                   vrf }\n"
 "       OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n"
 "                    -h[uman-readable] | -iec |\n"
 "                    -f[amily] { inet | inet6 | ipx | dnet | mpls | bridge | link } |\n"
@@ -99,6 +100,7 @@ static const struct cmd {
 	{ "mrule",	do_multirule },
 	{ "netns",	do_netns },
 	{ "netconf",	do_ipnetconf },
+	{ "vrf",	do_ipvrf},
 	{ "help",	do_help },
 	{ 0 }
 };
diff --git a/ip/ip_common.h b/ip/ip_common.h
index 3162f1ca5b2c..28763e81e4a4 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -57,6 +57,8 @@ extern int do_ipila(int argc, char **argv);
 int do_tcp_metrics(int argc, char **argv);
 int do_ipnetconf(int argc, char **argv);
 int do_iptoken(int argc, char **argv);
+int do_ipvrf(int argc, char **argv);
+
 int iplink_get(unsigned int flags, char *name, __u32 filt_mask);
 
 static inline int rtm_get_table(struct rtmsg *r, struct rtattr **tb)
diff --git a/ip/ipvrf.c b/ip/ipvrf.c
new file mode 100644
index 000000000000..c4f0e53532e2
--- /dev/null
+++ b/ip/ipvrf.c
@@ -0,0 +1,289 @@
+/*
+ * ipvrf.c	"ip vrf"
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	David Ahern <dsa@cumulusnetworks.com>
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/mount.h>
+#include <linux/bpf.h>
+#include <linux/if.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "ip_common.h"
+#include "libbpf.h"
+#include "bpf_util.h"
+
+#define CGRP_PROC_FILE  "/cgroup.procs"
+
+static void usage(void)
+{
+	fprintf(stderr, "Usage: ip vrf exec [NAME] cmd ...\n");
+	fprintf(stderr, "       ip vrf identify [PID]\n");
+	fprintf(stderr, "       ip vrf pids [NAME]\n");
+
+	exit(-1);
+}
+
+static int ipvrf_identify(int argc, char **argv)
+{
+	char path[PATH_MAX];
+	char buf[4096];
+	char *vrf, *end;
+	int fd, rc = -1;
+	unsigned int pid;
+	ssize_t n;
+
+	if (argc < 1)
+		pid = getpid();
+	else if (argc > 1)
+		invarg("Extra arguments specified\n", argv[1]);
+	else if (get_unsigned(&pid, argv[0], 10))
+		invarg("Invalid pid\n", argv[0]);
+
+	snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr,
+			"Failed to open cgroups file: %s\n", strerror(errno));
+		return -1;
+	}
+
+	n = read(fd, buf, sizeof(buf) - 1);
+	if (n < 0) {
+		fprintf(stderr,
+			"Failed to read cgroups file: %s\n", strerror(errno));
+		goto out;
+	}
+	buf[n] = '\0';
+	vrf = strstr(buf, "::/vrf/");
+	if (vrf) {
+		vrf += 7;  /* skip past "::/vrf/" */
+		end = strchr(vrf, '\n');
+		if (end)
+			*end = '\0';
+
+		printf("%s\n", vrf);
+	}
+
+	rc = 0;
+out:
+	close(fd);
+
+	return rc;
+}
+
+static int ipvrf_pids(int argc, char **argv)
+{
+	char path[PATH_MAX];
+	char buf[4096];
+	char *mnt, *vrf;
+	int fd, rc = -1;
+	ssize_t n;
+
+	if (argc != 1) {
+		fprintf(stderr, "Invalid arguments\n");
+		return -1;
+	}
+
+	vrf = argv[0];
+
+	mnt = find_cgroup2_mount();
+	if (!mnt)
+		return -1;
+
+	snprintf(path, sizeof(path), "%s/vrf/%s%s", mnt, vrf, CGRP_PROC_FILE);
+	free(mnt);
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return 0; /* no cgroup file, nothing to show */
+
+	while (1) {
+		n = read(fd, buf, sizeof(buf) - 1);
+		if (n < 0) {
+			fprintf(stderr,
+				"Failed to read cgroups file: %s\n", strerror(errno));
+			break;
+		} else if (n == 0) {
+			rc = 0;
+			break;
+		}
+		printf("%s", buf);
+	}
+
+	close(fd);
+
+	return rc;
+}
+
+/* load BPF program to set sk_bound_dev_if for sockets */
+static char bpf_log_buf[256*1024];
+
+static int prog_load(int idx)
+{
+	struct bpf_insn prog[] = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+		BPF_MOV64_IMM(BPF_REG_3, idx),
+		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)),
+		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)),
+		BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
+		BPF_EXIT_INSN(),
+	};
+
+	return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, prog, sizeof(prog),
+			     "GPL", bpf_log_buf, sizeof(bpf_log_buf));
+}
+
+static int vrf_configure_cgroup(const char *path, int ifindex)
+{
+	int rc = -1, cg_fd, prog_fd = -1;
+
+	cg_fd = open(path, O_DIRECTORY | O_RDONLY);
+	if (cg_fd < 0) {
+		fprintf(stderr, "Failed to open cgroup path: '%s'\n", strerror(errno));
+		goto out;
+	}
+
+	/*
+	 * Load bpf program into kernel and attach to cgroup to affect
+	 * socket creates
+	 */
+	prog_fd = prog_load(ifindex);
+	if (prog_fd < 0) {
+		printf("Failed to load BPF prog: '%s'\n", strerror(errno));
+		goto out;
+	}
+
+	if (bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE)) {
+		fprintf(stderr, "Failed to attach prog to cgroup: '%s'\n",
+			strerror(errno));
+			fprintf(stderr, "Kernel compiled with CGROUP_BPF enabled?\n");
+		goto out;
+	}
+
+	rc = 0;
+out:
+	close(cg_fd);
+	close(prog_fd);
+
+	return rc;
+}
+
+static int vrf_switch(const char *name)
+{
+	char path[PATH_MAX], *mnt, pid[16];
+	int ifindex = name_is_vrf(name);
+	bool default_vrf = false;
+	int rc = -1, len, fd = -1;
+
+	if (!ifindex) {
+		if (strcmp(name, "default")) {
+			fprintf(stderr, "Invalid VRF name\n");
+			return -1;
+		}
+		default_vrf = true;
+	}
+
+	mnt = find_cgroup2_mount();
+	if (!mnt)
+		return -1;
+
+	/* path to cgroup; make sure buffer has room to cat "/cgroup.procs"
+	 * to the end of the path
+	 */
+	len = snprintf(path, sizeof(path) - sizeof(CGRP_PROC_FILE), "%s%s/%s",
+		       mnt, default_vrf ? "" : "/vrf", name);
+	if (len > sizeof(path) - sizeof(CGRP_PROC_FILE)) {
+		fprintf(stderr, "Invalid path to cgroup2 mount\n");
+		goto out;
+	}
+
+	if (make_path(path, 0755)) {
+		fprintf(stderr, "Failed to setup vrf cgroup2 directory\n");
+		goto out;
+	}
+
+	if (!default_vrf && vrf_configure_cgroup(path, ifindex))
+		goto out;
+
+	/*
+	 * write pid to cgroup.procs making process part of cgroup
+	 */
+	strcat(path, CGRP_PROC_FILE);
+	fd = open(path, O_RDWR | O_APPEND);
+	if (fd < 0) {
+		fprintf(stderr, "cgroups.procs file does not exist.\n");
+		goto out;
+	}
+
+	snprintf(pid, sizeof(pid), "%d", getpid());
+	if (write(fd, pid, strlen(pid)) < 0) {
+		fprintf(stderr, "Failed to join cgroup\n");
+		goto out;
+	}
+
+	rc = 0;
+out:
+	free(mnt);
+	close(fd);
+
+	return rc;
+}
+
+static int ipvrf_exec(int argc, char **argv)
+{
+	if (argc < 1) {
+		fprintf(stderr, "No VRF name specified\n");
+		return -1;
+	}
+	if (argc < 2) {
+		fprintf(stderr, "No command specified\n");
+		return -1;
+	}
+
+	if (vrf_switch(argv[0]))
+		return -1;
+
+	return -cmd_exec(argv[1], argv + 1, !!batch_mode);
+}
+
+int do_ipvrf(int argc, char **argv)
+{
+	if (argc == 0) {
+		fprintf(stderr, "No command given. Try \"ip vrf help\".\n");
+		exit(-1);
+	}
+
+	if (matches(*argv, "identify") == 0)
+		return ipvrf_identify(argc-1, argv+1);
+
+	if (matches(*argv, "pids") == 0)
+		return ipvrf_pids(argc-1, argv+1);
+
+	if (matches(*argv, "exec") == 0)
+		return ipvrf_exec(argc-1, argv+1);
+
+	if (matches(*argv, "help") == 0)
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip vrf help\".\n",
+		*argv);
+
+	exit(-1);
+}
diff --git a/man/man8/ip-vrf.8 b/man/man8/ip-vrf.8
new file mode 100644
index 000000000000..57a7c7692ce8
--- /dev/null
+++ b/man/man8/ip-vrf.8
@@ -0,0 +1,88 @@
+.TH IP\-VRF 8 "7 Dec 2016" "iproute2" "Linux"
+.SH NAME
+ip-vrf \- run a command against a vrf
+.SH SYNOPSIS
+.sp
+.ad l
+.in +8
+.ti -8
+.B ip
+.B vrf
+.RI  " { " COMMAND " | "
+.BR help " }"
+.sp
+
+.ti -8
+.BR "ip vrf identify"
+.RI "[ " PID " ]"
+
+.ti -8
+.BR "ip vrf pids"
+.I NAME
+
+.ti -8
+.BR "ip vrf exec "
+.RI "[ " NAME " ] " command ...
+
+.SH DESCRIPTION
+A VRF provides traffic isolation at layer 3 for routing, similar to how a
+VLAN is used to isolate traffic at layer 2. Fundamentally, a VRF is a separate
+routing table. Network devices are associated with a VRF by enslaving the
+device to the VRF. At that point network addresses assigned to the device are
+local to the VRF with host and connected routes moved to the table associated
+with the VRF.
+
+A process can specify a VRF using several APIs -- binding the socket to the
+VRF device using SO_BINDTODEVICE, setting the VRF association using
+IP_UNICAST_IF or IPV6_UNICAST_IF, or specifying the VRF for a specific message
+using IP_PKTINFO or IPV6_PKTINFO.
+
+By default a process is not bound to any VRF. An association can be set
+explicitly by making the program use one of the APIs mentioned above or
+implicitly using a helper to set SO_BINDTODEVICE for all IPv4 and IPv6
+sockets (AF_INET and AF_INET6) when the socket is created. This ip-vrf command
+is a helper to run a command against a specific VRF with the VRF association
+inherited parent to child.
+
+.TP
+.B ip vrf exec [ NAME ] cmd ... - Run cmd against the named VRF
+.sp
+This command allows applications that are VRF unaware to be run against
+a VRF other than the default VRF (main table). A command can be run against
+the default VRF by passing the "default" as the VRF name. This is useful if
+the current shell is associated with another VRF (e.g, Management VRF).
+
+.TP
+.B ip vrf identify [PID] - Report VRF association for process
+.sp
+This command shows the VRF association of the specified process. If PID is
+not specified then the id of the current process is used.
+
+.TP
+.B ip vrf pids NAME - Report processes associated with the named VRF
+.sp
+This command shows all process ids that are associated with the given
+VRF.
+
+.SH CAVEATS
+This command requires a kernel compiled with CGROUPS and CGROUP_BPF enabled.
+
+The VRF helper *only* affects network layer sockets.
+
+.SH EXAMPLES
+.PP
+ip vrf exec red ssh 10.100.1.254
+.RS
+Executes ssh to 10.100.1.254 against the VRF red table.
+.RE
+
+.SH SEE ALSO
+.br
+.BR ip (8),
+.BR ip-link (8),
+.BR ip-address (8),
+.BR ip-route (8),
+.BR ip-neighbor (8)
+
+.SH AUTHOR
+Original Manpage by David Ahern
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 7/8] libnetlink: Add variant of rtnl_talk that does not display RTNETLINK answers error
From: David Ahern @ 2016-12-10 20:32 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481401934-4026-1-git-send-email-dsa@cumulusnetworks.com>

iplink_vrf has 2 functions used to validate a user given device name is
a VRF device and to return the table id. If the user string is not a
device name ip commands with a vrf keyword show a confusing error
message: "RTNETLINK answers: No such device".

Add a variant of rtnl_talk that does not display the "RTNETLINK answers"
message and update iplink_vrf to use it.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/libnetlink.h |  3 +++
 ip/iplink_vrf.c      | 14 +++++++++++---
 lib/libnetlink.c     | 20 +++++++++++++++++---
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index 751ebf186dd4..bd0267dfcc02 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -81,6 +81,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	      struct nlmsghdr *answer, size_t len)
 	__attribute__((warn_unused_result));
+int rtnl_talk_suppress_rtnl_errmsg(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+				   struct nlmsghdr *answer, size_t len)
+	__attribute__((warn_unused_result));
 int rtnl_send(struct rtnl_handle *rth, const void *buf, int)
 	__attribute__((warn_unused_result));
 int rtnl_send_check(struct rtnl_handle *rth, const void *buf, int)
diff --git a/ip/iplink_vrf.c b/ip/iplink_vrf.c
index c101ed770f87..917630e85337 100644
--- a/ip/iplink_vrf.c
+++ b/ip/iplink_vrf.c
@@ -13,6 +13,7 @@
 #include <string.h>
 #include <sys/socket.h>
 #include <linux/if_link.h>
+#include <errno.h>
 
 #include "rt_names.h"
 #include "utils.h"
@@ -126,8 +127,14 @@ __u32 ipvrf_get_table(const char *name)
 
 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, strlen(name) + 1);
 
-	if (rtnl_talk(&rth, &req.n, &answer.n, sizeof(answer)) < 0)
-		return 0;
+	if (rtnl_talk_suppress_rtnl_errmsg(&rth, &req.n,
+					   &answer.n, sizeof(answer)) < 0) {
+		/* special case "default" vrf to be the main table */
+		if (errno == ENODEV && !strcmp(name, "default"))
+			rtnl_rttable_a2n(&tb_id, "main");
+
+		return tb_id;
+	}
 
 	ifi = NLMSG_DATA(&answer.n);
 	len = answer.n.nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
@@ -186,7 +193,8 @@ int name_is_vrf(const char *name)
 
 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, strlen(name) + 1);
 
-	if (rtnl_talk(&rth, &req.n, &answer.n, sizeof(answer)) < 0)
+	if (rtnl_talk_suppress_rtnl_errmsg(&rth, &req.n,
+					   &answer.n, sizeof(answer)) < 0)
 		return 0;
 
 	ifi = NLMSG_DATA(&answer.n);
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index a5db168e50eb..9d7e89aebbd0 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -12,6 +12,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdbool.h>
 #include <unistd.h>
 #include <syslog.h>
 #include <fcntl.h>
@@ -397,8 +398,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 	return rtnl_dump_filter_l(rth, a);
 }
 
-int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-	      struct nlmsghdr *answer, size_t maxlen)
+static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+		       struct nlmsghdr *answer, size_t maxlen,
+		       bool show_rtnl_err)
 {
 	int status;
 	unsigned int seq;
@@ -485,7 +487,7 @@ int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 					return 0;
 				}
 
-				if (rtnl->proto != NETLINK_SOCK_DIAG)
+				if (rtnl->proto != NETLINK_SOCK_DIAG && show_rtnl_err)
 					fprintf(stderr,
 						"RTNETLINK answers: %s\n",
 						strerror(-err->error));
@@ -517,6 +519,18 @@ int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	}
 }
 
+int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+	      struct nlmsghdr *answer, size_t maxlen)
+{
+	return __rtnl_talk(rtnl, n, answer, maxlen, true);
+}
+
+int rtnl_talk_suppress_rtnl_errmsg(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+				   struct nlmsghdr *answer, size_t maxlen)
+{
+	return __rtnl_talk(rtnl, n, answer, maxlen, false);
+}
+
 int rtnl_listen_all_nsid(struct rtnl_handle *rth)
 {
 	unsigned int on = 1;
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 6/8] change name_is_vrf to return index
From: David Ahern @ 2016-12-10 20:32 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481401934-4026-1-git-send-email-dsa@cumulusnetworks.com>

index of 0 means name is not a valid vrf.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 ip/ip_common.h  |  2 +-
 ip/iplink_vrf.c | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/ip/ip_common.h b/ip/ip_common.h
index 0147f45a7a31..3162f1ca5b2c 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -91,7 +91,7 @@ struct link_util *get_link_kind(const char *kind);
 void br_dump_bridge_id(const struct ifla_bridge_id *id, char *buf, size_t len);
 
 __u32 ipvrf_get_table(const char *name);
-bool name_is_vrf(const char *name);
+int name_is_vrf(const char *name);
 
 #ifndef	INFINITY_LIFE_TIME
 #define     INFINITY_LIFE_TIME      0xFFFFFFFFU
diff --git a/ip/iplink_vrf.c b/ip/iplink_vrf.c
index a238b2906805..c101ed770f87 100644
--- a/ip/iplink_vrf.c
+++ b/ip/iplink_vrf.c
@@ -159,7 +159,7 @@ __u32 ipvrf_get_table(const char *name)
 	return tb_id;
 }
 
-bool name_is_vrf(const char *name)
+int name_is_vrf(const char *name)
 {
 	struct {
 		struct nlmsghdr		n;
@@ -187,24 +187,27 @@ bool name_is_vrf(const char *name)
 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, strlen(name) + 1);
 
 	if (rtnl_talk(&rth, &req.n, &answer.n, sizeof(answer)) < 0)
-		return false;
+		return 0;
 
 	ifi = NLMSG_DATA(&answer.n);
 	len = answer.n.nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
 	if (len < 0) {
 		fprintf(stderr, "BUG: Invalid response to link query.\n");
-		return false;
+		return 0;
 	}
 
 	parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
 
 	if (!tb[IFLA_LINKINFO])
-		return false;
+		return 0;
 
 	parse_rtattr_nested(li, IFLA_INFO_MAX, tb[IFLA_LINKINFO]);
 
 	if (!li[IFLA_INFO_KIND])
-		return false;
+		return 0;
+
+	if (strcmp(RTA_DATA(li[IFLA_INFO_KIND]), "vrf"))
+		return 0;
 
-	return strcmp(RTA_DATA(li[IFLA_INFO_KIND]), "vrf") == 0;
+	return ifi->ifi_index;
 }
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 4/8] move cmd_exec to lib utils
From: David Ahern @ 2016-12-10 20:32 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481401934-4026-1-git-send-email-dsa@cumulusnetworks.com>

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/utils.h |  2 ++
 ip/ipnetns.c    | 34 ----------------------------------
 lib/Makefile    |  2 +-
 lib/exec.c      | 41 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 44 insertions(+), 35 deletions(-)
 create mode 100644 lib/exec.c

diff --git a/include/utils.h b/include/utils.h
index 26c970daa5d0..ac4517a3bde1 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -256,4 +256,6 @@ char *int_to_str(int val, char *buf);
 int get_guid(__u64 *guid, const char *arg);
 int get_real_family(int rtm_type, int rtm_family);
 
+int cmd_exec(const char *cmd, char **argv, bool do_fork);
+
 #endif /* __UTILS_H__ */
diff --git a/ip/ipnetns.c b/ip/ipnetns.c
index bd1e9013706c..db9a541769f1 100644
--- a/ip/ipnetns.c
+++ b/ip/ipnetns.c
@@ -357,40 +357,6 @@ static int netns_list(int argc, char **argv)
 	return 0;
 }
 
-static int cmd_exec(const char *cmd, char **argv, bool do_fork)
-{
-	fflush(stdout);
-	if (do_fork) {
-		int status;
-		pid_t pid;
-
-		pid = fork();
-		if (pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-
-		if (pid != 0) {
-			/* Parent  */
-			if (waitpid(pid, &status, 0) < 0) {
-				perror("waitpid");
-				exit(1);
-			}
-
-			if (WIFEXITED(status)) {
-				return WEXITSTATUS(status);
-			}
-
-			exit(1);
-		}
-	}
-
-	if (execvp(cmd, argv)  < 0)
-		fprintf(stderr, "exec of \"%s\" failed: %s\n",
-				cmd, strerror(errno));
-	_exit(1);
-}
-
 static int on_netns_exec(char *nsname, void *arg)
 {
 	char **argv = arg;
diff --git a/lib/Makefile b/lib/Makefile
index 5b7ec169048a..749073261c49 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -8,7 +8,7 @@ CFLAGS += -fPIC
 
 UTILOBJ = utils.o rt_names.o ll_types.o ll_proto.o ll_addr.o \
 	inet_proto.o namespace.o json_writer.o \
-	names.o color.o bpf.o
+	names.o color.o bpf.o exec.o
 
 NLOBJ=libgenl.o ll_map.o libnetlink.o
 
diff --git a/lib/exec.c b/lib/exec.c
new file mode 100644
index 000000000000..96edbc422e84
--- /dev/null
+++ b/lib/exec.c
@@ -0,0 +1,41 @@
+#define _ATFILE_SOURCE
+#include <sys/wait.h>
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+int cmd_exec(const char *cmd, char **argv, bool do_fork)
+{
+	fflush(stdout);
+	if (do_fork) {
+		int status;
+		pid_t pid;
+
+		pid = fork();
+		if (pid < 0) {
+			perror("fork");
+			exit(1);
+		}
+
+		if (pid != 0) {
+			/* Parent  */
+			if (waitpid(pid, &status, 0) < 0) {
+				perror("waitpid");
+				exit(1);
+			}
+
+			if (WIFEXITED(status)) {
+				return WEXITSTATUS(status);
+			}
+
+			exit(1);
+		}
+	}
+
+	if (execvp(cmd, argv)  < 0)
+		fprintf(stderr, "exec of \"%s\" failed: %s\n",
+				cmd, strerror(errno));
+	_exit(1);
+}
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 5/8] Add filesystem APIs to lib
From: David Ahern @ 2016-12-10 20:32 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481401934-4026-1-git-send-email-dsa@cumulusnetworks.com>

Add make_path to recursively call mkdir as needed to create a given
path with the given mode.

Add find_cgroup2_mount to lookup path where cgroup2 is mounted. If it
is not already mounted, cgroup2 is mounted under /var/run/cgroup2 for
use by iproute2.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/utils.h |   2 +
 lib/Makefile    |   2 +-
 lib/fs.c        | 143 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 lib/fs.c

diff --git a/include/utils.h b/include/utils.h
index ac4517a3bde1..dc1d6b9607dd 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -257,5 +257,7 @@ int get_guid(__u64 *guid, const char *arg);
 int get_real_family(int rtm_type, int rtm_family);
 
 int cmd_exec(const char *cmd, char **argv, bool do_fork);
+int make_path(const char *path, mode_t mode);
+char *find_cgroup2_mount(void);
 
 #endif /* __UTILS_H__ */
diff --git a/lib/Makefile b/lib/Makefile
index 749073261c49..0c57662b4f8f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -8,7 +8,7 @@ CFLAGS += -fPIC
 
 UTILOBJ = utils.o rt_names.o ll_types.o ll_proto.o ll_addr.o \
 	inet_proto.o namespace.o json_writer.o \
-	names.o color.o bpf.o exec.o
+	names.o color.o bpf.o exec.o fs.o
 
 NLOBJ=libgenl.o ll_map.o libnetlink.o
 
diff --git a/lib/fs.c b/lib/fs.c
new file mode 100644
index 000000000000..39cc96dccca9
--- /dev/null
+++ b/lib/fs.c
@@ -0,0 +1,143 @@
+/*
+ * fs.c         filesystem APIs
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	David Ahern <dsa@cumulusnetworks.com>
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/mount.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+
+#include "utils.h"
+
+#define CGROUP2_FS_NAME "cgroup2"
+
+/* if not already mounted cgroup2 is mounted here for iproute2's use */
+#define MNT_CGRP2_PATH  "/var/run/cgroup2"
+
+/* return mount path of first occurrence of given fstype */
+static char *find_fs_mount(const char *fs_to_find)
+{
+	char path[4096];
+	char fstype[128];    /* max length of any filesystem name */
+	char *mnt = NULL;
+	FILE *fp;
+
+	fp = fopen("/proc/mounts", "r");
+	if (!fp) {
+		fprintf(stderr,
+			"Failed to open mounts file: %s\n", strerror(errno));
+		return NULL;
+	}
+
+	while (fscanf(fp, "%*s %4096s %127s %*s %*d %*d\n",
+		      path, fstype) == 2) {
+		if (strcmp(fstype, fs_to_find) == 0) {
+			mnt = strdup(path);
+			break;
+		}
+	}
+
+	fclose(fp);
+
+	return mnt;
+}
+
+/* caller needs to free string returned */
+char *find_cgroup2_mount(void)
+{
+	char *mnt = find_fs_mount(CGROUP2_FS_NAME);
+
+	if (mnt)
+		return mnt;
+
+	mnt = strdup(MNT_CGRP2_PATH);
+	if (!mnt) {
+		fprintf(stderr, "Failed to allocate memory for cgroup2 path\n");
+		return NULL;
+
+	}
+
+	if (make_path(mnt, 0755)) {
+		fprintf(stderr, "Failed to setup vrf cgroup2 directory\n");
+		free(mnt);
+		return NULL;
+	}
+
+	if (mount("none", mnt, CGROUP2_FS_NAME, 0, NULL)) {
+		/* EBUSY means already mounted */
+		if (errno != EBUSY) {
+			fprintf(stderr,
+				"Failed to mount cgroup2. Are CGROUPS enabled in your kernel?\n");
+			free(mnt);
+			return NULL;
+		}
+	}
+	return mnt;
+}
+
+int make_path(const char *path, mode_t mode)
+{
+	char *dir, *delim;
+	struct stat sbuf;
+	int rc = -1;
+
+	delim = dir = strdup(path);
+	if (dir == NULL) {
+		fprintf(stderr, "strdup failed copying path");
+		return -1;
+	}
+
+	/* skip '/' -- it had better exist */
+	if (*delim == '/')
+		delim++;
+
+	while (1) {
+		delim = strchr(delim, '/');
+		if (delim)
+			*delim = '\0';
+
+		if (stat(dir, &sbuf) != 0) {
+			if (errno != ENOENT) {
+				fprintf(stderr,
+					"stat failed for %s: %s\n",
+					dir, strerror(errno));
+				goto out;
+			}
+
+			if (mkdir(dir, mode) != 0) {
+				fprintf(stderr,
+					"mkdir failed for %s: %s",
+					dir, strerror(errno));
+				goto out;
+			}
+		}
+
+		if (delim == NULL)
+			break;
+
+		*delim = '/';
+		delim++;
+		if (*delim == '\0')
+			break;
+	}
+	rc = 0;
+out:
+	free(dir);
+
+	return rc;
+}
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 3/8] Add libbpf.h header with BPF_ macros
From: David Ahern @ 2016-12-10 20:32 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481401934-4026-1-git-send-email-dsa@cumulusnetworks.com>

Based on version in kernel repo, samples/bpf/libbpf.h

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/libbpf.h | 184 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 184 insertions(+)
 create mode 100644 include/libbpf.h

diff --git a/include/libbpf.h b/include/libbpf.h
new file mode 100644
index 000000000000..37951f509a10
--- /dev/null
+++ b/include/libbpf.h
@@ -0,0 +1,184 @@
+/* eBPF mini library */
+#ifndef __LIBBPF_H
+#define __LIBBPF_H
+
+/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
+
+#define BPF_ALU64_REG(OP, DST, SRC)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_X,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+#define BPF_ALU32_REG(OP, DST, SRC)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_OP(OP) | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+
+#define BPF_ALU64_IMM(OP, DST, IMM)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,	\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+#define BPF_ALU32_IMM(OP, DST, IMM)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_OP(OP) | BPF_K,		\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+/* Short form of mov, dst_reg = src_reg */
+
+#define BPF_MOV64_REG(DST, SRC)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+#define BPF_MOV32_REG(DST, SRC)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+/* Short form of mov, dst_reg = imm32 */
+
+#define BPF_MOV64_IMM(DST, IMM)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_MOV | BPF_K,		\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+#define BPF_MOV32_IMM(DST, IMM)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_MOV | BPF_K,		\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
+#define BPF_LD_IMM64(DST, IMM)					\
+	BPF_LD_IMM64_RAW(DST, 0, IMM)
+
+#define BPF_LD_IMM64_RAW(DST, SRC, IMM)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_LD | BPF_DW | BPF_IMM,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = (__u32) (IMM) }),			\
+	((struct bpf_insn) {					\
+		.code  = 0, /* zero is reserved opcode */	\
+		.dst_reg = 0,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = ((__u64) (IMM)) >> 32 })
+
+#ifndef BPF_PSEUDO_MAP_FD
+# define BPF_PSEUDO_MAP_FD	1
+#endif
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD)				\
+	BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
+
+/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
+
+#define BPF_LD_ABS(SIZE, IMM)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS,	\
+		.dst_reg = 0,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
+
+#define BPF_STX_MEM(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = imm32 */
+
+#define BPF_ST_MEM(SIZE, DST, OFF, IMM)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM,	\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = OFF,					\
+		.imm   = IMM })
+
+/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
+
+#define BPF_JMP_REG(OP, DST, SRC, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_OP(OP) | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
+
+#define BPF_JMP_IMM(OP, DST, IMM, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_OP(OP) | BPF_K,		\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = OFF,					\
+		.imm   = IMM })
+
+/* Raw code statement block */
+
+#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM)			\
+	((struct bpf_insn) {					\
+		.code  = CODE,					\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = IMM })
+
+/* Program exit */
+
+#define BPF_EXIT_INSN()						\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_EXIT,			\
+		.dst_reg = 0,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+#endif
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 2/8] bpf: export bpf_prog_load
From: David Ahern @ 2016-12-10 20:32 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481401934-4026-1-git-send-email-dsa@cumulusnetworks.com>

Code move only; no functional change intended.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/bpf_util.h |  3 +++
 lib/bpf.c          | 40 ++++++++++++++++++++--------------------
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/include/bpf_util.h b/include/bpf_util.h
index 49b96bbc208f..dcbdca6978d6 100644
--- a/include/bpf_util.h
+++ b/include/bpf_util.h
@@ -75,6 +75,9 @@ int bpf_trace_pipe(void);
 
 void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len);
 
+int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns,
+		  size_t size_insns, const char *license, char *log,
+		  size_t size_log);
 int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type);
 int bpf_prog_detach(int target_fd, enum bpf_attach_type type);
 
diff --git a/lib/bpf.c b/lib/bpf.c
index 103fc1ef0593..b04c3a678b9c 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -871,6 +871,26 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
 	return bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
 }
 
+int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns,
+		  size_t size_insns, const char *license, char *log,
+		  size_t size_log)
+{
+	union bpf_attr attr = {};
+
+	attr.prog_type = type;
+	attr.insns = bpf_ptr_to_u64(insns);
+	attr.insn_cnt = size_insns / sizeof(struct bpf_insn);
+	attr.license = bpf_ptr_to_u64(license);
+
+	if (size_log > 0) {
+		attr.log_buf = bpf_ptr_to_u64(log);
+		attr.log_size = size_log;
+		attr.log_level = 1;
+	}
+
+	return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+}
+
 #ifdef HAVE_ELF
 struct bpf_elf_prog {
 	enum bpf_prog_type	type;
@@ -988,26 +1008,6 @@ static int bpf_map_create(enum bpf_map_type type, uint32_t size_key,
 	return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 }
 
-static int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns,
-			 size_t size_insns, const char *license, char *log,
-			 size_t size_log)
-{
-	union bpf_attr attr = {};
-
-	attr.prog_type = type;
-	attr.insns = bpf_ptr_to_u64(insns);
-	attr.insn_cnt = size_insns / sizeof(struct bpf_insn);
-	attr.license = bpf_ptr_to_u64(license);
-
-	if (size_log > 0) {
-		attr.log_buf = bpf_ptr_to_u64(log);
-		attr.log_size = size_log;
-		attr.log_level = 1;
-	}
-
-	return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
-}
-
 static int bpf_obj_pin(int fd, const char *pathname)
 {
 	union bpf_attr attr = {};
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 1/8] lib bpf: Add support for BPF_PROG_ATTACH and BPF_PROG_DETACH
From: David Ahern @ 2016-12-10 20:32 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481401934-4026-1-git-send-email-dsa@cumulusnetworks.com>

For consistency with other bpf commands, the functions are named
bpf_prog_attach and bpf_prog_detach. The existing bpf_prog_attach is
renamed to bpf_prog_load_and_report since it calls bpf_prog_load and
bpf_prog_report.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/bpf_util.h |  3 +++
 lib/bpf.c          | 31 ++++++++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/include/bpf_util.h b/include/bpf_util.h
index 05baeecda57f..49b96bbc208f 100644
--- a/include/bpf_util.h
+++ b/include/bpf_util.h
@@ -75,6 +75,9 @@ int bpf_trace_pipe(void);
 
 void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len);
 
+int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type);
+int bpf_prog_detach(int target_fd, enum bpf_attach_type type);
+
 #ifdef HAVE_ELF
 int bpf_send_map_fds(const char *path, const char *obj);
 int bpf_recv_map_fds(const char *path, int *fds, struct bpf_map_aux *aux,
diff --git a/lib/bpf.c b/lib/bpf.c
index 2a8cd51d4dae..103fc1ef0593 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -850,6 +850,27 @@ int bpf_graft_map(const char *map_path, uint32_t *key, int argc, char **argv)
 	return ret;
 }
 
+int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type)
+{
+	union bpf_attr attr = {
+		.target_fd = target_fd,
+		.attach_bpf_fd = prog_fd,
+		.attach_type = type,
+	};
+
+	return bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
+}
+
+int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
+{
+	union bpf_attr attr = {
+		.target_fd = target_fd,
+		.attach_type = type,
+	};
+
+	return bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
+}
+
 #ifdef HAVE_ELF
 struct bpf_elf_prog {
 	enum bpf_prog_type	type;
@@ -1262,9 +1283,9 @@ static void bpf_prog_report(int fd, const char *section,
 	bpf_dump_error(ctx, "Verifier analysis:\n\n");
 }
 
-static int bpf_prog_attach(const char *section,
-			   const struct bpf_elf_prog *prog,
-			   struct bpf_elf_ctx *ctx)
+static int bpf_prog_load_and_report(const char *section,
+				    const struct bpf_elf_prog *prog,
+				    struct bpf_elf_ctx *ctx)
 {
 	int tries = 0, fd;
 retry:
@@ -1656,7 +1677,7 @@ static int bpf_fetch_prog(struct bpf_elf_ctx *ctx, const char *section,
 		prog.size    = data.sec_data->d_size;
 		prog.license = ctx->license;
 
-		fd = bpf_prog_attach(section, &prog, ctx);
+		fd = bpf_prog_load_and_report(section, &prog, ctx);
 		if (fd < 0)
 			return fd;
 
@@ -1755,7 +1776,7 @@ static int bpf_fetch_prog_relo(struct bpf_elf_ctx *ctx, const char *section,
 		prog.size    = data_insn.sec_data->d_size;
 		prog.license = ctx->license;
 
-		fd = bpf_prog_attach(section, &prog, ctx);
+		fd = bpf_prog_load_and_report(section, &prog, ctx);
 		if (fd < 0) {
 			*lderr = true;
 			return fd;
-- 
2.1.4

^ permalink raw reply related

* [iproute2 v2 net-next 0/8] Add support for vrf helper
From: David Ahern @ 2016-12-10 20:32 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern

This series adds support to iproute2 to run a command against a specific
VRF. The user semnatics are similar to 'ip netns'.

The 'ip vrf' subcommand supports 3 usages:

1. Run a command against a given vrf:
       ip vrf exec NAME CMD

   Uses the recently committed cgroup/sock BPF option. vrf directory
   is added to cgroup2 mount. Individual vrfs are created under it. BPF
   filter is attached to vrf/NAME cgroup2 to set sk_bound_dev_if to the
   device index of the VRF. From there the current process (ip's pid) is
   addded to the cgroups.proc file and the given command is exected. In
   doing so all AF_INET/AF_INET6 (ipv4/ipv6) sockets are automatically
   bound to the VRF domain.

   The association is inherited parent to child allowing the command to
   be a shell from which other commands are run relative to the VRF.

2. Show the VRF a process is bound to:
       ip vrf id
   This command essentially looks at /proc/pid/cgroup for a "::/vrf/"
   entry.

3. Show process ids bound to a VRF
       ip vrf pids NAME
   This command dumps the file MNT/vrf/NAME/cgroup.procs since that file
   shows the process ids in the particular vrf cgroup.

v2
- updated suject of patch 3 to avoid spam filters on vger

David Ahern (8):
  lib bpf: Add support for BPF_PROG_ATTACH and BPF_PROG_DETACH
  bpf: export bpf_prog_load
  Add libbpf.h header with BPF_ macros
  move cmd_exec to lib utils
  Add filesystem APIs to lib
  change name_is_vrf to return index
  libnetlink: Add variant of rtnl_talk that does not display RTNETLINK
    answers error
  Introduce ip vrf command

 include/bpf_util.h   |   6 ++
 include/libbpf.h     | 184 ++++++++++++++++++++++++++++++++
 include/libnetlink.h |   3 +
 include/utils.h      |   4 +
 ip/Makefile          |   3 +-
 ip/ip.c              |   4 +-
 ip/ip_common.h       |   4 +-
 ip/iplink_vrf.c      |  29 ++++--
 ip/ipnetns.c         |  34 ------
 ip/ipvrf.c           | 289 +++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/Makefile         |   2 +-
 lib/bpf.c            |  71 ++++++++-----
 lib/exec.c           |  41 ++++++++
 lib/fs.c             | 143 +++++++++++++++++++++++++
 lib/libnetlink.c     |  20 +++-
 man/man8/ip-vrf.8    |  88 ++++++++++++++++
 16 files changed, 850 insertions(+), 75 deletions(-)
 create mode 100644 include/libbpf.h
 create mode 100644 ip/ipvrf.c
 create mode 100644 lib/exec.c
 create mode 100644 lib/fs.c
 create mode 100644 man/man8/ip-vrf.8

-- 
2.1.4

^ permalink raw reply

* Re: Misalignment, MIPS, and ip_hdr(skb)->version
From: Felix Fietkau @ 2016-12-10 20:09 UTC (permalink / raw)
  To: Måns Rullgård
  Cc: Jason A. Donenfeld, David Miller, Netdev, WireGuard mailing list,
	LKML, linux-mips
In-Reply-To: <yw1x37hvykzk.fsf@unicorn.mansr.com>

On 2016-12-10 14:25, Måns Rullgård wrote:
> Felix Fietkau <nbd@nbd.name> writes:
> 
>> On 2016-12-07 19:54, Jason A. Donenfeld wrote:
>>> On Wed, Dec 7, 2016 at 7:51 PM, David Miller <davem@davemloft.net> wrote:
>>>> It's so much better to analyze properly where the misalignment comes from
>>>> and address it at the source, as we have for various cases that trip up
>>>> Sparc too.
>>> 
>>> That's sort of my attitude too, hence starting this thread. Any
>>> pointers you have about this would be most welcome, so as not to
>>> perpetuate what already seems like an issue in other parts of the
>>> stack.
>> Hi Jason,
>>
>> I'm the author of that hackish LEDE/OpenWrt patch that works around the
>> misalignment issues. Here's some context regarding that patch:
>>
>> I intentionally put it in the target specific patches for only one of
>> our MIPS targets. There are a few ar71xx devices where the misalignment
>> cannot be fixed, because the Ethernet MAC has a 4-byte DMA alignment
>> requirement, and does not support inserting 2 bytes of padding to
>> correct the IP header misalignment.
>>
>> With these limitations the choice was between this ugly network stack
>> patch or inserting a very expensive memmove in the data path (which is
>> better than taking the mis-alignment traps, but still hurts routing
>> performance significantly).
> 
> I solved this problem in an Ethernet driver by copying the initial part
> of the packet to an aligned skb and appending the remainder using
> skb_add_rx_frag().  The kernel network stack only cares about the
> headers, so the alignment of the packet payload doesn't matter.
I considered that as well, but it's bad for routing performance if the
ethernet MAC does not support scatter/gather for xmit.
Unfortunately that limitation is quite common on embedded hardware.

- Felix

^ permalink raw reply

* [PATCH net 3/3] net: bridge: shorten ageing time on topology change
From: Vivien Didelot @ 2016-12-10 18:44 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Stephen Hemminger,
	Florian Fainelli, Andrew Lunn, Ido Schimmel, Jiri Pirko,
	Nikolay Aleksandrov, cphealy, bridge, Vivien Didelot
In-Reply-To: <20161210184429.31600-1-vivien.didelot@savoirfairelinux.com>

802.1D [1] specifies that the bridges must use a short value to age out
dynamic entries in the Filtering Database for a period, once a topology
change has been communicated by the root bridge.

Add a bridge_ageing_time member in the net_bridge structure to store the
bridge ageing time value configured by the user (ioctl/netlink/sysfs).

If we are using in-kernel STP, shorten the ageing time value to twice
the forward delay used by the topology when the topology change flag is
set. When the flag is cleared, restore the configured ageing time.

[1] "8.3.5 Notifying topology changes ",
    http://profesores.elo.utfsm.cl/~agv/elo309/doc/802.1D-1998.pdf

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 net/bridge/br_device.c  |  2 +-
 net/bridge/br_private.h |  3 ++-
 net/bridge/br_stp.c     | 27 +++++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 89a687f..207318a 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -409,7 +409,7 @@ void br_dev_setup(struct net_device *dev)
 	br->bridge_max_age = br->max_age = 20 * HZ;
 	br->bridge_hello_time = br->hello_time = 2 * HZ;
 	br->bridge_forward_delay = br->forward_delay = 15 * HZ;
-	br->ageing_time = BR_DEFAULT_AGEING_TIME;
+	br->bridge_ageing_time = br->ageing_time = BR_DEFAULT_AGEING_TIME;
 
 	br_netfilter_rtable_init(br);
 	br_stp_timer_init(br);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 3c294b4..43efeb9 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -300,10 +300,11 @@ struct net_bridge
 	unsigned long			max_age;
 	unsigned long			hello_time;
 	unsigned long			forward_delay;
-	unsigned long			bridge_max_age;
 	unsigned long			ageing_time;
+	unsigned long			bridge_max_age;
 	unsigned long			bridge_hello_time;
 	unsigned long			bridge_forward_delay;
+	unsigned long			bridge_ageing_time;
 
 	u8				group_addr[ETH_ALEN];
 	bool				group_addr_set;
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 8d7b4c7..71fd1a4 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -597,7 +597,11 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
 	if (err)
 		return err;
 
+	spin_lock_bh(&br->lock);
+	br->bridge_ageing_time = t;
 	br->ageing_time = t;
+	spin_unlock_bh(&br->lock);
+
 	mod_timer(&br->gc_timer, jiffies);
 
 	return 0;
@@ -606,6 +610,29 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
 /* called under bridge lock */
 void __br_set_topology_change(struct net_bridge *br, unsigned char val)
 {
+	unsigned long t;
+	int err;
+
+	if (br->stp_enabled == BR_KERNEL_STP && br->topology_change != val) {
+		/* On topology change, set the bridge ageing time to twice the
+		 * forward delay. Otherwise, restore its default ageing time.
+		 */
+
+		if (val) {
+			t = 2 * br->forward_delay;
+			br_debug(br, "decreasing ageing time to %lu\n", t);
+		} else {
+			t = br->bridge_ageing_time;
+			br_debug(br, "restoring ageing time to %lu\n", t);
+		}
+
+		err = __set_ageing_time(br->dev, t);
+		if (err)
+			br_warn(br, "error offloading ageing time\n");
+		else
+			br->ageing_time = t;
+	}
+
 	br->topology_change = val;
 }
 
-- 
2.10.2

^ permalink raw reply related

* [PATCH net 2/3] net: bridge: add helper to set topology change
From: Vivien Didelot @ 2016-12-10 18:44 UTC (permalink / raw)
  To: netdev
  Cc: Ido Schimmel, Andrew Lunn, Florian Fainelli, Vivien Didelot,
	Nikolay Aleksandrov, bridge, linux-kernel, Jiri Pirko, kernel,
	David S. Miller
In-Reply-To: <20161210184429.31600-1-vivien.didelot@savoirfairelinux.com>

Add a __br_set_topology_change helper to set the topology change value.

This can be later extended to add actions when the topology change flag
is set or cleared.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 net/bridge/br_private_stp.h |  1 +
 net/bridge/br_stp.c         | 10 ++++++++--
 net/bridge/br_stp_if.c      |  2 +-
 net/bridge/br_stp_timer.c   |  2 +-
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/net/bridge/br_private_stp.h b/net/bridge/br_private_stp.h
index 2fe910c..3f7543a 100644
--- a/net/bridge/br_private_stp.h
+++ b/net/bridge/br_private_stp.h
@@ -61,6 +61,7 @@ void br_received_tcn_bpdu(struct net_bridge_port *p);
 void br_transmit_config(struct net_bridge_port *p);
 void br_transmit_tcn(struct net_bridge *br);
 void br_topology_change_detection(struct net_bridge *br);
+void __br_set_topology_change(struct net_bridge *br, unsigned char val);
 
 /* br_stp_bpdu.c */
 void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *);
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 6ebe2a0..8d7b4c7 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -234,7 +234,7 @@ static void br_record_config_timeout_values(struct net_bridge *br,
 	br->max_age = bpdu->max_age;
 	br->hello_time = bpdu->hello_time;
 	br->forward_delay = bpdu->forward_delay;
-	br->topology_change = bpdu->topology_change;
+	__br_set_topology_change(br, bpdu->topology_change);
 }
 
 /* called under bridge lock */
@@ -344,7 +344,7 @@ void br_topology_change_detection(struct net_bridge *br)
 		isroot ? "propagating" : "sending tcn bpdu");
 
 	if (isroot) {
-		br->topology_change = 1;
+		__br_set_topology_change(br, 1);
 		mod_timer(&br->topology_change_timer, jiffies
 			  + br->bridge_forward_delay + br->bridge_max_age);
 	} else if (!br->topology_change_detected) {
@@ -603,6 +603,12 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
 	return 0;
 }
 
+/* called under bridge lock */
+void __br_set_topology_change(struct net_bridge *br, unsigned char val)
+{
+	br->topology_change = val;
+}
+
 void __br_set_forward_delay(struct net_bridge *br, unsigned long t)
 {
 	br->bridge_forward_delay = t;
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 2efbba5..6c1e214 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -81,7 +81,7 @@ void br_stp_disable_bridge(struct net_bridge *br)
 
 	}
 
-	br->topology_change = 0;
+	__br_set_topology_change(br, 0);
 	br->topology_change_detected = 0;
 	spin_unlock_bh(&br->lock);
 
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index da058b8..7ddb38e 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -125,7 +125,7 @@ static void br_topology_change_timer_expired(unsigned long arg)
 	br_debug(br, "topo change timer expired\n");
 	spin_lock(&br->lock);
 	br->topology_change_detected = 0;
-	br->topology_change = 0;
+	__br_set_topology_change(br, 0);
 	spin_unlock(&br->lock);
 }
 
-- 
2.10.2

^ permalink raw reply related

* [PATCH net 1/3] net: bridge: add helper to offload ageing time
From: Vivien Didelot @ 2016-12-10 18:44 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Stephen Hemminger,
	Florian Fainelli, Andrew Lunn, Ido Schimmel, Jiri Pirko,
	Nikolay Aleksandrov, cphealy, bridge, Vivien Didelot
In-Reply-To: <20161210184429.31600-1-vivien.didelot@savoirfairelinux.com>

The SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME switchdev attr is actually set
when initializing a bridge port, and when configuring the bridge ageing
time from ioctl/netlink/sysfs.

Add a __set_ageing_time helper to offload the ageing time to physical
switches, and add the SWITCHDEV_F_DEFER flag since it can be called
under bridge lock.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 net/bridge/br_private.h |  1 +
 net/bridge/br_stp.c     | 28 ++++++++++++++++++++--------
 net/bridge/br_stp_if.c  | 12 +++---------
 3 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1b63177..3c294b4 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -992,6 +992,7 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t);
 int br_set_forward_delay(struct net_bridge *br, unsigned long x);
 int br_set_hello_time(struct net_bridge *br, unsigned long x);
 int br_set_max_age(struct net_bridge *br, unsigned long x);
+int __set_ageing_time(struct net_device *dev, unsigned long t);
 int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time);
 
 
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 9258b8e..6ebe2a0 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -562,6 +562,24 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
 
 }
 
+/* called under bridge lock */
+int __set_ageing_time(struct net_device *dev, unsigned long t)
+{
+	struct switchdev_attr attr = {
+		.orig_dev = dev,
+		.id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
+		.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER,
+		.u.ageing_time = jiffies_to_clock_t(t),
+	};
+	int err;
+
+	err = switchdev_port_attr_set(dev, &attr);
+	if (err && err != -EOPNOTSUPP)
+		return err;
+
+	return 0;
+}
+
 /* Set time interval that dynamic forwarding entries live
  * For pure software bridge, allow values outside the 802.1
  * standard specification for special cases:
@@ -572,17 +590,11 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
  */
 int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
 {
-	struct switchdev_attr attr = {
-		.orig_dev = br->dev,
-		.id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
-		.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
-		.u.ageing_time = ageing_time,
-	};
 	unsigned long t = clock_t_to_jiffies(ageing_time);
 	int err;
 
-	err = switchdev_port_attr_set(br->dev, &attr);
-	if (err && err != -EOPNOTSUPP)
+	err = __set_ageing_time(br->dev, t);
+	if (err)
 		return err;
 
 	br->ageing_time = t;
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index d8ad73b..2efbba5 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -36,12 +36,6 @@ static inline port_id br_make_port_id(__u8 priority, __u16 port_no)
 /* called under bridge lock */
 void br_init_port(struct net_bridge_port *p)
 {
-	struct switchdev_attr attr = {
-		.orig_dev = p->dev,
-		.id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
-		.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER,
-		.u.ageing_time = jiffies_to_clock_t(p->br->ageing_time),
-	};
 	int err;
 
 	p->port_id = br_make_port_id(p->priority, p->port_no);
@@ -50,9 +44,9 @@ void br_init_port(struct net_bridge_port *p)
 	p->topology_change_ack = 0;
 	p->config_pending = 0;
 
-	err = switchdev_port_attr_set(p->dev, &attr);
-	if (err && err != -EOPNOTSUPP)
-		netdev_err(p->dev, "failed to set HW ageing time\n");
+	err = __set_ageing_time(p->dev, p->br->ageing_time);
+	if (err)
+		netdev_err(p->dev, "failed to offload ageing time\n");
 }
 
 /* NO locks held */
-- 
2.10.2

^ permalink raw reply related

* [PATCH net 0/3] net: bridge: fast ageing on topology change
From: Vivien Didelot @ 2016-12-10 18:44 UTC (permalink / raw)
  To: netdev
  Cc: Ido Schimmel, Andrew Lunn, Florian Fainelli, Vivien Didelot,
	Nikolay Aleksandrov, bridge, linux-kernel, Jiri Pirko, kernel,
	David S. Miller

802.1D [1] specifies that the bridges in a network must use a short
value to age out dynamic entries in the Filtering Database for a period,
once a topology change has been communicated by the root bridge.

This patchset fixes this for the in-kernel STP implementation.

Once the topology change flag is set in a net_bridge instance, the
ageing time value is shorten to twice the forward delay used by the
topology.

When the topology change flag is cleared, the ageing time configured for
the bridge is restored.

To accomplish that, a new bridge_ageing_time member is added to the
net_bridge structure, to store the user configured bridge ageing time.

Two helpers are added to offload the ageing time and set the topology
change flag in the net_bridge instance. Then the required logic is added
in the topology change helper if in-kernel STP is used.

This has been tested on the following topology:

    +--------------+
    | root bridge  |
    |  1  2  3  4  |
    +--+--+--+--+--+
       |  |  |  |      +--------+
       |  |  |  +------| laptop |
       |  |  |         +--------+
    +--+--+--+-----+
    |  1  2  3     |
    | slave bridge |
    +--------------+

When unplugging/replugging the laptop, the slave bridge (under test)
gets the topology change flag sent by the root bridge, and fast ageing
is triggered on the bridges. Once the topology change timer of the root
bridge expires, the topology change flag is cleared and the configured
ageing time is restored on the bridges.

A similar test has been done between two bridges under test.
When changing the forward delay of the root bridge with:

    # echo 3000 > /sys/class/net/br0/bridge/forward_delay

the ageing time correctly changes on both bridges from 300s to 60s while
the TOPOLOGY_CHANGE flag is present.

[1] "8.3.5 Notifying topology changes",
    http://profesores.elo.utfsm.cl/~agv/elo309/doc/802.1D-1998.pdf

No change since RFC: https://lkml.org/lkml/2016/10/19/828

Vivien Didelot (3):
  net: bridge: add helper to offload ageing time
  net: bridge: add helper to set topology change
  net: bridge: shorten ageing time on topology change

 net/bridge/br_device.c      |  2 +-
 net/bridge/br_private.h     |  4 ++-
 net/bridge/br_private_stp.h |  1 +
 net/bridge/br_stp.c         | 65 ++++++++++++++++++++++++++++++++++++++-------
 net/bridge/br_stp_if.c      | 14 +++-------
 net/bridge/br_stp_timer.c   |  2 +-
 6 files changed, 65 insertions(+), 23 deletions(-)

-- 
2.10.2

^ permalink raw reply

* [PATCH net v3] ibmveth: set correct gso_size and gso_type
From: Thomas Falcon @ 2016-12-10 18:39 UTC (permalink / raw)
  To: netdev; +Cc: brking, marcelo.leitner, pradeeps, jmaxwell37, zdai, eric.dumazet
In-Reply-To: <1481333480-10827-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

This patch is based on an earlier one submitted
by Jon Maxwell with the following commit message:

"We recently encountered a bug where a few customers using ibmveth on the
same LPAR hit an issue where a TCP session hung when large receive was
enabled. Closer analysis revealed that the session was stuck because the
one side was advertising a zero window repeatedly.

We narrowed this down to the fact the ibmveth driver did not set gso_size
which is translated by TCP into the MSS later up the stack. The MSS is
used to calculate the TCP window size and as that was abnormally large,
it was calculating a zero window, even although the sockets receive buffer
was completely empty."

We rely on the Virtual I/O Server partition in a pseries
environment to provide the MSS through the TCP header checksum
field. The stipulation is that users should not disable checksum
offloading if rx packet aggregation is enabled through VIOS.

Some firmware offerings provide the MSS in the RX buffer.
This is signalled by a bit in the RX queue descriptor.

Reviewed-by: Brian King <brking@linux.vnet.ibm.com>
Reviewed-by: Pradeep Satyanarayana <pradeeps@linux.vnet.ibm.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Reviewed-by: Jonathan Maxwell <jmaxwell37@gmail.com>
Reviewed-by: David Dai <zdai@us.ibm.com>
Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
v3: include a check for non-zero mss when calculating gso_segs

v2: calculate gso_segs after Eric Dumazet's comments on the earlier patch
    and make sure everyone is included on CC
---
 drivers/net/ethernet/ibm/ibmveth.c | 72 ++++++++++++++++++++++++++++++++++++--
 drivers/net/ethernet/ibm/ibmveth.h |  1 +
 2 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index ebe6071..6dc24a1 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -58,7 +58,7 @@
 
 static const char ibmveth_driver_name[] = "ibmveth";
 static const char ibmveth_driver_string[] = "IBM Power Virtual Ethernet Driver";
-#define ibmveth_driver_version "1.05"
+#define ibmveth_driver_version "1.06"
 
 MODULE_AUTHOR("Santiago Leon <santil@linux.vnet.ibm.com>");
 MODULE_DESCRIPTION("IBM Power Virtual Ethernet Driver");
@@ -137,6 +137,11 @@ static inline int ibmveth_rxq_frame_offset(struct ibmveth_adapter *adapter)
 	return ibmveth_rxq_flags(adapter) & IBMVETH_RXQ_OFF_MASK;
 }
 
+static inline int ibmveth_rxq_large_packet(struct ibmveth_adapter *adapter)
+{
+	return ibmveth_rxq_flags(adapter) & IBMVETH_RXQ_LRG_PKT;
+}
+
 static inline int ibmveth_rxq_frame_length(struct ibmveth_adapter *adapter)
 {
 	return be32_to_cpu(adapter->rx_queue.queue_addr[adapter->rx_queue.index].length);
@@ -1174,6 +1179,52 @@ static netdev_tx_t ibmveth_start_xmit(struct sk_buff *skb,
 	goto retry_bounce;
 }
 
+static void ibmveth_rx_mss_helper(struct sk_buff *skb, u16 mss, int lrg_pkt)
+{
+	struct tcphdr *tcph;
+	int offset = 0;
+	int hdr_len;
+
+	/* only TCP packets will be aggregated */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct iphdr *iph = (struct iphdr *)skb->data;
+
+		if (iph->protocol == IPPROTO_TCP) {
+			offset = iph->ihl * 4;
+			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+		} else {
+			return;
+		}
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *iph6 = (struct ipv6hdr *)skb->data;
+
+		if (iph6->nexthdr == IPPROTO_TCP) {
+			offset = sizeof(struct ipv6hdr);
+			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+		} else {
+			return;
+		}
+	} else {
+		return;
+	}
+	/* if mss is not set through Large Packet bit/mss in rx buffer,
+	 * expect that the mss will be written to the tcp header checksum.
+	 */
+	tcph = (struct tcphdr *)(skb->data + offset);
+	hdr_len = offset + tcph->doff * 4;
+	if (lrg_pkt) {
+		skb_shinfo(skb)->gso_size = mss;
+	} else if (offset) {
+		skb_shinfo(skb)->gso_size = ntohs(tcph->check);
+		tcph->check = 0;
+	}
+
+	if (skb_shinfo(skb)->gso_size)
+		skb_shinfo(skb)->gso_segs =
+				DIV_ROUND_UP(skb->len - hdr_len,
+					     skb_shinfo(skb)->gso_size);
+}
+
 static int ibmveth_poll(struct napi_struct *napi, int budget)
 {
 	struct ibmveth_adapter *adapter =
@@ -1182,6 +1233,7 @@ static int ibmveth_poll(struct napi_struct *napi, int budget)
 	int frames_processed = 0;
 	unsigned long lpar_rc;
 	struct iphdr *iph;
+	u16 mss = 0;
 
 restart_poll:
 	while (frames_processed < budget) {
@@ -1199,9 +1251,21 @@ static int ibmveth_poll(struct napi_struct *napi, int budget)
 			int length = ibmveth_rxq_frame_length(adapter);
 			int offset = ibmveth_rxq_frame_offset(adapter);
 			int csum_good = ibmveth_rxq_csum_good(adapter);
+			int lrg_pkt = ibmveth_rxq_large_packet(adapter);
 
 			skb = ibmveth_rxq_get_buffer(adapter);
 
+			/* if the large packet bit is set in the rx queue
+			 * descriptor, the mss will be written by PHYP eight
+			 * bytes from the start of the rx buffer, which is
+			 * skb->data at this stage
+			 */
+			if (lrg_pkt) {
+				__be64 *rxmss = (__be64 *)(skb->data + 8);
+
+				mss = (u16)be64_to_cpu(*rxmss);
+			}
+
 			new_skb = NULL;
 			if (length < rx_copybreak)
 				new_skb = netdev_alloc_skb(netdev, length);
@@ -1235,11 +1299,15 @@ static int ibmveth_poll(struct napi_struct *napi, int budget)
 					if (iph->check == 0xffff) {
 						iph->check = 0;
 						iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
-						adapter->rx_large_packets++;
 					}
 				}
 			}
 
+			if (length > netdev->mtu + ETH_HLEN) {
+				ibmveth_rx_mss_helper(skb, mss, lrg_pkt);
+				adapter->rx_large_packets++;
+			}
+
 			napi_gro_receive(napi, skb);	/* send it up */
 
 			netdev->stats.rx_packets++;
diff --git a/drivers/net/ethernet/ibm/ibmveth.h b/drivers/net/ethernet/ibm/ibmveth.h
index 4eade67..7acda04 100644
--- a/drivers/net/ethernet/ibm/ibmveth.h
+++ b/drivers/net/ethernet/ibm/ibmveth.h
@@ -209,6 +209,7 @@ struct ibmveth_rx_q_entry {
 #define IBMVETH_RXQ_TOGGLE		0x80000000
 #define IBMVETH_RXQ_TOGGLE_SHIFT	31
 #define IBMVETH_RXQ_VALID		0x40000000
+#define IBMVETH_RXQ_LRG_PKT		0x04000000
 #define IBMVETH_RXQ_NO_CSUM		0x02000000
 #define IBMVETH_RXQ_CSUM_GOOD		0x01000000
 #define IBMVETH_RXQ_OFF_MASK		0x0000FFFF
-- 
1.8.3.1

^ permalink raw reply related

* [iproute2 net-next 8/8] Introduce ip vrf command
From: David Ahern @ 2016-12-10 17:47 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481392069-3138-1-git-send-email-dsa@cumulusnetworks.com>

'ip vrf' follows the user semnatics established by 'ip netns'.

The 'ip vrf' subcommand supports 3 usages:

1. Run a command against a given vrf:
       ip vrf exec NAME CMD

   Uses the recently committed cgroup/sock BPF option. vrf directory
   is added to cgroup2 mount. Individual vrfs are created under it. BPF
   filter attached to vrf/NAME cgroup2 to set sk_bound_dev_if to the VRF
   device index. From there the current process (ip's pid) is addded to
   the cgroups.proc file and the given command is exected. In doing so
   all AF_INET/AF_INET6 (ipv4/ipv6) sockets are automatically bound to
   the VRF domain.

   The association is inherited parent to child allowing the command to
   be a shell from which other commands are run relative to the VRF.

2. Show the VRF a process is bound to:
       ip vrf id
   This command essentially looks at /proc/pid/cgroup for a "::/vrf/"
   entry with the VRF name following.

3. Show process ids bound to a VRF
       ip vrf pids NAME
   This command dumps the file MNT/vrf/NAME/cgroup.procs since that file
   shows the process ids in the particular vrf cgroup.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 ip/Makefile       |   3 +-
 ip/ip.c           |   4 +-
 ip/ip_common.h    |   2 +
 ip/ipvrf.c        | 289 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 man/man8/ip-vrf.8 |  88 +++++++++++++++++
 5 files changed, 384 insertions(+), 2 deletions(-)
 create mode 100644 ip/ipvrf.c
 create mode 100644 man/man8/ip-vrf.8

diff --git a/ip/Makefile b/ip/Makefile
index c8e6c6172741..1928489e7f90 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -7,7 +7,8 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o \
     link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \
     iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o \
-    iplink_geneve.o iplink_vrf.o iproute_lwtunnel.o ipmacsec.o ipila.o
+    iplink_geneve.o iplink_vrf.o iproute_lwtunnel.o ipmacsec.o ipila.o \
+    ipvrf.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/ip.c b/ip/ip.c
index cb3adcb3f57d..07050b07592a 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -51,7 +51,8 @@ static void usage(void)
 "       ip [ -force ] -batch filename\n"
 "where  OBJECT := { link | address | addrlabel | route | rule | neigh | ntable |\n"
 "                   tunnel | tuntap | maddress | mroute | mrule | monitor | xfrm |\n"
-"                   netns | l2tp | fou | macsec | tcp_metrics | token | netconf | ila }\n"
+"                   netns | l2tp | fou | macsec | tcp_metrics | token | netconf | ila |\n"
+"                   vrf }\n"
 "       OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n"
 "                    -h[uman-readable] | -iec |\n"
 "                    -f[amily] { inet | inet6 | ipx | dnet | mpls | bridge | link } |\n"
@@ -99,6 +100,7 @@ static const struct cmd {
 	{ "mrule",	do_multirule },
 	{ "netns",	do_netns },
 	{ "netconf",	do_ipnetconf },
+	{ "vrf",	do_ipvrf},
 	{ "help",	do_help },
 	{ 0 }
 };
diff --git a/ip/ip_common.h b/ip/ip_common.h
index 3162f1ca5b2c..28763e81e4a4 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -57,6 +57,8 @@ extern int do_ipila(int argc, char **argv);
 int do_tcp_metrics(int argc, char **argv);
 int do_ipnetconf(int argc, char **argv);
 int do_iptoken(int argc, char **argv);
+int do_ipvrf(int argc, char **argv);
+
 int iplink_get(unsigned int flags, char *name, __u32 filt_mask);
 
 static inline int rtm_get_table(struct rtmsg *r, struct rtattr **tb)
diff --git a/ip/ipvrf.c b/ip/ipvrf.c
new file mode 100644
index 000000000000..c4f0e53532e2
--- /dev/null
+++ b/ip/ipvrf.c
@@ -0,0 +1,289 @@
+/*
+ * ipvrf.c	"ip vrf"
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	David Ahern <dsa@cumulusnetworks.com>
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/mount.h>
+#include <linux/bpf.h>
+#include <linux/if.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "ip_common.h"
+#include "libbpf.h"
+#include "bpf_util.h"
+
+#define CGRP_PROC_FILE  "/cgroup.procs"
+
+static void usage(void)
+{
+	fprintf(stderr, "Usage: ip vrf exec [NAME] cmd ...\n");
+	fprintf(stderr, "       ip vrf identify [PID]\n");
+	fprintf(stderr, "       ip vrf pids [NAME]\n");
+
+	exit(-1);
+}
+
+static int ipvrf_identify(int argc, char **argv)
+{
+	char path[PATH_MAX];
+	char buf[4096];
+	char *vrf, *end;
+	int fd, rc = -1;
+	unsigned int pid;
+	ssize_t n;
+
+	if (argc < 1)
+		pid = getpid();
+	else if (argc > 1)
+		invarg("Extra arguments specified\n", argv[1]);
+	else if (get_unsigned(&pid, argv[0], 10))
+		invarg("Invalid pid\n", argv[0]);
+
+	snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr,
+			"Failed to open cgroups file: %s\n", strerror(errno));
+		return -1;
+	}
+
+	n = read(fd, buf, sizeof(buf) - 1);
+	if (n < 0) {
+		fprintf(stderr,
+			"Failed to read cgroups file: %s\n", strerror(errno));
+		goto out;
+	}
+	buf[n] = '\0';
+	vrf = strstr(buf, "::/vrf/");
+	if (vrf) {
+		vrf += 7;  /* skip past "::/vrf/" */
+		end = strchr(vrf, '\n');
+		if (end)
+			*end = '\0';
+
+		printf("%s\n", vrf);
+	}
+
+	rc = 0;
+out:
+	close(fd);
+
+	return rc;
+}
+
+static int ipvrf_pids(int argc, char **argv)
+{
+	char path[PATH_MAX];
+	char buf[4096];
+	char *mnt, *vrf;
+	int fd, rc = -1;
+	ssize_t n;
+
+	if (argc != 1) {
+		fprintf(stderr, "Invalid arguments\n");
+		return -1;
+	}
+
+	vrf = argv[0];
+
+	mnt = find_cgroup2_mount();
+	if (!mnt)
+		return -1;
+
+	snprintf(path, sizeof(path), "%s/vrf/%s%s", mnt, vrf, CGRP_PROC_FILE);
+	free(mnt);
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return 0; /* no cgroup file, nothing to show */
+
+	while (1) {
+		n = read(fd, buf, sizeof(buf) - 1);
+		if (n < 0) {
+			fprintf(stderr,
+				"Failed to read cgroups file: %s\n", strerror(errno));
+			break;
+		} else if (n == 0) {
+			rc = 0;
+			break;
+		}
+		printf("%s", buf);
+	}
+
+	close(fd);
+
+	return rc;
+}
+
+/* load BPF program to set sk_bound_dev_if for sockets */
+static char bpf_log_buf[256*1024];
+
+static int prog_load(int idx)
+{
+	struct bpf_insn prog[] = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+		BPF_MOV64_IMM(BPF_REG_3, idx),
+		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)),
+		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)),
+		BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
+		BPF_EXIT_INSN(),
+	};
+
+	return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, prog, sizeof(prog),
+			     "GPL", bpf_log_buf, sizeof(bpf_log_buf));
+}
+
+static int vrf_configure_cgroup(const char *path, int ifindex)
+{
+	int rc = -1, cg_fd, prog_fd = -1;
+
+	cg_fd = open(path, O_DIRECTORY | O_RDONLY);
+	if (cg_fd < 0) {
+		fprintf(stderr, "Failed to open cgroup path: '%s'\n", strerror(errno));
+		goto out;
+	}
+
+	/*
+	 * Load bpf program into kernel and attach to cgroup to affect
+	 * socket creates
+	 */
+	prog_fd = prog_load(ifindex);
+	if (prog_fd < 0) {
+		printf("Failed to load BPF prog: '%s'\n", strerror(errno));
+		goto out;
+	}
+
+	if (bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE)) {
+		fprintf(stderr, "Failed to attach prog to cgroup: '%s'\n",
+			strerror(errno));
+			fprintf(stderr, "Kernel compiled with CGROUP_BPF enabled?\n");
+		goto out;
+	}
+
+	rc = 0;
+out:
+	close(cg_fd);
+	close(prog_fd);
+
+	return rc;
+}
+
+static int vrf_switch(const char *name)
+{
+	char path[PATH_MAX], *mnt, pid[16];
+	int ifindex = name_is_vrf(name);
+	bool default_vrf = false;
+	int rc = -1, len, fd = -1;
+
+	if (!ifindex) {
+		if (strcmp(name, "default")) {
+			fprintf(stderr, "Invalid VRF name\n");
+			return -1;
+		}
+		default_vrf = true;
+	}
+
+	mnt = find_cgroup2_mount();
+	if (!mnt)
+		return -1;
+
+	/* path to cgroup; make sure buffer has room to cat "/cgroup.procs"
+	 * to the end of the path
+	 */
+	len = snprintf(path, sizeof(path) - sizeof(CGRP_PROC_FILE), "%s%s/%s",
+		       mnt, default_vrf ? "" : "/vrf", name);
+	if (len > sizeof(path) - sizeof(CGRP_PROC_FILE)) {
+		fprintf(stderr, "Invalid path to cgroup2 mount\n");
+		goto out;
+	}
+
+	if (make_path(path, 0755)) {
+		fprintf(stderr, "Failed to setup vrf cgroup2 directory\n");
+		goto out;
+	}
+
+	if (!default_vrf && vrf_configure_cgroup(path, ifindex))
+		goto out;
+
+	/*
+	 * write pid to cgroup.procs making process part of cgroup
+	 */
+	strcat(path, CGRP_PROC_FILE);
+	fd = open(path, O_RDWR | O_APPEND);
+	if (fd < 0) {
+		fprintf(stderr, "cgroups.procs file does not exist.\n");
+		goto out;
+	}
+
+	snprintf(pid, sizeof(pid), "%d", getpid());
+	if (write(fd, pid, strlen(pid)) < 0) {
+		fprintf(stderr, "Failed to join cgroup\n");
+		goto out;
+	}
+
+	rc = 0;
+out:
+	free(mnt);
+	close(fd);
+
+	return rc;
+}
+
+static int ipvrf_exec(int argc, char **argv)
+{
+	if (argc < 1) {
+		fprintf(stderr, "No VRF name specified\n");
+		return -1;
+	}
+	if (argc < 2) {
+		fprintf(stderr, "No command specified\n");
+		return -1;
+	}
+
+	if (vrf_switch(argv[0]))
+		return -1;
+
+	return -cmd_exec(argv[1], argv + 1, !!batch_mode);
+}
+
+int do_ipvrf(int argc, char **argv)
+{
+	if (argc == 0) {
+		fprintf(stderr, "No command given. Try \"ip vrf help\".\n");
+		exit(-1);
+	}
+
+	if (matches(*argv, "identify") == 0)
+		return ipvrf_identify(argc-1, argv+1);
+
+	if (matches(*argv, "pids") == 0)
+		return ipvrf_pids(argc-1, argv+1);
+
+	if (matches(*argv, "exec") == 0)
+		return ipvrf_exec(argc-1, argv+1);
+
+	if (matches(*argv, "help") == 0)
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip vrf help\".\n",
+		*argv);
+
+	exit(-1);
+}
diff --git a/man/man8/ip-vrf.8 b/man/man8/ip-vrf.8
new file mode 100644
index 000000000000..57a7c7692ce8
--- /dev/null
+++ b/man/man8/ip-vrf.8
@@ -0,0 +1,88 @@
+.TH IP\-VRF 8 "7 Dec 2016" "iproute2" "Linux"
+.SH NAME
+ip-vrf \- run a command against a vrf
+.SH SYNOPSIS
+.sp
+.ad l
+.in +8
+.ti -8
+.B ip
+.B vrf
+.RI  " { " COMMAND " | "
+.BR help " }"
+.sp
+
+.ti -8
+.BR "ip vrf identify"
+.RI "[ " PID " ]"
+
+.ti -8
+.BR "ip vrf pids"
+.I NAME
+
+.ti -8
+.BR "ip vrf exec "
+.RI "[ " NAME " ] " command ...
+
+.SH DESCRIPTION
+A VRF provides traffic isolation at layer 3 for routing, similar to how a
+VLAN is used to isolate traffic at layer 2. Fundamentally, a VRF is a separate
+routing table. Network devices are associated with a VRF by enslaving the
+device to the VRF. At that point network addresses assigned to the device are
+local to the VRF with host and connected routes moved to the table associated
+with the VRF.
+
+A process can specify a VRF using several APIs -- binding the socket to the
+VRF device using SO_BINDTODEVICE, setting the VRF association using
+IP_UNICAST_IF or IPV6_UNICAST_IF, or specifying the VRF for a specific message
+using IP_PKTINFO or IPV6_PKTINFO.
+
+By default a process is not bound to any VRF. An association can be set
+explicitly by making the program use one of the APIs mentioned above or
+implicitly using a helper to set SO_BINDTODEVICE for all IPv4 and IPv6
+sockets (AF_INET and AF_INET6) when the socket is created. This ip-vrf command
+is a helper to run a command against a specific VRF with the VRF association
+inherited parent to child.
+
+.TP
+.B ip vrf exec [ NAME ] cmd ... - Run cmd against the named VRF
+.sp
+This command allows applications that are VRF unaware to be run against
+a VRF other than the default VRF (main table). A command can be run against
+the default VRF by passing the "default" as the VRF name. This is useful if
+the current shell is associated with another VRF (e.g, Management VRF).
+
+.TP
+.B ip vrf identify [PID] - Report VRF association for process
+.sp
+This command shows the VRF association of the specified process. If PID is
+not specified then the id of the current process is used.
+
+.TP
+.B ip vrf pids NAME - Report processes associated with the named VRF
+.sp
+This command shows all process ids that are associated with the given
+VRF.
+
+.SH CAVEATS
+This command requires a kernel compiled with CGROUPS and CGROUP_BPF enabled.
+
+The VRF helper *only* affects network layer sockets.
+
+.SH EXAMPLES
+.PP
+ip vrf exec red ssh 10.100.1.254
+.RS
+Executes ssh to 10.100.1.254 against the VRF red table.
+.RE
+
+.SH SEE ALSO
+.br
+.BR ip (8),
+.BR ip-link (8),
+.BR ip-address (8),
+.BR ip-route (8),
+.BR ip-neighbor (8)
+
+.SH AUTHOR
+Original Manpage by David Ahern
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 7/8] libnetlink: Add variant of rtnl_talk that does not display RTNETLINK answers error
From: David Ahern @ 2016-12-10 17:47 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481392069-3138-1-git-send-email-dsa@cumulusnetworks.com>

iplink_vrf has 2 functions used to validate a user given device name is
a VRF device and to return the table id. If the user string is not a
device name ip commands with a vrf keyword show a confusing error
message: "RTNETLINK answers: No such device".

Add a variant of rtnl_talk that does not display the "RTNETLINK answers"
message and update iplink_vrf to use it.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/libnetlink.h |  3 +++
 ip/iplink_vrf.c      | 14 +++++++++++---
 lib/libnetlink.c     | 20 +++++++++++++++++---
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index 751ebf186dd4..bd0267dfcc02 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -81,6 +81,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	      struct nlmsghdr *answer, size_t len)
 	__attribute__((warn_unused_result));
+int rtnl_talk_suppress_rtnl_errmsg(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+				   struct nlmsghdr *answer, size_t len)
+	__attribute__((warn_unused_result));
 int rtnl_send(struct rtnl_handle *rth, const void *buf, int)
 	__attribute__((warn_unused_result));
 int rtnl_send_check(struct rtnl_handle *rth, const void *buf, int)
diff --git a/ip/iplink_vrf.c b/ip/iplink_vrf.c
index c101ed770f87..917630e85337 100644
--- a/ip/iplink_vrf.c
+++ b/ip/iplink_vrf.c
@@ -13,6 +13,7 @@
 #include <string.h>
 #include <sys/socket.h>
 #include <linux/if_link.h>
+#include <errno.h>
 
 #include "rt_names.h"
 #include "utils.h"
@@ -126,8 +127,14 @@ __u32 ipvrf_get_table(const char *name)
 
 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, strlen(name) + 1);
 
-	if (rtnl_talk(&rth, &req.n, &answer.n, sizeof(answer)) < 0)
-		return 0;
+	if (rtnl_talk_suppress_rtnl_errmsg(&rth, &req.n,
+					   &answer.n, sizeof(answer)) < 0) {
+		/* special case "default" vrf to be the main table */
+		if (errno == ENODEV && !strcmp(name, "default"))
+			rtnl_rttable_a2n(&tb_id, "main");
+
+		return tb_id;
+	}
 
 	ifi = NLMSG_DATA(&answer.n);
 	len = answer.n.nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
@@ -186,7 +193,8 @@ int name_is_vrf(const char *name)
 
 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, strlen(name) + 1);
 
-	if (rtnl_talk(&rth, &req.n, &answer.n, sizeof(answer)) < 0)
+	if (rtnl_talk_suppress_rtnl_errmsg(&rth, &req.n,
+					   &answer.n, sizeof(answer)) < 0)
 		return 0;
 
 	ifi = NLMSG_DATA(&answer.n);
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index a5db168e50eb..9d7e89aebbd0 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -12,6 +12,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdbool.h>
 #include <unistd.h>
 #include <syslog.h>
 #include <fcntl.h>
@@ -397,8 +398,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 	return rtnl_dump_filter_l(rth, a);
 }
 
-int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-	      struct nlmsghdr *answer, size_t maxlen)
+static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+		       struct nlmsghdr *answer, size_t maxlen,
+		       bool show_rtnl_err)
 {
 	int status;
 	unsigned int seq;
@@ -485,7 +487,7 @@ int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 					return 0;
 				}
 
-				if (rtnl->proto != NETLINK_SOCK_DIAG)
+				if (rtnl->proto != NETLINK_SOCK_DIAG && show_rtnl_err)
 					fprintf(stderr,
 						"RTNETLINK answers: %s\n",
 						strerror(-err->error));
@@ -517,6 +519,18 @@ int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	}
 }
 
+int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+	      struct nlmsghdr *answer, size_t maxlen)
+{
+	return __rtnl_talk(rtnl, n, answer, maxlen, true);
+}
+
+int rtnl_talk_suppress_rtnl_errmsg(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+				   struct nlmsghdr *answer, size_t maxlen)
+{
+	return __rtnl_talk(rtnl, n, answer, maxlen, false);
+}
+
 int rtnl_listen_all_nsid(struct rtnl_handle *rth)
 {
 	unsigned int on = 1;
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 5/8] Add filesystem APIs to lib
From: David Ahern @ 2016-12-10 17:47 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481392069-3138-1-git-send-email-dsa@cumulusnetworks.com>

Add make_path to recursively call mkdir as needed to create a given
path with the given mode.

Add find_cgroup2_mount to lookup path where cgroup2 is mounted. If it
is not already mounted, cgroup2 is mounted under /var/run/cgroup2 for
use by iproute2.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/utils.h |   2 +
 lib/Makefile    |   2 +-
 lib/fs.c        | 143 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 lib/fs.c

diff --git a/include/utils.h b/include/utils.h
index ac4517a3bde1..dc1d6b9607dd 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -257,5 +257,7 @@ int get_guid(__u64 *guid, const char *arg);
 int get_real_family(int rtm_type, int rtm_family);
 
 int cmd_exec(const char *cmd, char **argv, bool do_fork);
+int make_path(const char *path, mode_t mode);
+char *find_cgroup2_mount(void);
 
 #endif /* __UTILS_H__ */
diff --git a/lib/Makefile b/lib/Makefile
index 749073261c49..0c57662b4f8f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -8,7 +8,7 @@ CFLAGS += -fPIC
 
 UTILOBJ = utils.o rt_names.o ll_types.o ll_proto.o ll_addr.o \
 	inet_proto.o namespace.o json_writer.o \
-	names.o color.o bpf.o exec.o
+	names.o color.o bpf.o exec.o fs.o
 
 NLOBJ=libgenl.o ll_map.o libnetlink.o
 
diff --git a/lib/fs.c b/lib/fs.c
new file mode 100644
index 000000000000..39cc96dccca9
--- /dev/null
+++ b/lib/fs.c
@@ -0,0 +1,143 @@
+/*
+ * fs.c         filesystem APIs
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	David Ahern <dsa@cumulusnetworks.com>
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/mount.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+
+#include "utils.h"
+
+#define CGROUP2_FS_NAME "cgroup2"
+
+/* if not already mounted cgroup2 is mounted here for iproute2's use */
+#define MNT_CGRP2_PATH  "/var/run/cgroup2"
+
+/* return mount path of first occurrence of given fstype */
+static char *find_fs_mount(const char *fs_to_find)
+{
+	char path[4096];
+	char fstype[128];    /* max length of any filesystem name */
+	char *mnt = NULL;
+	FILE *fp;
+
+	fp = fopen("/proc/mounts", "r");
+	if (!fp) {
+		fprintf(stderr,
+			"Failed to open mounts file: %s\n", strerror(errno));
+		return NULL;
+	}
+
+	while (fscanf(fp, "%*s %4096s %127s %*s %*d %*d\n",
+		      path, fstype) == 2) {
+		if (strcmp(fstype, fs_to_find) == 0) {
+			mnt = strdup(path);
+			break;
+		}
+	}
+
+	fclose(fp);
+
+	return mnt;
+}
+
+/* caller needs to free string returned */
+char *find_cgroup2_mount(void)
+{
+	char *mnt = find_fs_mount(CGROUP2_FS_NAME);
+
+	if (mnt)
+		return mnt;
+
+	mnt = strdup(MNT_CGRP2_PATH);
+	if (!mnt) {
+		fprintf(stderr, "Failed to allocate memory for cgroup2 path\n");
+		return NULL;
+
+	}
+
+	if (make_path(mnt, 0755)) {
+		fprintf(stderr, "Failed to setup vrf cgroup2 directory\n");
+		free(mnt);
+		return NULL;
+	}
+
+	if (mount("none", mnt, CGROUP2_FS_NAME, 0, NULL)) {
+		/* EBUSY means already mounted */
+		if (errno != EBUSY) {
+			fprintf(stderr,
+				"Failed to mount cgroup2. Are CGROUPS enabled in your kernel?\n");
+			free(mnt);
+			return NULL;
+		}
+	}
+	return mnt;
+}
+
+int make_path(const char *path, mode_t mode)
+{
+	char *dir, *delim;
+	struct stat sbuf;
+	int rc = -1;
+
+	delim = dir = strdup(path);
+	if (dir == NULL) {
+		fprintf(stderr, "strdup failed copying path");
+		return -1;
+	}
+
+	/* skip '/' -- it had better exist */
+	if (*delim == '/')
+		delim++;
+
+	while (1) {
+		delim = strchr(delim, '/');
+		if (delim)
+			*delim = '\0';
+
+		if (stat(dir, &sbuf) != 0) {
+			if (errno != ENOENT) {
+				fprintf(stderr,
+					"stat failed for %s: %s\n",
+					dir, strerror(errno));
+				goto out;
+			}
+
+			if (mkdir(dir, mode) != 0) {
+				fprintf(stderr,
+					"mkdir failed for %s: %s",
+					dir, strerror(errno));
+				goto out;
+			}
+		}
+
+		if (delim == NULL)
+			break;
+
+		*delim = '/';
+		delim++;
+		if (*delim == '\0')
+			break;
+	}
+	rc = 0;
+out:
+	free(dir);
+
+	return rc;
+}
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 6/8] change name_is_vrf to return index
From: David Ahern @ 2016-12-10 17:47 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481392069-3138-1-git-send-email-dsa@cumulusnetworks.com>

index of 0 means name is not a valid vrf.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 ip/ip_common.h  |  2 +-
 ip/iplink_vrf.c | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/ip/ip_common.h b/ip/ip_common.h
index 0147f45a7a31..3162f1ca5b2c 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -91,7 +91,7 @@ struct link_util *get_link_kind(const char *kind);
 void br_dump_bridge_id(const struct ifla_bridge_id *id, char *buf, size_t len);
 
 __u32 ipvrf_get_table(const char *name);
-bool name_is_vrf(const char *name);
+int name_is_vrf(const char *name);
 
 #ifndef	INFINITY_LIFE_TIME
 #define     INFINITY_LIFE_TIME      0xFFFFFFFFU
diff --git a/ip/iplink_vrf.c b/ip/iplink_vrf.c
index a238b2906805..c101ed770f87 100644
--- a/ip/iplink_vrf.c
+++ b/ip/iplink_vrf.c
@@ -159,7 +159,7 @@ __u32 ipvrf_get_table(const char *name)
 	return tb_id;
 }
 
-bool name_is_vrf(const char *name)
+int name_is_vrf(const char *name)
 {
 	struct {
 		struct nlmsghdr		n;
@@ -187,24 +187,27 @@ bool name_is_vrf(const char *name)
 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, strlen(name) + 1);
 
 	if (rtnl_talk(&rth, &req.n, &answer.n, sizeof(answer)) < 0)
-		return false;
+		return 0;
 
 	ifi = NLMSG_DATA(&answer.n);
 	len = answer.n.nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
 	if (len < 0) {
 		fprintf(stderr, "BUG: Invalid response to link query.\n");
-		return false;
+		return 0;
 	}
 
 	parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
 
 	if (!tb[IFLA_LINKINFO])
-		return false;
+		return 0;
 
 	parse_rtattr_nested(li, IFLA_INFO_MAX, tb[IFLA_LINKINFO]);
 
 	if (!li[IFLA_INFO_KIND])
-		return false;
+		return 0;
+
+	if (strcmp(RTA_DATA(li[IFLA_INFO_KIND]), "vrf"))
+		return 0;
 
-	return strcmp(RTA_DATA(li[IFLA_INFO_KIND]), "vrf") == 0;
+	return ifi->ifi_index;
 }
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 4/8] move cmd_exec to lib utils
From: David Ahern @ 2016-12-10 17:47 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481392069-3138-1-git-send-email-dsa@cumulusnetworks.com>

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/utils.h |  2 ++
 ip/ipnetns.c    | 34 ----------------------------------
 lib/Makefile    |  2 +-
 lib/exec.c      | 41 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 44 insertions(+), 35 deletions(-)
 create mode 100644 lib/exec.c

diff --git a/include/utils.h b/include/utils.h
index 26c970daa5d0..ac4517a3bde1 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -256,4 +256,6 @@ char *int_to_str(int val, char *buf);
 int get_guid(__u64 *guid, const char *arg);
 int get_real_family(int rtm_type, int rtm_family);
 
+int cmd_exec(const char *cmd, char **argv, bool do_fork);
+
 #endif /* __UTILS_H__ */
diff --git a/ip/ipnetns.c b/ip/ipnetns.c
index bd1e9013706c..db9a541769f1 100644
--- a/ip/ipnetns.c
+++ b/ip/ipnetns.c
@@ -357,40 +357,6 @@ static int netns_list(int argc, char **argv)
 	return 0;
 }
 
-static int cmd_exec(const char *cmd, char **argv, bool do_fork)
-{
-	fflush(stdout);
-	if (do_fork) {
-		int status;
-		pid_t pid;
-
-		pid = fork();
-		if (pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-
-		if (pid != 0) {
-			/* Parent  */
-			if (waitpid(pid, &status, 0) < 0) {
-				perror("waitpid");
-				exit(1);
-			}
-
-			if (WIFEXITED(status)) {
-				return WEXITSTATUS(status);
-			}
-
-			exit(1);
-		}
-	}
-
-	if (execvp(cmd, argv)  < 0)
-		fprintf(stderr, "exec of \"%s\" failed: %s\n",
-				cmd, strerror(errno));
-	_exit(1);
-}
-
 static int on_netns_exec(char *nsname, void *arg)
 {
 	char **argv = arg;
diff --git a/lib/Makefile b/lib/Makefile
index 5b7ec169048a..749073261c49 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -8,7 +8,7 @@ CFLAGS += -fPIC
 
 UTILOBJ = utils.o rt_names.o ll_types.o ll_proto.o ll_addr.o \
 	inet_proto.o namespace.o json_writer.o \
-	names.o color.o bpf.o
+	names.o color.o bpf.o exec.o
 
 NLOBJ=libgenl.o ll_map.o libnetlink.o
 
diff --git a/lib/exec.c b/lib/exec.c
new file mode 100644
index 000000000000..96edbc422e84
--- /dev/null
+++ b/lib/exec.c
@@ -0,0 +1,41 @@
+#define _ATFILE_SOURCE
+#include <sys/wait.h>
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+int cmd_exec(const char *cmd, char **argv, bool do_fork)
+{
+	fflush(stdout);
+	if (do_fork) {
+		int status;
+		pid_t pid;
+
+		pid = fork();
+		if (pid < 0) {
+			perror("fork");
+			exit(1);
+		}
+
+		if (pid != 0) {
+			/* Parent  */
+			if (waitpid(pid, &status, 0) < 0) {
+				perror("waitpid");
+				exit(1);
+			}
+
+			if (WIFEXITED(status)) {
+				return WEXITSTATUS(status);
+			}
+
+			exit(1);
+		}
+	}
+
+	if (execvp(cmd, argv)  < 0)
+		fprintf(stderr, "exec of \"%s\" failed: %s\n",
+				cmd, strerror(errno));
+	_exit(1);
+}
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 2/8] bpf: export bpf_prog_load
From: David Ahern @ 2016-12-10 17:47 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481392069-3138-1-git-send-email-dsa@cumulusnetworks.com>

Code move only; no functional change intended.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/bpf_util.h |  3 +++
 lib/bpf.c          | 40 ++++++++++++++++++++--------------------
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/include/bpf_util.h b/include/bpf_util.h
index 49b96bbc208f..dcbdca6978d6 100644
--- a/include/bpf_util.h
+++ b/include/bpf_util.h
@@ -75,6 +75,9 @@ int bpf_trace_pipe(void);
 
 void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len);
 
+int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns,
+		  size_t size_insns, const char *license, char *log,
+		  size_t size_log);
 int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type);
 int bpf_prog_detach(int target_fd, enum bpf_attach_type type);
 
diff --git a/lib/bpf.c b/lib/bpf.c
index 103fc1ef0593..b04c3a678b9c 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -871,6 +871,26 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
 	return bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
 }
 
+int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns,
+		  size_t size_insns, const char *license, char *log,
+		  size_t size_log)
+{
+	union bpf_attr attr = {};
+
+	attr.prog_type = type;
+	attr.insns = bpf_ptr_to_u64(insns);
+	attr.insn_cnt = size_insns / sizeof(struct bpf_insn);
+	attr.license = bpf_ptr_to_u64(license);
+
+	if (size_log > 0) {
+		attr.log_buf = bpf_ptr_to_u64(log);
+		attr.log_size = size_log;
+		attr.log_level = 1;
+	}
+
+	return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+}
+
 #ifdef HAVE_ELF
 struct bpf_elf_prog {
 	enum bpf_prog_type	type;
@@ -988,26 +1008,6 @@ static int bpf_map_create(enum bpf_map_type type, uint32_t size_key,
 	return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 }
 
-static int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns,
-			 size_t size_insns, const char *license, char *log,
-			 size_t size_log)
-{
-	union bpf_attr attr = {};
-
-	attr.prog_type = type;
-	attr.insns = bpf_ptr_to_u64(insns);
-	attr.insn_cnt = size_insns / sizeof(struct bpf_insn);
-	attr.license = bpf_ptr_to_u64(license);
-
-	if (size_log > 0) {
-		attr.log_buf = bpf_ptr_to_u64(log);
-		attr.log_size = size_log;
-		attr.log_level = 1;
-	}
-
-	return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
-}
-
 static int bpf_obj_pin(int fd, const char *pathname)
 {
 	union bpf_attr attr = {};
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 1/8] lib bpf: Add support for BPF_PROG_ATTACH and BPF_PROG_DETACH
From: David Ahern @ 2016-12-10 17:47 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481392069-3138-1-git-send-email-dsa@cumulusnetworks.com>

For consistency with other bpf commands, the functions are named
bpf_prog_attach and bpf_prog_detach. The existing bpf_prog_attach is
renamed to bpf_prog_load_and_report since it calls bpf_prog_load and
bpf_prog_report.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/bpf_util.h |  3 +++
 lib/bpf.c          | 31 ++++++++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/include/bpf_util.h b/include/bpf_util.h
index 05baeecda57f..49b96bbc208f 100644
--- a/include/bpf_util.h
+++ b/include/bpf_util.h
@@ -75,6 +75,9 @@ int bpf_trace_pipe(void);
 
 void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len);
 
+int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type);
+int bpf_prog_detach(int target_fd, enum bpf_attach_type type);
+
 #ifdef HAVE_ELF
 int bpf_send_map_fds(const char *path, const char *obj);
 int bpf_recv_map_fds(const char *path, int *fds, struct bpf_map_aux *aux,
diff --git a/lib/bpf.c b/lib/bpf.c
index 2a8cd51d4dae..103fc1ef0593 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -850,6 +850,27 @@ int bpf_graft_map(const char *map_path, uint32_t *key, int argc, char **argv)
 	return ret;
 }
 
+int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type)
+{
+	union bpf_attr attr = {
+		.target_fd = target_fd,
+		.attach_bpf_fd = prog_fd,
+		.attach_type = type,
+	};
+
+	return bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
+}
+
+int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
+{
+	union bpf_attr attr = {
+		.target_fd = target_fd,
+		.attach_type = type,
+	};
+
+	return bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
+}
+
 #ifdef HAVE_ELF
 struct bpf_elf_prog {
 	enum bpf_prog_type	type;
@@ -1262,9 +1283,9 @@ static void bpf_prog_report(int fd, const char *section,
 	bpf_dump_error(ctx, "Verifier analysis:\n\n");
 }
 
-static int bpf_prog_attach(const char *section,
-			   const struct bpf_elf_prog *prog,
-			   struct bpf_elf_ctx *ctx)
+static int bpf_prog_load_and_report(const char *section,
+				    const struct bpf_elf_prog *prog,
+				    struct bpf_elf_ctx *ctx)
 {
 	int tries = 0, fd;
 retry:
@@ -1656,7 +1677,7 @@ static int bpf_fetch_prog(struct bpf_elf_ctx *ctx, const char *section,
 		prog.size    = data.sec_data->d_size;
 		prog.license = ctx->license;
 
-		fd = bpf_prog_attach(section, &prog, ctx);
+		fd = bpf_prog_load_and_report(section, &prog, ctx);
 		if (fd < 0)
 			return fd;
 
@@ -1755,7 +1776,7 @@ static int bpf_fetch_prog_relo(struct bpf_elf_ctx *ctx, const char *section,
 		prog.size    = data_insn.sec_data->d_size;
 		prog.license = ctx->license;
 
-		fd = bpf_prog_attach(section, &prog, ctx);
+		fd = bpf_prog_load_and_report(section, &prog, ctx);
 		if (fd < 0) {
 			*lderr = true;
 			return fd;
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 0/8] Add support for vrf helper
From: David Ahern @ 2016-12-10 17:47 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern

This series adds support to iproute2 to run a command against a specific
VRF. The user semnatics are similar to 'ip netns'.

The 'ip vrf' subcommand supports 3 usages:

1. Run a command against a given vrf:
       ip vrf exec NAME CMD

   Uses the recently committed cgroup/sock BPF option. vrf directory
   is added to cgroup2 mount. Individual vrfs are created under it. BPF
   filter is attached to vrf/NAME cgroup2 to set sk_bound_dev_if to the
   device index of the VRF. From there the current process (ip's pid) is
   addded to the cgroups.proc file and the given command is exected. In
   doing so all AF_INET/AF_INET6 (ipv4/ipv6) sockets are automatically
   bound to the VRF domain.

   The association is inherited parent to child allowing the command to
   be a shell from which other commands are run relative to the VRF.

2. Show the VRF a process is bound to:
       ip vrf id
   This command essentially looks at /proc/pid/cgroup for a "::/vrf/"
   entry.

3. Show process ids bound to a VRF
       ip vrf pids NAME
   This command dumps the file MNT/vrf/NAME/cgroup.procs since that file
   shows the process ids in the particular vrf cgroup.

David Ahern (8):
  lib bpf: Add support for BPF_PROG_ATTACH and BPF_PROG_DETACH
  bpf: export bpf_prog_load
  Add libbpf.h header with BPF_XXXX macros
  move cmd_exec to lib utils
  Add filesystem APIs to lib
  change name_is_vrf to return index
  libnetlink: Add variant of rtnl_talk that does not display RTNETLINK
    answers error
  Introduce ip vrf command

 include/bpf_util.h   |   6 ++
 include/libbpf.h     | 184 ++++++++++++++++++++++++++++++++
 include/libnetlink.h |   3 +
 include/utils.h      |   4 +
 ip/Makefile          |   3 +-
 ip/ip.c              |   4 +-
 ip/ip_common.h       |   4 +-
 ip/iplink_vrf.c      |  29 ++++--
 ip/ipnetns.c         |  34 ------
 ip/ipvrf.c           | 289 +++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/Makefile         |   2 +-
 lib/bpf.c            |  71 ++++++++-----
 lib/exec.c           |  41 ++++++++
 lib/fs.c             | 143 +++++++++++++++++++++++++
 lib/libnetlink.c     |  20 +++-
 man/man8/ip-vrf.8    |  88 ++++++++++++++++
 16 files changed, 850 insertions(+), 75 deletions(-)
 create mode 100644 include/libbpf.h
 create mode 100644 ip/ipvrf.c
 create mode 100644 lib/exec.c
 create mode 100644 lib/fs.c
 create mode 100644 man/man8/ip-vrf.8

-- 
2.1.4

^ permalink raw reply

* Re: [PATCH/RFC net-next] net: fec: allow "mini jumbo" frames
From: Vivien Didelot @ 2016-12-10 16:55 UTC (permalink / raw)
  To: Nikita Yushchenko, Fugang Duan, David S. Miller, Troy Kisky,
	Florian Fainelli, Andrew Lunn, Eric Nelson, Philippe Reynes,
	Johannes Berg, netdev
  Cc: Chris Healy, Fabio Estevam, linux-kernel, Nikita Yushchenko
In-Reply-To: <1481275255-7650-1-git-send-email-nikita.yoush@cogentembedded.com>

Hi Nikita,

Nikita Yushchenko <nikita.yoush@cogentembedded.com> writes:

> This adds support for MTU slightly larger than default, on modern
> FEC flavours.
>
> Currently FEC driver uses single hardware Rx buffer per frame. On most
> FEC flavours, size of single buffer is limited by 11-bit field, and
> has to be multiple of 64 (in the worst case). Thus maximum usable Rx
> buffer size is 1984 bytes.
>
> Of those:
> - 2 bytes are used for IP header alignment,
> - 14 bytes are used by ethhdr,
> - up to 8 bytes are needed for VLAN and/or DSA tags,
> - 4 bytes are needed for CRC.
>
> Thus maximum MTU possible within current RX architecture is 1956.
>
> This patch allows exactly that. For further increase, Rx architecture
> change is needed.
>
> Use of MTU=1956 gives about 1.5% throughput improvement between two Vybrid
> boards, compared to default MTU=1500.
>
> Signed-off-by: Nikita Yushchenko <nikita.yoush@cogentembedded.com>

For what it's worth, I have tested your patch on my ZII Rev B boards
(see vf610-zii-dev-rev-b.dts) which have a FEC as the master net device
of their DSA trees. They still work as expected.

Tested-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>

Thanks,

        Vivien

^ permalink raw reply

* Re: [PATCH net-next] netfilter: nft_counter: rework atomic dump and reset
From: Eric Dumazet @ 2016-12-10 15:40 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, arnd, netdev
In-Reply-To: <20161210142501.GA25221@salvia>

On Sat, 2016-12-10 at 15:25 +0100, Pablo Neira Ayuso wrote:
> On Sat, Dec 10, 2016 at 03:16:55PM +0100, Pablo Neira Ayuso wrote:
=
>  
> -       nft_counter_fetch(priv, &total, reset);
> +       nft_counter_fetch(priv, &total);
> +       if (reset)
> +               nft_counter_reset(priv, &total);
>  
>         if (nla_put_be64(skb, NFTA_COUNTER_BYTES,
> cpu_to_be64(total.bytes),
>                          NFTA_COUNTER_PAD) ||

Night be nitpicking, but you might reset the stats only if the
nla_put_be64() succeeded.

But regardless of this detail, patch looks good and is very close to the
one I cooked and was about to send this morning.

Thanks Pablo !

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox