Netdev List

Netdev List
 help / color / mirror / Atom feed

* [iproute2 net-next 5/8] Add filesystem APIs to lib
From: David Ahern @ 2016-12-12  0:53 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481503995-24825-1-git-send-email-dsa@cumulusnetworks.com>

Add make_path to recursively call mkdir as needed to create a given
path with the given mode.

Add find_cgroup2_mount to look up the path where cgroup2 is mounted. If it
is not already mounted, cgroup2 is mounted under /var/run/cgroup2 for
use by iproute2.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/utils.h |   2 +
 lib/Makefile    |   2 +-
 lib/fs.c        | 143 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 lib/fs.c

diff --git a/include/utils.h b/include/utils.h
index ac4517a3bde1..dc1d6b9607dd 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -257,5 +257,7 @@ int get_guid(__u64 *guid, const char *arg);
 int get_real_family(int rtm_type, int rtm_family);
 
 int cmd_exec(const char *cmd, char **argv, bool do_fork);
+int make_path(const char *path, mode_t mode);
+char *find_cgroup2_mount(void);
 
 #endif /* __UTILS_H__ */
diff --git a/lib/Makefile b/lib/Makefile
index 749073261c49..0c57662b4f8f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -8,7 +8,7 @@ CFLAGS += -fPIC
 
 UTILOBJ = utils.o rt_names.o ll_types.o ll_proto.o ll_addr.o \
 	inet_proto.o namespace.o json_writer.o \
-	names.o color.o bpf.o exec.o
+	names.o color.o bpf.o exec.o fs.o
 
 NLOBJ=libgenl.o ll_map.o libnetlink.o
 
diff --git a/lib/fs.c b/lib/fs.c
new file mode 100644
index 000000000000..39cc96dccca9
--- /dev/null
+++ b/lib/fs.c
@@ -0,0 +1,143 @@
+/*
+ * fs.c         filesystem APIs
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	David Ahern <dsa@cumulusnetworks.com>
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/mount.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+
+#include "utils.h"
+
+#define CGROUP2_FS_NAME "cgroup2"
+
+/* if not already mounted cgroup2 is mounted here for iproute2's use */
+#define MNT_CGRP2_PATH  "/var/run/cgroup2"
+
+/* return mount path of first occurrence of given fstype */
+static char *find_fs_mount(const char *fs_to_find)
+{
+	char path[4096];
+	char fstype[128];    /* max length of any filesystem name */
+	char *mnt = NULL;
+	FILE *fp;
+
+	fp = fopen("/proc/mounts", "r");
+	if (!fp) {
+		fprintf(stderr,
+			"Failed to open mounts file: %s\n", strerror(errno));
+		return NULL;
+	}
+
+	while (fscanf(fp, "%*s %4096s %127s %*s %*d %*d\n",
+		      path, fstype) == 2) {
+		if (strcmp(fstype, fs_to_find) == 0) {
+			mnt = strdup(path);
+			break;
+		}
+	}
+
+	fclose(fp);
+
+	return mnt;
+}
+
+/* caller needs to free string returned */
+char *find_cgroup2_mount(void)
+{
+	char *mnt = find_fs_mount(CGROUP2_FS_NAME);
+
+	if (mnt)
+		return mnt;
+
+	mnt = strdup(MNT_CGRP2_PATH);
+	if (!mnt) {
+		fprintf(stderr, "Failed to allocate memory for cgroup2 path\n");
+		return NULL;
+
+	}
+
+	if (make_path(mnt, 0755)) {
+		fprintf(stderr, "Failed to setup vrf cgroup2 directory\n");
+		free(mnt);
+		return NULL;
+	}
+
+	if (mount("none", mnt, CGROUP2_FS_NAME, 0, NULL)) {
+		/* EBUSY means already mounted */
+		if (errno != EBUSY) {
+			fprintf(stderr,
+				"Failed to mount cgroup2. Are CGROUPS enabled in your kernel?\n");
+			free(mnt);
+			return NULL;
+		}
+	}
+	return mnt;
+}
+
+int make_path(const char *path, mode_t mode)
+{
+	char *dir, *delim;
+	struct stat sbuf;
+	int rc = -1;
+
+	delim = dir = strdup(path);
+	if (dir == NULL) {
+		fprintf(stderr, "strdup failed copying path");
+		return -1;
+	}
+
+	/* skip '/' -- it had better exist */
+	if (*delim == '/')
+		delim++;
+
+	while (1) {
+		delim = strchr(delim, '/');
+		if (delim)
+			*delim = '\0';
+
+		if (stat(dir, &sbuf) != 0) {
+			if (errno != ENOENT) {
+				fprintf(stderr,
+					"stat failed for %s: %s\n",
+					dir, strerror(errno));
+				goto out;
+			}
+
+			if (mkdir(dir, mode) != 0) {
+				fprintf(stderr,
+					"mkdir failed for %s: %s",
+					dir, strerror(errno));
+				goto out;
+			}
+		}
+
+		if (delim == NULL)
+			break;
+
+		*delim = '/';
+		delim++;
+		if (*delim == '\0')
+			break;
+	}
+	rc = 0;
+out:
+	free(dir);
+
+	return rc;
+}
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 6/8] change name_is_vrf to return index
From: David Ahern @ 2016-12-12  0:53 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481503995-24825-1-git-send-email-dsa@cumulusnetworks.com>

index of 0 means name is not a valid vrf.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 ip/ip_common.h  |  2 +-
 ip/iplink_vrf.c | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/ip/ip_common.h b/ip/ip_common.h
index 0147f45a7a31..3162f1ca5b2c 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -91,7 +91,7 @@ struct link_util *get_link_kind(const char *kind);
 void br_dump_bridge_id(const struct ifla_bridge_id *id, char *buf, size_t len);
 
 __u32 ipvrf_get_table(const char *name);
-bool name_is_vrf(const char *name);
+int name_is_vrf(const char *name);
 
 #ifndef	INFINITY_LIFE_TIME
 #define     INFINITY_LIFE_TIME      0xFFFFFFFFU
diff --git a/ip/iplink_vrf.c b/ip/iplink_vrf.c
index a238b2906805..c101ed770f87 100644
--- a/ip/iplink_vrf.c
+++ b/ip/iplink_vrf.c
@@ -159,7 +159,7 @@ __u32 ipvrf_get_table(const char *name)
 	return tb_id;
 }
 
-bool name_is_vrf(const char *name)
+int name_is_vrf(const char *name)
 {
 	struct {
 		struct nlmsghdr		n;
@@ -187,24 +187,27 @@ bool name_is_vrf(const char *name)
 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, strlen(name) + 1);
 
 	if (rtnl_talk(&rth, &req.n, &answer.n, sizeof(answer)) < 0)
-		return false;
+		return 0;
 
 	ifi = NLMSG_DATA(&answer.n);
 	len = answer.n.nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
 	if (len < 0) {
 		fprintf(stderr, "BUG: Invalid response to link query.\n");
-		return false;
+		return 0;
 	}
 
 	parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
 
 	if (!tb[IFLA_LINKINFO])
-		return false;
+		return 0;
 
 	parse_rtattr_nested(li, IFLA_INFO_MAX, tb[IFLA_LINKINFO]);
 
 	if (!li[IFLA_INFO_KIND])
-		return false;
+		return 0;
+
+	if (strcmp(RTA_DATA(li[IFLA_INFO_KIND]), "vrf"))
+		return 0;
 
-	return strcmp(RTA_DATA(li[IFLA_INFO_KIND]), "vrf") == 0;
+	return ifi->ifi_index;
 }
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 7/8] libnetlink: Add variant of rtnl_talk that does not display RTNETLINK answers error
From: David Ahern @ 2016-12-12  0:53 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481503995-24825-1-git-send-email-dsa@cumulusnetworks.com>

iplink_vrf has 2 functions used to validate a user given device name is
a VRF device and to return the table id. If the user string is not a
device name ip commands with a vrf keyword show a confusing error
message: "RTNETLINK answers: No such device".

Add a variant of rtnl_talk that does not display the "RTNETLINK answers"
message and update iplink_vrf to use it.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/libnetlink.h |  3 +++
 ip/iplink_vrf.c      | 14 +++++++++++---
 lib/libnetlink.c     | 20 +++++++++++++++++---
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index 751ebf186dd4..bd0267dfcc02 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -81,6 +81,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	      struct nlmsghdr *answer, size_t len)
 	__attribute__((warn_unused_result));
+int rtnl_talk_suppress_rtnl_errmsg(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+				   struct nlmsghdr *answer, size_t len)
+	__attribute__((warn_unused_result));
 int rtnl_send(struct rtnl_handle *rth, const void *buf, int)
 	__attribute__((warn_unused_result));
 int rtnl_send_check(struct rtnl_handle *rth, const void *buf, int)
diff --git a/ip/iplink_vrf.c b/ip/iplink_vrf.c
index c101ed770f87..917630e85337 100644
--- a/ip/iplink_vrf.c
+++ b/ip/iplink_vrf.c
@@ -13,6 +13,7 @@
 #include <string.h>
 #include <sys/socket.h>
 #include <linux/if_link.h>
+#include <errno.h>
 
 #include "rt_names.h"
 #include "utils.h"
@@ -126,8 +127,14 @@ __u32 ipvrf_get_table(const char *name)
 
 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, strlen(name) + 1);
 
-	if (rtnl_talk(&rth, &req.n, &answer.n, sizeof(answer)) < 0)
-		return 0;
+	if (rtnl_talk_suppress_rtnl_errmsg(&rth, &req.n,
+					   &answer.n, sizeof(answer)) < 0) {
+		/* special case "default" vrf to be the main table */
+		if (errno == ENODEV && !strcmp(name, "default"))
+			rtnl_rttable_a2n(&tb_id, "main");
+
+		return tb_id;
+	}
 
 	ifi = NLMSG_DATA(&answer.n);
 	len = answer.n.nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
@@ -186,7 +193,8 @@ int name_is_vrf(const char *name)
 
 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, strlen(name) + 1);
 
-	if (rtnl_talk(&rth, &req.n, &answer.n, sizeof(answer)) < 0)
+	if (rtnl_talk_suppress_rtnl_errmsg(&rth, &req.n,
+					   &answer.n, sizeof(answer)) < 0)
 		return 0;
 
 	ifi = NLMSG_DATA(&answer.n);
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index a5db168e50eb..9d7e89aebbd0 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -12,6 +12,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdbool.h>
 #include <unistd.h>
 #include <syslog.h>
 #include <fcntl.h>
@@ -397,8 +398,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 	return rtnl_dump_filter_l(rth, a);
 }
 
-int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-	      struct nlmsghdr *answer, size_t maxlen)
+static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+		       struct nlmsghdr *answer, size_t maxlen,
+		       bool show_rtnl_err)
 {
 	int status;
 	unsigned int seq;
@@ -485,7 +487,7 @@ int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 					return 0;
 				}
 
-				if (rtnl->proto != NETLINK_SOCK_DIAG)
+				if (rtnl->proto != NETLINK_SOCK_DIAG && show_rtnl_err)
 					fprintf(stderr,
 						"RTNETLINK answers: %s\n",
 						strerror(-err->error));
@@ -517,6 +519,18 @@ int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 	}
 }
 
+int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+	      struct nlmsghdr *answer, size_t maxlen)
+{
+	return __rtnl_talk(rtnl, n, answer, maxlen, true);
+}
+
+int rtnl_talk_suppress_rtnl_errmsg(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+				   struct nlmsghdr *answer, size_t maxlen)
+{
+	return __rtnl_talk(rtnl, n, answer, maxlen, false);
+}
+
 int rtnl_listen_all_nsid(struct rtnl_handle *rth)
 {
 	unsigned int on = 1;
-- 
2.1.4

^ permalink raw reply related

* [iproute2 net-next 8/8] Introduce ip vrf command
From: David Ahern @ 2016-12-12  0:53 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern
In-Reply-To: <1481503995-24825-1-git-send-email-dsa@cumulusnetworks.com>

'ip vrf' follows the user semantics established by 'ip netns'.

The 'ip vrf' subcommand supports 3 usages:

1. Run a command against a given vrf:
       ip vrf exec NAME CMD

   Uses the recently committed cgroup/sock BPF option. vrf directory
   is added to cgroup2 mount. Individual vrfs are created under it. BPF
   filter attached to vrf/NAME cgroup2 to set sk_bound_dev_if to the VRF
   device index. From there the current process (ip's pid) is added to
   the cgroup.procs file and the given command is executed. In doing so
   all AF_INET/AF_INET6 (ipv4/ipv6) sockets are automatically bound to
   the VRF domain.

   The association is inherited parent to child allowing the command to
   be a shell from which other commands are run relative to the VRF.

2. Show the VRF a process is bound to:
       ip vrf id
   This command essentially looks at /proc/pid/cgroup for a "::/vrf/"
   entry with the VRF name following.

3. Show process ids bound to a VRF
       ip vrf pids NAME
   This command dumps the file MNT/vrf/NAME/cgroup.procs since that file
   shows the process ids in the particular vrf cgroup.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 ip/Makefile       |   3 +-
 ip/ip.c           |   4 +-
 ip/ip_common.h    |   2 +
 ip/ipvrf.c        | 289 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 man/man8/ip-vrf.8 |  88 +++++++++++++++++
 5 files changed, 384 insertions(+), 2 deletions(-)
 create mode 100644 ip/ipvrf.c
 create mode 100644 man/man8/ip-vrf.8

diff --git a/ip/Makefile b/ip/Makefile
index c8e6c6172741..1928489e7f90 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -7,7 +7,8 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o \
     link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \
     iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o \
-    iplink_geneve.o iplink_vrf.o iproute_lwtunnel.o ipmacsec.o ipila.o
+    iplink_geneve.o iplink_vrf.o iproute_lwtunnel.o ipmacsec.o ipila.o \
+    ipvrf.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/ip.c b/ip/ip.c
index cb3adcb3f57d..07050b07592a 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -51,7 +51,8 @@ static void usage(void)
 "       ip [ -force ] -batch filename\n"
 "where  OBJECT := { link | address | addrlabel | route | rule | neigh | ntable |\n"
 "                   tunnel | tuntap | maddress | mroute | mrule | monitor | xfrm |\n"
-"                   netns | l2tp | fou | macsec | tcp_metrics | token | netconf | ila }\n"
+"                   netns | l2tp | fou | macsec | tcp_metrics | token | netconf | ila |\n"
+"                   vrf }\n"
 "       OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n"
 "                    -h[uman-readable] | -iec |\n"
 "                    -f[amily] { inet | inet6 | ipx | dnet | mpls | bridge | link } |\n"
@@ -99,6 +100,7 @@ static const struct cmd {
 	{ "mrule",	do_multirule },
 	{ "netns",	do_netns },
 	{ "netconf",	do_ipnetconf },
+	{ "vrf",	do_ipvrf},
 	{ "help",	do_help },
 	{ 0 }
 };
diff --git a/ip/ip_common.h b/ip/ip_common.h
index 3162f1ca5b2c..28763e81e4a4 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -57,6 +57,8 @@ extern int do_ipila(int argc, char **argv);
 int do_tcp_metrics(int argc, char **argv);
 int do_ipnetconf(int argc, char **argv);
 int do_iptoken(int argc, char **argv);
+int do_ipvrf(int argc, char **argv);
+
 int iplink_get(unsigned int flags, char *name, __u32 filt_mask);
 
 static inline int rtm_get_table(struct rtmsg *r, struct rtattr **tb)
diff --git a/ip/ipvrf.c b/ip/ipvrf.c
new file mode 100644
index 000000000000..d49af774438e
--- /dev/null
+++ b/ip/ipvrf.c
@@ -0,0 +1,289 @@
+/*
+ * ipvrf.c	"ip vrf"
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	David Ahern <dsa@cumulusnetworks.com>
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/mount.h>
+#include <linux/bpf.h>
+#include <linux/if.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "ip_common.h"
+#include "bpf_util.h"
+
+#define CGRP_PROC_FILE  "/cgroup.procs"
+
+static void usage(void)
+{
+	fprintf(stderr, "Usage: ip vrf exec [NAME] cmd ...\n");
+	fprintf(stderr, "       ip vrf identify [PID]\n");
+	fprintf(stderr, "       ip vrf pids [NAME]\n");
+
+	exit(-1);
+}
+
+static int ipvrf_identify(int argc, char **argv)
+{
+	char path[PATH_MAX];
+	char buf[4096];
+	char *vrf, *end;
+	int fd, rc = -1;
+	unsigned int pid;
+	ssize_t n;
+
+	if (argc < 1)
+		pid = getpid();
+	else if (argc > 1)
+		invarg("Extra arguments specified\n", argv[1]);
+	else if (get_unsigned(&pid, argv[0], 10))
+		invarg("Invalid pid\n", argv[0]);
+
+	snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr,
+			"Failed to open cgroups file: %s\n", strerror(errno));
+		return -1;
+	}
+
+	n = read(fd, buf, sizeof(buf) - 1);
+	if (n < 0) {
+		fprintf(stderr,
+			"Failed to read cgroups file: %s\n", strerror(errno));
+		goto out;
+	}
+	buf[n] = '\0';
+	vrf = strstr(buf, "::/vrf/");
+	if (vrf) {
+		vrf += 7;  /* skip past "::/vrf/" */
+		end = strchr(vrf, '\n');
+		if (end)
+			*end = '\0';
+
+		printf("%s\n", vrf);
+	}
+
+	rc = 0;
+out:
+	close(fd);
+
+	return rc;
+}
+
+static int ipvrf_pids(int argc, char **argv)
+{
+	char path[PATH_MAX];
+	char buf[4096];
+	char *mnt, *vrf;
+	int fd, rc = -1;
+	ssize_t n;
+
+	if (argc != 1) {
+		fprintf(stderr, "Invalid arguments\n");
+		return -1;
+	}
+
+	vrf = argv[0];
+
+	mnt = find_cgroup2_mount();
+	if (!mnt)
+		return -1;
+
+	snprintf(path, sizeof(path), "%s/vrf/%s%s", mnt, vrf, CGRP_PROC_FILE);
+	free(mnt);
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return 0; /* no cgroup file, nothing to show */
+
+	while (1) {
+		n = read(fd, buf, sizeof(buf) - 1);
+		if (n < 0) {
+			fprintf(stderr,
+				"Failed to read cgroups file: %s\n", strerror(errno));
+			break;
+		} else if (n == 0) {
+			rc = 0;
+			break;
+		}
+		printf("%s", buf);
+	}
+
+	close(fd);
+
+	return rc;
+}
+
+/* load BPF program to set sk_bound_dev_if for sockets */
+static char bpf_log_buf[256*1024];
+
+static int prog_load(int idx)
+{
+	struct bpf_insn prog[] = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+		BPF_MOV64_IMM(BPF_REG_3, idx),
+		BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)),
+		BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)),
+		BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
+		BPF_EXIT_INSN(),
+	};
+
+	return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, prog, sizeof(prog),
+			     "GPL", bpf_log_buf, sizeof(bpf_log_buf));
+}
+
+static int vrf_configure_cgroup(const char *path, int ifindex)
+{
+	int rc = -1, cg_fd, prog_fd = -1;
+
+	cg_fd = open(path, O_DIRECTORY | O_RDONLY);
+	if (cg_fd < 0) {
+		fprintf(stderr, "Failed to open cgroup path: '%s'\n", strerror(errno));
+		goto out;
+	}
+
+	/*
+	 * Load bpf program into kernel and attach to cgroup to affect
+	 * socket creates
+	 */
+	prog_fd = prog_load(ifindex);
+	if (prog_fd < 0) {
+		printf("Failed to load BPF prog: '%s'\n", strerror(errno));
+		goto out;
+	}
+
+	if (bpf_prog_attach_fd(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE)) {
+		fprintf(stderr, "Failed to attach prog to cgroup: '%s'\n",
+			strerror(errno));
+			fprintf(stderr, "Kernel compiled with CGROUP_BPF enabled?\n");
+		goto out;
+	}
+
+	rc = 0;
+out:
+	close(cg_fd);
+	close(prog_fd);
+
+	return rc;
+}
+
+static int vrf_switch(const char *name)
+{
+	char path[PATH_MAX], *mnt, pid[16];
+	int ifindex = name_is_vrf(name);
+	bool default_vrf = false;
+	int rc = -1, len, fd = -1;
+
+	if (!ifindex) {
+		if (strcmp(name, "default")) {
+			fprintf(stderr, "Invalid VRF name\n");
+			return -1;
+		}
+		default_vrf = true;
+	}
+
+	mnt = find_cgroup2_mount();
+	if (!mnt)
+		return -1;
+
+	/* path to cgroup; make sure buffer has room to cat "/cgroup.procs"
+	 * to the end of the path
+	 */
+	len = snprintf(path, sizeof(path) - sizeof(CGRP_PROC_FILE), "%s%s/%s",
+		       mnt, default_vrf ? "" : "/vrf", name);
+	if (len > sizeof(path) - sizeof(CGRP_PROC_FILE)) {
+		fprintf(stderr, "Invalid path to cgroup2 mount\n");
+		goto out;
+	}
+
+	if (make_path(path, 0755)) {
+		fprintf(stderr, "Failed to setup vrf cgroup2 directory\n");
+		goto out;
+	}
+
+	if (!default_vrf && vrf_configure_cgroup(path, ifindex))
+		goto out;
+
+	/*
+	 * write pid to cgroup.procs making process part of cgroup
+	 */
+	strcat(path, CGRP_PROC_FILE);
+	fd = open(path, O_RDWR | O_APPEND);
+	if (fd < 0) {
+		fprintf(stderr, "Failed to open cgroups.procs file: %s.\n",
+			strerror(errno));
+		goto out;
+	}
+
+	snprintf(pid, sizeof(pid), "%d", getpid());
+	if (write(fd, pid, strlen(pid)) < 0) {
+		fprintf(stderr, "Failed to join cgroup\n");
+		goto out;
+	}
+
+	rc = 0;
+out:
+	free(mnt);
+	close(fd);
+
+	return rc;
+}
+
+static int ipvrf_exec(int argc, char **argv)
+{
+	if (argc < 1) {
+		fprintf(stderr, "No VRF name specified\n");
+		return -1;
+	}
+	if (argc < 2) {
+		fprintf(stderr, "No command specified\n");
+		return -1;
+	}
+
+	if (vrf_switch(argv[0]))
+		return -1;
+
+	return -cmd_exec(argv[1], argv + 1, !!batch_mode);
+}
+
+int do_ipvrf(int argc, char **argv)
+{
+	if (argc == 0) {
+		fprintf(stderr, "No command given. Try \"ip vrf help\".\n");
+		exit(-1);
+	}
+
+	if (matches(*argv, "identify") == 0)
+		return ipvrf_identify(argc-1, argv+1);
+
+	if (matches(*argv, "pids") == 0)
+		return ipvrf_pids(argc-1, argv+1);
+
+	if (matches(*argv, "exec") == 0)
+		return ipvrf_exec(argc-1, argv+1);
+
+	if (matches(*argv, "help") == 0)
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip vrf help\".\n",
+		*argv);
+
+	exit(-1);
+}
diff --git a/man/man8/ip-vrf.8 b/man/man8/ip-vrf.8
new file mode 100644
index 000000000000..57a7c7692ce8
--- /dev/null
+++ b/man/man8/ip-vrf.8
@@ -0,0 +1,88 @@
+.TH IP\-VRF 8 "7 Dec 2016" "iproute2" "Linux"
+.SH NAME
+ip-vrf \- run a command against a vrf
+.SH SYNOPSIS
+.sp
+.ad l
+.in +8
+.ti -8
+.B ip
+.B vrf
+.RI  " { " COMMAND " | "
+.BR help " }"
+.sp
+
+.ti -8
+.BR "ip vrf identify"
+.RI "[ " PID " ]"
+
+.ti -8
+.BR "ip vrf pids"
+.I NAME
+
+.ti -8
+.BR "ip vrf exec "
+.RI "[ " NAME " ] " command ...
+
+.SH DESCRIPTION
+A VRF provides traffic isolation at layer 3 for routing, similar to how a
+VLAN is used to isolate traffic at layer 2. Fundamentally, a VRF is a separate
+routing table. Network devices are associated with a VRF by enslaving the
+device to the VRF. At that point network addresses assigned to the device are
+local to the VRF with host and connected routes moved to the table associated
+with the VRF.
+
+A process can specify a VRF using several APIs -- binding the socket to the
+VRF device using SO_BINDTODEVICE, setting the VRF association using
+IP_UNICAST_IF or IPV6_UNICAST_IF, or specifying the VRF for a specific message
+using IP_PKTINFO or IPV6_PKTINFO.
+
+By default a process is not bound to any VRF. An association can be set
+explicitly by making the program use one of the APIs mentioned above or
+implicitly using a helper to set SO_BINDTODEVICE for all IPv4 and IPv6
+sockets (AF_INET and AF_INET6) when the socket is created. This ip-vrf command
+is a helper to run a command against a specific VRF with the VRF association
+inherited parent to child.
+
+.TP
+.B ip vrf exec [ NAME ] cmd ... - Run cmd against the named VRF
+.sp
+This command allows applications that are VRF unaware to be run against
+a VRF other than the default VRF (main table). A command can be run against
+the default VRF by passing the "default" as the VRF name. This is useful if
+the current shell is associated with another VRF (e.g, Management VRF).
+
+.TP
+.B ip vrf identify [PID] - Report VRF association for process
+.sp
+This command shows the VRF association of the specified process. If PID is
+not specified then the id of the current process is used.
+
+.TP
+.B ip vrf pids NAME - Report processes associated with the named VRF
+.sp
+This command shows all process ids that are associated with the given
+VRF.
+
+.SH CAVEATS
+This command requires a kernel compiled with CGROUPS and CGROUP_BPF enabled.
+
+The VRF helper *only* affects network layer sockets.
+
+.SH EXAMPLES
+.PP
+ip vrf exec red ssh 10.100.1.254
+.RS
+Executes ssh to 10.100.1.254 against the VRF red table.
+.RE
+
+.SH SEE ALSO
+.br
+.BR ip (8),
+.BR ip-link (8),
+.BR ip-address (8),
+.BR ip-route (8),
+.BR ip-neighbor (8)
+
+.SH AUTHOR
+Original Manpage by David Ahern
-- 
2.1.4

^ permalink raw reply related

* [PATCH v2] ipv4: Should use consistent conditional judgement for ip fragment in __ip_append_data and ip_finish_output
From: Zheng Li @ 2016-12-12  1:56 UTC (permalink / raw)
  To: linux-kernel, netdev, davem, kuznet, jmorris, yoshfuji, kaber; +Cc: james.z.li

From: zheng li <james.z.li@ericsson.com>

There is an inconsistent conditional judgement in the __ip_append_data and
ip_finish_output functions: the variable length in __ip_append_data only
includes the length of the application's payload and the udp header, not
the length of the ip header, whereas ip_finish_output uses
(skb->len > ip_skb_dst_mtu(skb)) as its judgement, and skb->len includes
the length of the ip header.

As a result, an application's udp payload whose length is between
(MTU - IP Header) and MTU is fragmented by ip_fragment even though
rt->dst.dev supports the UFO feature.

Add the length of the ip header to length in __ip_append_data so that its
conditional judgement for ip fragmentation is consistent with
ip_finish_output.

Signed-off-by: Zheng Li <james.z.li@ericsson.com>
---
 net/ipv4/ip_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 877bdb0..12a0149 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -936,7 +936,7 @@ static int __ip_append_data(struct sock *sk,
 		csummode = CHECKSUM_PARTIAL;

 	cork->length += length;
-	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
+	if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
 	    (sk->sk_protocol == IPPROTO_UDP) &&
 	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
 	    (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH 08/10] vsock/virtio: mark an internal function static
From: Jason Wang @ 2016-12-12  1:56 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kernel, Stefan Hajnoczi, David S. Miller, kvm,
	virtualization, netdev
In-Reply-To: <20161208162458-mutt-send-email-mst@kernel.org>



On 2016年12月08日 22:25, Michael S. Tsirkin wrote:
> On Wed, Dec 07, 2016 at 12:21:22PM +0800, Jason Wang wrote:
>>
>> On 2016年12月06日 23:41, Michael S. Tsirkin wrote:
>>> virtio_transport_alloc_pkt is only used locally, make it static.
>>>
>>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>>> ---
>>>    net/vmw_vsock/virtio_transport_common.c | 2 +-
>>>    1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>> index a53b3a1..6120384 100644
>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>> @@ -32,7 +32,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>>    	return container_of(t, struct virtio_transport, transport);
>>>    }
>>> -struct virtio_vsock_pkt *
>>> +static struct virtio_vsock_pkt *
>>>    virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
>>>    			   size_t len,
>>>    			   u32 src_cid,
>> Git grep shows it was used by tracing.
> True but trace_virtio_transport_alloc_pkt is also local to
> virtio_transport_common.c
>

I see, so let's remove the EXPORT_SYMBOL_GPL() too?

^ permalink raw reply

* linux-next: manual merge of the tip tree with the net-next tree
From: Stephen Rothwell @ 2016-12-12  2:30 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Peter Zijlstra,
	David Miller, Networking
  Cc: linux-next, linux-kernel, Nicolas Pitre, WingMan Kwok

Hi all,

Today's linux-next merge of the tip tree got a conflict in:

  drivers/net/ethernet/ti/Kconfig

between commit:

  6246168b4a38 ("net: ethernet: ti: netcp: add support of cpts")

from the net-next tree and commit:

  d1cbfd771ce8 ("ptp_clock: Allow for it to be optional")

from the tip tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/net/ethernet/ti/Kconfig
index dc217fd7a734,61b835a7e6ae..000000000000
--- a/drivers/net/ethernet/ti/Kconfig
+++ b/drivers/net/ethernet/ti/Kconfig
@@@ -74,14 -74,13 +74,14 @@@ config TI_CPS
  	  will be called cpsw.
  
  config TI_CPTS
 -	bool "TI Common Platform Time Sync (CPTS) Support"
 -	depends on TI_CPSW
 +	tristate "TI Common Platform Time Sync (CPTS) Support"
 +	depends on TI_CPSW || TI_KEYSTONE_NETCP
- 	select PTP_1588_CLOCK
+ 	imply PTP_1588_CLOCK
  	---help---
  	  This driver supports the Common Platform Time Sync unit of
 -	  the CPSW Ethernet Switch. The unit can time stamp PTP UDP/IPv4
 -	  and Layer 2 packets, and the driver offers a PTP Hardware Clock.
 +	  the CPSW Ethernet Switch and Keystone 2 1g/10g Switch Subsystem.
 +	  The unit can time stamp PTP UDP/IPv4 and Layer 2 packets, and the
 +	  driver offers a PTP Hardware Clock.
  
  config TI_KEYSTONE_NETCP
  	tristate "TI Keystone NETCP Core Support"

^ permalink raw reply

* Re: [PATCH] net: wan: Use dma_pool_zalloc
From: Souptick Joarder @ 2016-12-12  4:42 UTC (permalink / raw)
  To: Krzysztof Hałasa, netdev; +Cc: Rameshwar Sahu
In-Reply-To: <m3wpf9l0fc.fsf@t19.piap.pl>

On Fri, Dec 9, 2016 at 6:33 PM, Krzysztof Hałasa <khalasa@piap.pl> wrote:
> Souptick Joarder <jrdr.linux@gmail.com> writes:
>
>> We should use dma_pool_zalloc instead of dma_pool_alloc/memset
>>
>> Signed-off-by: Souptick joarder <jrdr.linux@gmail.com>
>> ---
>>  drivers/net/wan/ixp4xx_hss.c | 5 ++---
>>  1 file changed, 2 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c
>> index e7bbdb7..aaabf31 100644
>> --- a/drivers/net/wan/ixp4xx_hss.c
>> +++ b/drivers/net/wan/ixp4xx_hss.c
>> @@ -976,10 +976,9 @@ static int init_hdlc_queues(struct port *port)
>>                       return -ENOMEM;
>>       }
>>
>> -     if (!(port->desc_tab = dma_pool_alloc(dma_pool, GFP_KERNEL,
>> -                                           &port->desc_tab_phys)))
>> +     if (!(port->desc_tab = dma_pool_zalloc(dma_pool, GFP_KERNEL,
>> +                                            &port->desc_tab_phys)))
>>               return -ENOMEM;
>> -     memset(port->desc_tab, 0, POOL_ALLOC_SIZE);
>>       memset(port->rx_buff_tab, 0, sizeof(port->rx_buff_tab)); /* tables */
>>       memset(port->tx_buff_tab, 0, sizeof(port->tx_buff_tab));
>
> This look fine, feel free to send it to the netdev mailing list for
> inclusion.

Including the netdev mailing list as requested.

>
> Acked-by: Krzysztof Halasa <khalasa@piap.pl>
> --
> Krzysztof Halasa
>
> Industrial Research Institute for Automation and Measurements PIAP
> Al. Jerozolimskie 202, 02-486 Warsaw, Poland

^ permalink raw reply

* [PATCH] vhost: cache used event for better performance
From: Jason Wang @ 2016-12-12  6:46 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: Jason Wang

When event index is enabled, we need to fetch the used event from
userspace memory each time. This userspace fetch (with memory
barrier) can sometimes be avoided by 1) caching the used event and 2)
checking it: if the cached used event is ahead of new and the old to new
update does not cross it, we're sure there's no need to notify the guest.

This is useful for heavy tx load, e.g. a guest pktgen test with the Linux
driver shows ~3.5% improvement.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 28 ++++++++++++++++++++++------
 drivers/vhost/vhost.h |  3 +++
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2663543..d3fa550 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -290,6 +290,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->avail = NULL;
 	vq->used = NULL;
 	vq->last_avail_idx = 0;
+	vq->last_used_event = 0;
 	vq->avail_idx = 0;
 	vq->last_used_idx = 0;
 	vq->signalled_used = 0;
@@ -1324,7 +1325,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
 			r = -EINVAL;
 			break;
 		}
-		vq->last_avail_idx = s.num;
+		vq->last_avail_idx = vq->last_used_event = s.num;
 		/* Forget the cached index value. */
 		vq->avail_idx = vq->last_avail_idx;
 		break;
@@ -2159,10 +2160,6 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	__u16 old, new;
 	__virtio16 event;
 	bool v;
-	/* Flush out used index updates. This is paired
-	 * with the barrier that the Guest executes when enabling
-	 * interrupts. */
-	smp_mb();
 
 	if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) &&
 	    unlikely(vq->avail_idx == vq->last_avail_idx))
@@ -2170,6 +2167,10 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 
 	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
 		__virtio16 flags;
+		/* Flush out used index updates. This is paired
+		 * with the barrier that the Guest executes when enabling
+		 * interrupts. */
+		smp_mb();
 		if (vhost_get_user(vq, flags, &vq->avail->flags)) {
 			vq_err(vq, "Failed to get flags");
 			return true;
@@ -2184,11 +2185,26 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	if (unlikely(!v))
 		return true;
 
+	/* We're sure if the following conditions are met, there's no
+	 * need to notify guest:
+	 * 1) cached used event is ahead of new
+	 * 2) old to new updating does not cross cached used event. */
+	if (vring_need_event(vq->last_used_event, new + vq->num, new) &&
+	    !vring_need_event(vq->last_used_event, new, old))
+		return false;
+
+	/* Flush out used index updates. This is paired
+	 * with the barrier that the Guest executes when enabling
+	 * interrupts. */
+	smp_mb();
+
 	if (vhost_get_user(vq, event, vhost_used_event(vq))) {
 		vq_err(vq, "Failed to get used event idx");
 		return true;
 	}
-	return vring_need_event(vhost16_to_cpu(vq, event), new, old);
+	vq->last_used_event = vhost16_to_cpu(vq, event);
+
+	return vring_need_event(vq->last_used_event, new, old);
 }
 
 /* This actually signals the guest, using eventfd. */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 78f3c5f..a9cbbb1 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -107,6 +107,9 @@ struct vhost_virtqueue {
 	/* Last index we used. */
 	u16 last_used_idx;
 
+	/* Last used event we've seen */
+	u16 last_used_event;
+
 	/* Used flags */
 	u16 used_flags;
 
-- 
2.7.4


^ permalink raw reply related

* Re: [Patch net-next] ipvs: remove an annoying printk in netns init
From: Simon Horman @ 2016-12-12  7:54 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev
In-Reply-To: <1481346600-25335-1-git-send-email-xiyou.wangcong@gmail.com>

On Fri, Dec 09, 2016 at 09:09:59 -0800, Cong Wang wrote:
> At most it is used for debugging purpose, but I don't think
> it is even useful for debugging, just remove it.
> 
> Cc: Simon Horman <horms@verge.net.au>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>

Thanks, applied.

^ permalink raw reply

* Re: [PATCH net-next 2/2] net: ethernet: Initial driver for Synopsys DWC XLGMAC
From: kbuild test robot @ 2016-12-12  8:26 UTC (permalink / raw)
  To: Jie Deng
  Cc: kbuild-all, davem, f.fainelli, netdev, linux-kernel,
	CARLOS.PALMINHA, lars.persson, thomas.lendacky, Jie Deng
In-Reply-To: <3fe82457c51f8437797eae27d03cdb0dcbef039b.1481075763.git.jiedeng@synopsys.com>

[-- Attachment #1: Type: text/plain, Size: 2632 bytes --]

Hi Jie,

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Jie-Deng/net-phy-add-extension-of-phy-mode-for-XLGMII/20161207-121843
config: i386-randconfig-h0-12121424 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   In file included from drivers/net/ethernet/synopsys/dwc/dwc-xlgmac-pci.c:17:0:
>> drivers/net/ethernet/synopsys/dwc/dwc-eth.h:662:26: error: 'IEEE_8021QAZ_MAX_TCS' undeclared here (not in a function)
     unsigned int prio2q_map[IEEE_8021QAZ_MAX_TCS];
                             ^~~~~~~~~~~~~~~~~~~~
--
   In file included from drivers/net/ethernet/synopsys/dwc/dwc-eth-hw.c:58:0:
>> drivers/net/ethernet/synopsys/dwc/dwc-eth.h:662:26: error: 'IEEE_8021QAZ_MAX_TCS' undeclared here (not in a function)
     unsigned int prio2q_map[IEEE_8021QAZ_MAX_TCS];
                             ^~~~~~~~~~~~~~~~~~~~
   drivers/net/ethernet/synopsys/dwc/dwc-eth-hw.c: In function 'dwc_eth_enable_tx_flow_control':
>> drivers/net/ethernet/synopsys/dwc/dwc-eth-hw.c:1214:13: error: dereferencing pointer to incomplete type 'struct ieee_ets'
        tc = ets->prio_tc[prio];
                ^~
>> drivers/net/ethernet/synopsys/dwc/dwc-eth-hw.c:1217:12: error: dereferencing pointer to incomplete type 'struct ieee_pfc'
        if (pfc->pfc_en & (1 << tc)) {
               ^~
   drivers/net/ethernet/synopsys/dwc/dwc-eth-hw.c: In function 'dwc_eth_config_dcb_tc':
>> drivers/net/ethernet/synopsys/dwc/dwc-eth-hw.c:1407:8: error: 'IEEE_8021QAZ_TSA_STRICT' undeclared (first use in this function)
      case IEEE_8021QAZ_TSA_STRICT:
           ^~~~~~~~~~~~~~~~~~~~~~~
   drivers/net/ethernet/synopsys/dwc/dwc-eth-hw.c:1407:8: note: each undeclared identifier is reported only once for each function it appears in
>> drivers/net/ethernet/synopsys/dwc/dwc-eth-hw.c:1413:8: error: 'IEEE_8021QAZ_TSA_ETS' undeclared (first use in this function)
      case IEEE_8021QAZ_TSA_ETS:
           ^~~~~~~~~~~~~~~~~~~~

vim +/IEEE_8021QAZ_MAX_TCS +662 drivers/net/ethernet/synopsys/dwc/dwc-eth.h

   656		u64 tx_tstamp;
   657	
   658		/* DCB support */
   659		struct ieee_ets *ets;
   660		struct ieee_pfc *pfc;
   661		unsigned int q2tc_map[DWC_ETH_MAX_QUEUES];
 > 662		unsigned int prio2q_map[IEEE_8021QAZ_MAX_TCS];
   663		u8 num_tcs;
   664	
   665		/* Device control parameters */

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 26168 bytes --]

^ permalink raw reply

* RE: [PATCH] net: add one ethtool option to set relax ordering mode
From: maowenan @ 2016-12-12  8:30 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: netdev@vger.kernel.org, jeffrey.t.kirsher@intel.com,
	weiyongjun (A)
In-Reply-To: <20161208141153.GI26852@lunn.ch>



> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On Behalf Of Andrew Lunn
> Sent: Thursday, December 08, 2016 10:12 PM
> To: maowenan
> Cc: netdev@vger.kernel.org; jeffrey.t.kirsher@intel.com
> Subject: Re: [PATCH] net: add one ethtool option to set relax ordering mode
> 
> On Thu, Dec 08, 2016 at 02:51:37PM +0800, Mao Wenan wrote:
> > This patch provides one way to set/unset IXGBE NIC TX and RX relax
> > ordering mode, which can be set by ethtool.
> > Relax ordering is one mode of 82599 NIC, to enable this mode can
> > enhance the performance for some cpu architecture.
> > example:
> > ethtool -s enp1s0f0 relaxorder off
> > ethtool -s enp1s0f0 relaxorder on
> 
> Since this is a simple on/off, could it not be done with a feature?
> ethtool --feature?
> 
> 	Andrew

Hello Andrew, 
	Thank you for your comments.
	If I understand correctly, your idea is that ethtool -K|--feature would be a good fit for this feature, right?
My original concern is that relax ordering is a mode of the 82599, so it is a hardware-
related feature. The ethtool -s option covers hardware settings of the PHY and others (e.g. speed, duplex...),
so it is very easy to implement in do_sset().
But ethtool -K is mainly used for protocol offload,
        ethtool -K|--features|--offload DEVNAME Set protocol offload and other features
                FEATURE on|off ... 
@Jeff Kirsher, what are your comments?

^ permalink raw reply

* Re: Designing a safe RX-zero-copy Memory Model for Networking
From: Mike Rapoport @ 2016-12-12  8:38 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev@vger.kernel.org, linux-mm, John Fastabend,
	Willem de Bruijn, Björn Töpel, Karlsson, Magnus,
	Alexander Duyck, Mel Gorman, Tom Herbert, Brenden Blanco,
	Tariq Toukan, Saeed Mahameed, Jesse Brandeburg, Kalman Meth
In-Reply-To: <20161205153132.283fcb0e@redhat.com>

Hello Jesper,

On Mon, Dec 05, 2016 at 03:31:32PM +0100, Jesper Dangaard Brouer wrote:
> Hi all,
> 
> This is my design for how to safely handle RX zero-copy in the network
> stack, by using page_pool[1] and modifying NIC drivers.  Safely means
> not leaking kernel info in pages mapped to userspace and resilience
> so a malicious userspace app cannot crash the kernel.
> 
> Design target
> =============
> 
> Allow the NIC to function as a normal Linux NIC and be shared in a
> safe manner, between the kernel network stack and an accelerated
> userspace application using RX zero-copy delivery.
> 
> Target is to provide the basis for building RX zero-copy solutions in
> a memory safe manner.  An efficient communication channel for userspace
> delivery is out of scope for this document, but OOM considerations are
> discussed below (`Userspace delivery and OOM`_).

Sorry, if this reply is a bit off-topic.

I'm working on implementation of RX zero-copy for virtio and I've dedicated
some thought about making guest memory available for physical NIC DMAs.
I believe this is quite related to your page_pool proposal, at least from
the NIC driver perspective, so I'd like to share some thoughts here.
The idea is to dedicate one (or more) of the NIC's queues to a VM, e.g.
using macvtap, and then propagate guest RX memory allocations to the NIC
using something like new .ndo_set_rx_buffers method.

What is your view about interface between the page_pool and the NIC
drivers?
Have you considered using "push" model for setting the NIC's RX memory?

> 
> --
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer
> 
> Above document is taken at GitHub commit 47fa7c844f48fab8b
>  https://github.com/netoptimizer/prototype-kernel/commit/47fa7c844f48fab8b
> 

--
Sincerely yours,
Mike.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] net:phy fix driver reference count error when attach and detach phy device
From: maowenan @ 2016-12-12  8:49 UTC (permalink / raw)
  To: David Laight, netdev@vger.kernel.org, f.fainelli@gmail.com,
	dingtianhong@huawei.com, weiyongjun (A)
In-Reply-To: <0cef0798-7b0d-734d-17a3-bebffe6206c7@huawei.com>



On 2016/12/5 16:47, maowenan wrote:
> 
> 
> On 2016/12/2 17:45, David Laight wrote:
>> From: Mao Wenan
>>> Sent: 30 November 2016 10:23
>>> The nic in my board use the phy dev from marvell, and the system will
>>> load the marvell phy driver automatically, but when I remove the phy
>>> drivers, the system immediately panic:
>>> Call trace:
>>> [ 2582.834493] [<ffff800000715384>] phy_state_machine+0x3c/0x438 [
>>> 2582.851754] [<ffff8000000db3b8>] process_one_work+0x150/0x428 [
>>> 2582.868188] [<ffff8000000db7d4>] worker_thread+0x144/0x4b0 [
>>> 2582.883882] [<ffff8000000e1d0c>] kthread+0xfc/0x110
>>>
>>> there should be proper reference counting in place to avoid that.
>>> I found that phy_attach_direct() forgets to add phy device driver
>>> reference count, and phy_detach() forgets to subtract reference count.
>>> This patch is to fix this bug, after that panic is disappeared when remove
>>> marvell.ko
>>>
>>> Signed-off-by: Mao Wenan <maowenan@huawei.com>
>>> ---
>>>  drivers/net/phy/phy_device.c | 7 +++++++
>>>  1 file changed, 7 insertions(+)
>>>
>>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>>> index 1a4bf8a..a7ec7c2 100644
>>> --- a/drivers/net/phy/phy_device.c
>>> +++ b/drivers/net/phy/phy_device.c
>>> @@ -866,6 +866,11 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
>>>  		return -EIO;
>>>  	}
>>>
>>> +	if (!try_module_get(d->driver->owner)) {
>>> +		dev_err(&dev->dev, "failed to get the device driver module\n");
>>> +		return -EIO;
>>> +	}
>>
>> If this is the phy code, what stops the phy driver being unloaded
>> before the try_module_get() obtains a reference.
>> If it isn't the phy driver then there ought to be a reference count obtained
>> when the phy driver is located (by whatever decides which phy driver to use).
>> Even if that code later releases its reference (it probably shouldn't on success)
>> then you can't fail to get an extra reference here.
> 
> [Mao Wenan]Yes, this is phy code, in function phy_attach_direct(), drivers/net/phy/phy_device.c.
> when one NIC driver to do probe behavior, it will attach one matched phy driver. phy_attach_direct()
> is to obtain phy driver reference and bind phy driver, if try_module_get() execute on success, the reference
> count is added; if failed, the driver can't be attached to this NIC, and it can't added the phy driver
> reference count. So before try_module_get obtains a reference, phy driver can't be bound to this NIC.
> when the phy driver is attached to NIC, the reference count is added, if someone remove phy driver directly,
> it will be failed because reference count is not equal to 0.
> 
> An example of the call trace when a NIC driver attaches a phy driver:
> hns_nic_dev_probe->hns_nic_try_get_ae->hns_nic_init_phy->of_phy_connect->phy_connect_direct->phy_attach_direct
> 
> Consider the steps of phy driver(marvell.ko) added and removed, and NIC driver(hns_enet_drv.ko) added and removed:
> 1)insmod marvell       ref=0
> 2)insmod hns_enet_drv  ref=1
> 3)rmmod marvell        (should not on success, ref=1)
> 4)rmmod hns_enet_drv   ref=0
> 5)rmmod marvell        (should on success, because ref=0)
> 
> If we don't take the reference in phy_attach_direct() (so ref=0 after the second step), the third step (rmmod marvell) will
> panic, because one user is still using the marvell driver and phy_state_machine uses the NULL drv pointer.
> 
>>
>>> +
>>>  	get_device(d);
>>>
>>>  	/* Assume that if there is no driver, that it doesn't
>>> @@ -921,6 +926,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
>>>
>>>  error:
>>>  	put_device(d);
>>> +	module_put(d->driver->owner);
>>
>> Are those two in the wrong order ?
>>
>>>  	module_put(bus->owner);
>>>  	return err;
>>>  }
>>> @@ -998,6 +1004,7 @@ void phy_detach(struct phy_device *phydev)
>>>  	bus = phydev->mdio.bus;
>>>
>>>  	put_device(&phydev->mdio.dev);
>>> +	module_put(phydev->mdio.dev.driver->owner);
>>>  	module_put(bus->owner);
>>
>> Where is this code called from?
>> You can't call it from the phy driver because the driver can be unloaded
>> as soon as the last reference is removed.
>> At that point the code memory is freed.
> 
> [Mao Wenan] it is called by NIC when it is removed, which aims to disconnect one bound phy driver. If this phy driver
> is not used for this NIC, reference count should be subtracted, and phy driver can be removed if there is no user.
> hns_nic_dev_remove->phy_disconnect->phy_detach
> 
> 
> 
>>
>>>  }
>>>  EXPORT_SYMBOL(phy_detach);
>>> --
>>> 2.7.0
>>>
>>
>>
>> .
>>

@Florian Fainelli, what are your comments on this patch?

^ permalink raw reply

* Re: [iproute2 net-next 1/8] lib bpf: Add support for BPF_PROG_ATTACH and BPF_PROG_DETACH
From: Daniel Borkmann @ 2016-12-12  9:14 UTC (permalink / raw)
  To: David Ahern, netdev, stephen
In-Reply-To: <1481503995-24825-2-git-send-email-dsa@cumulusnetworks.com>

On 12/12/2016 01:53 AM, David Ahern wrote:
> Signed-off-by: David Ahern <dsa@cumulusnetworks.com>

Acked-by: Daniel Borkmann <daniel@iogearbox.net>

^ permalink raw reply

* Re: [iproute2 net-next 2/8] bpf: export bpf_prog_load
From: Daniel Borkmann @ 2016-12-12  9:14 UTC (permalink / raw)
  To: David Ahern, netdev, stephen
In-Reply-To: <1481503995-24825-3-git-send-email-dsa@cumulusnetworks.com>

On 12/12/2016 01:53 AM, David Ahern wrote:
> Code move only; no functional change intended.
>
> Signed-off-by: David Ahern <dsa@cumulusnetworks.com>

Acked-by: Daniel Borkmann <daniel@iogearbox.net>

^ permalink raw reply

* Re: [iproute2 net-next 3/8] bpf: Add BPF_ macros
From: Daniel Borkmann @ 2016-12-12  9:15 UTC (permalink / raw)
  To: David Ahern, netdev, stephen
In-Reply-To: <1481503995-24825-4-git-send-email-dsa@cumulusnetworks.com>

On 12/12/2016 01:53 AM, David Ahern wrote:
> Based on version in kernel repo, samples/bpf/libbpf.h
>
> Signed-off-by: David Ahern <dsa@cumulusnetworks.com>

Acked-by: Daniel Borkmann <daniel@iogearbox.net>

^ permalink raw reply

* Re: [PATCH v3 0/4] vsock: cancel connect packets when failing to connect
From: Peng Tao @ 2016-12-12  9:17 UTC (permalink / raw)
  To: Stefan Hajnoczi, Jorgen Hansen
  Cc: netdev@vger.kernel.org, virtualization, David Miller,
	Stefan Hajnoczi, kvm
In-Reply-To: <20161209101842.GD18260@stefanha-x1.localdomain>

On Fri, Dec 9, 2016 at 6:18 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Fri, Dec 09, 2016 at 01:12:32AM +0800, Peng Tao wrote:
>> Currently, if a connect call fails on a signal or timeout (e.g., guest is still
>> in the process of starting up), we'll just return to caller and leave the connect
>> packet queued and they are sent even though the connection is considered a failure,
>> which can confuse applications with unwanted false connect attempt.
>>
>> The patchset enables vsock (both host and guest) to cancel queued packets when
>> a connect attempt is considered to fail.
>>
>> v3 changelog:
>>   - define cancel_pkt callback in struct vsock_transport rather than struct virtio_transport
>>   - rename virtio_vsock_pkt->vsk to virtio_vsock_pkt->cancel_token
>> v2 changelog:
>>   - fix queued_replies counting and resume tx/rx when necessary
>>
>>
>> Peng Tao (4):
>>   vsock: track pkt owner vsock
>>   vhost-vsock: add pkt cancel capability
>>   vsock: add pkt cancel capability
>>   vsock: cancel packets when failing to connect
>>
>>  drivers/vhost/vsock.c                   | 41 ++++++++++++++++++++++++++++++++
>>  include/linux/virtio_vsock.h            |  2 ++
>>  include/net/af_vsock.h                  |  3 +++
>>  net/vmw_vsock/af_vsock.c                | 14 +++++++++++
>>  net/vmw_vsock/virtio_transport.c        | 42 +++++++++++++++++++++++++++++++++
>>  net/vmw_vsock/virtio_transport_common.c |  7 ++++++
>>  6 files changed, 109 insertions(+)
>
> I'm happy although I pointed out two unnecessary (void*) casts.
>
> Please wait for Jorgen to be happy with the af_vsock.c changes before
> applying.
Thanks for reviewing!

Jorgen, would you please see if the changes to af_vsock.c are OK with you?

Cheers,
Tao

^ permalink raw reply

* Re: Designing a safe RX-zero-copy Memory Model for Networking
From: Jesper Dangaard Brouer @ 2016-12-12  9:40 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: netdev@vger.kernel.org, linux-mm, John Fastabend,
	Willem de Bruijn, Björn Töpel, Karlsson, Magnus,
	Alexander Duyck, Mel Gorman, Tom Herbert, Brenden Blanco,
	Tariq Toukan, Saeed Mahameed, Jesse Brandeburg, Kalman Meth,
	brouer
In-Reply-To: <20161212083812.GA19987@rapoport-lnx>


On Mon, 12 Dec 2016 10:38:13 +0200 Mike Rapoport <rppt@linux.vnet.ibm.com> wrote:

> Hello Jesper,
> 
> On Mon, Dec 05, 2016 at 03:31:32PM +0100, Jesper Dangaard Brouer wrote:
> > Hi all,
> > 
> > This is my design for how to safely handle RX zero-copy in the network
> > stack, by using page_pool[1] and modifying NIC drivers.  Safely means
> > not leaking kernel info in pages mapped to userspace and resilience
> > so a malicious userspace app cannot crash the kernel.
> > 
> > Design target
> > =============
> > 
> > Allow the NIC to function as a normal Linux NIC and be shared in a
> > safe manner, between the kernel network stack and an accelerated
> > userspace application using RX zero-copy delivery.
> > 
> > Target is to provide the basis for building RX zero-copy solutions in
> > a memory safe manner.  An efficient communication channel for userspace
> > delivery is out of scope for this document, but OOM considerations are
> > discussed below (`Userspace delivery and OOM`_).  
> 
> Sorry, if this reply is a bit off-topic.

It is very much on topic IMHO :-)

> I'm working on implementation of RX zero-copy for virtio and I've dedicated
> some thought about making guest memory available for physical NIC DMAs.
> I believe this is quite related to your page_pool proposal, at least from
> the NIC driver perspective, so I'd like to share some thoughts here.

Seems quite related. I'm very interested in cooperating with you! I'm
not very familiar with virtio, and how packets/pages get channeled
into virtio.

> The idea is to dedicate one (or more) of the NIC's queues to a VM, e.g.
> using macvtap, and then propagate guest RX memory allocations to the NIC
> using something like new .ndo_set_rx_buffers method.

I believe the page_pool API/design aligns with this idea/use-case.

> What is your view about interface between the page_pool and the NIC
> drivers?

In my Proof-of-Concept implementation, the NIC driver (mlx5) registers
a page_pool per RX queue.  This is done for two reasons (1) performance
and (2) for supporting use-cases where only one single RX-ring queue is
(re)configured to support RX-zero-copy.  There is some extra cost
associated with enabling this mode, thus it makes sense to only enable it
when needed.

I've not decided how this gets enabled, maybe some new driver NDO.  It
could also happen when an XDP program gets loaded, which requests this
feature.

The macvtap solution is nice and we should support it, but it requires
VMs to have their MAC-addr registered on the physical switch.  This
design is about adding flexibility. Registering an XDP eBPF filter
provides the maximum flexibility for matching the destination VM.


> Have you considered using "push" model for setting the NIC's RX memory?

I don't understand what you mean by a "push" model?

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: netlink: GPF in sock_sndtimeo
From: Richard Guy Briggs @ 2016-12-12 10:02 UTC (permalink / raw)
  To: Cong Wang
  Cc: linux-audit, Paul Moore, Dmitry Vyukov, David Miller,
	Johannes Berg, Florian Westphal, Eric Dumazet, Herbert Xu, netdev,
	LKML, syzkaller
In-Reply-To: <CAM_iQpV2GuKhR_1tD5jjACeD+pajJLws08CLmeYAo+rsjxvB0A@mail.gmail.com>

On 2016-12-09 20:13, Cong Wang wrote:
> On Fri, Dec 9, 2016 at 3:01 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2016-12-08 22:57, Cong Wang wrote:
> >> On Thu, Dec 8, 2016 at 10:02 PM, Richard Guy Briggs <rgb@redhat.com> wrote:
> >> > I also tried to extend Cong Wang's idea to attempt to proactively respond to a
> >> > NETLINK_URELEASE on the audit_sock and reset it, but ran into a locking error
> >> > stack dump using mutex_lock(&audit_cmd_mutex) in the notifier callback.
> >> > Eliminating the lock since the sock is dead anyway eliminates the error.
> >> >
> >> > Is it safe?  I'll resubmit if this looks remotely sane.  Meanwhile I'll try to
> >> > get the test case to compile.
> >>
> >> It doesn't look safe, because 'audit_sock', 'audit_nlk_portid' and 'audit_pid'
> >> are updated as a whole and race between audit_receive_msg() and
> >> NETLINK_URELEASE.
> >
> > This is what I expected and why I originally added the mutex lock in the
> > callback...  The dumps I got were bare with no wrapper identifying the
> > process context or specific error, so I'm at a bit of a loss how to
> > solve this (without thinking more about it) other than instinctively
> > removing the mutex.
> 
> Netlink notifier can safely be converted to blocking one, I will send
> a patch.

I had a quick look at how that might happen.  The netlink notifier chain
is atomic.  Would the registered callback function need to spawn a
one-time thread to avoid blocking?

> But I seriously doubt you really need NETLINK_URELEASE here,
> it adds nothing but overhead, b/c the netlink notifier is called on
> every netlink socket in the system, but for net exit path, that is
> relatively a slow path.

I was a bit concerned about its overhead, but was hoping to update
audit_sock more quickly in the case of a sock shutting down for any
reason.

> Also, kauditd_send_skb() needs audit_cmd_mutex too.

Agreed.

> I will send a formal patch.

I had a look at your patch.  It looks attractively simple.  The audit
next tree has patches queued that add an audit_reset function that will
require more work.  I still see some potential gaps.

- If the process messes up (or the sock lookup messes up) it is reset
  in the kauditd thread under the audit_cmd_mutex.

- If the process exits normally or is replaced due to an audit_replace
  error, it is reset from audit_receive_skb under the audit_cmd_mutex.

- If the process dies before the kauditd thread notices, either reap it
  via notifier callback or it needs a check on net exit to reset.  This
  last one appears necessary to decrement the sock refcount so the sock
  can be released in netlink_kernel_release().

If we want to be proactive and use the netlink notifier, we assume the
overhead of adding to the netlink notifier chain and eliminate all the
other reset calls under the kauditd thread.  If we are ok being
reactionary, then we'll at least need the net exit check on audit_sock.

Have I understood this correctly?

I'll follow with a patch based on audit#next

There will be an upstream merge conflict between audit#next and net#next
due to the removal of:
	RCU_INIT_POINTER(aunet->nlsk, NULL);                                                        
	synchronize_net();
from the end of audit_net_exit().  This patch should probably go through
the audit maintainer due to the other anticipated merge conflicts.

> Thanks.

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Kernel Security Engineering, Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635

^ permalink raw reply

* [PATCH v2] audit: use proper refcount locking on audit_sock
From: Richard Guy Briggs @ 2016-12-12 10:03 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-audit
  Cc: Richard Guy Briggs, dvyukov, xiyou.wangcong, edumazet, eparis,
	pmoore, sgrubb
In-Reply-To: <20161212100215.GA1305@madcap2.tricolour.ca>

Resetting audit_sock appears to be racy.

audit_sock was being copied and dereferenced without using a refcount on
the source sock.

Bump the refcount on the underlying sock when we store a reference in
audit_sock and release it when we reset audit_sock.  audit_sock
modification needs the audit_cmd_mutex.

See: https://lkml.org/lkml/2016/11/26/232

Thanks to Eric Dumazet <edumazet@google.com> and Cong Wang
<xiyou.wangcong@gmail.com> for ideas on how to fix it.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
There has been a lot of change in the audit code that is about to go
upstream to address audit queue issues.  This patch is based on the
source tree: git://git.infradead.org/users/pcmoore/audit#next
---
 kernel/audit.c |   34 ++++++++++++++++++++++++++++------
 1 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/kernel/audit.c b/kernel/audit.c
index f20eee0..439f7f3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -452,7 +452,9 @@ static void auditd_reset(void)
 	struct sk_buff *skb;
 
 	/* break the connection */
+	sock_put(audit_sock);
 	audit_pid = 0;
+	audit_nlk_portid = 0;
 	audit_sock = NULL;
 
 	/* flush all of the retry queue to the hold queue */
@@ -478,6 +480,12 @@ static int kauditd_send_unicast_skb(struct sk_buff *skb)
 	if (rc >= 0) {
 		consume_skb(skb);
 		rc = 0;
+	} else {
+		if (rc & (-ENOMEM|-EPERM|-ECONNREFUSED)) {
+			mutex_lock(&audit_cmd_mutex);
+			auditd_reset();
+			mutex_unlock(&audit_cmd_mutex);
+		}
 	}
 
 	return rc;
@@ -579,7 +587,9 @@ static int kauditd_thread(void *dummy)
 
 				auditd = 0;
 				if (AUDITD_BAD(rc, reschedule)) {
+					mutex_lock(&audit_cmd_mutex);
 					auditd_reset();
+					mutex_unlock(&audit_cmd_mutex);
 					reschedule = 0;
 				}
 			} else
@@ -594,7 +604,9 @@ static int kauditd_thread(void *dummy)
 				auditd = 0;
 				if (AUDITD_BAD(rc, reschedule)) {
 					kauditd_hold_skb(skb);
+					mutex_lock(&audit_cmd_mutex);
 					auditd_reset();
+					mutex_unlock(&audit_cmd_mutex);
 					reschedule = 0;
 				} else
 					/* temporary problem (we hope), queue
@@ -623,7 +635,9 @@ quick_loop:
 				if (rc) {
 					auditd = 0;
 					if (AUDITD_BAD(rc, reschedule)) {
+						mutex_lock(&audit_cmd_mutex);
 						auditd_reset();
+						mutex_unlock(&audit_cmd_mutex);
 						reschedule = 0;
 					}
 
@@ -1004,17 +1018,22 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				return -EACCES;
 			}
 			if (audit_pid && new_pid &&
-			    audit_replace(requesting_pid) != -ECONNREFUSED) {
+			    (audit_replace(requesting_pid) & (-ECONNREFUSED|-EPERM|-ENOMEM))) {
 				audit_log_config_change("audit_pid", new_pid, audit_pid, 0);
 				return -EEXIST;
 			}
 			if (audit_enabled != AUDIT_OFF)
 				audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
-			audit_pid = new_pid;
-			audit_nlk_portid = NETLINK_CB(skb).portid;
-			audit_sock = skb->sk;
-			if (!new_pid)
+			if (new_pid) {
+				if (audit_sock)
+					sock_put(audit_sock);
+				audit_pid = new_pid;
+				audit_nlk_portid = NETLINK_CB(skb).portid;
+				sock_hold(skb->sk);
+				audit_sock = skb->sk;
+			} else {
 				auditd_reset();
+			}
 			wake_up_interruptible(&kauditd_wait);
 		}
 		if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
@@ -1283,8 +1302,11 @@ static void __net_exit audit_net_exit(struct net *net)
 {
 	struct audit_net *aunet = net_generic(net, audit_net_id);
 	struct sock *sock = aunet->nlsk;
-	if (sock == audit_sock)
+	if (sock == audit_sock) {
+		mutex_lock(&audit_cmd_mutex);
 		auditd_reset();
+		mutex_unlock(&audit_cmd_mutex);
+	}
 
 	RCU_INIT_POINTER(aunet->nlsk, NULL);
 	synchronize_net();
-- 
1.7.1

^ permalink raw reply related

* Re: netlink: GPF in sock_sndtimeo
From: Dmitry Vyukov @ 2016-12-12 10:07 UTC (permalink / raw)
  To: syzkaller
  Cc: Richard Guy Briggs, linux-audit, Paul Moore, David Miller,
	Johannes Berg, Florian Westphal, Eric Dumazet, Herbert Xu, netdev,
	LKML
In-Reply-To: <CAM_iQpVcHGywXn90EpiSz-LsUDgKVqs-7BY-L7UBCu2VxkC31Q@mail.gmail.com>

On Sat, Dec 10, 2016 at 8:40 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>> On 2016-12-08 22:57, Cong Wang wrote:
>>>> On Thu, Dec 8, 2016 at 10:02 PM, Richard Guy Briggs <rgb@redhat.com> wrote:
>>>> > I also tried to extend Cong Wang's idea to attempt to proactively respond to a
>>>> > NETLINK_URELEASE on the audit_sock and reset it, but ran into a locking error
>>>> > stack dump using mutex_lock(&audit_cmd_mutex) in the notifier callback.
>>>> > Eliminating the lock since the sock is dead anyway eliminates the error.
>>>> >
>>>> > Is it safe?  I'll resubmit if this looks remotely sane.  Meanwhile I'll try to
>>>> > get the test case to compile.
>>>>
>>>> It doesn't look safe, because 'audit_sock', 'audit_nlk_portid' and 'audit_pid'
>>>> are updated as a whole and race between audit_receive_msg() and
>>>> NETLINK_URELEASE.
>>>
>>> This is what I expected and why I originally added the mutex lock in the
>>> callback...  The dumps I got were bare with no wrapper identifying the
>>> process context or specific error, so I'm at a bit of a loss how to
>>> solve this (without thinking more about it) other than instinctively
>>> removing the mutex.
>>
>> Netlink notifier can safely be converted to blocking one, I will send
>> a patch.
>>
>> But I seriously doubt you really need NETLINK_URELEASE here,
>> it adds nothing but overhead, b/c the netlink notifier is called on
>> every netlink socket in the system, but for net exit path, that is
>> relatively a slow path.
>>
>> Also, kauditd_send_skb() needs audit_cmd_mutex too.
>
> Please let me know what you think about the attached patch?

Applied the patch locally and have not seen the bug since then (~24
hours of testing).

^ permalink raw reply

* Re: Synopsys Ethernet QoS
From: Joao Pinto @ 2016-12-12 10:19 UTC (permalink / raw)
  To: Florian Fainelli, Andy Shevchenko
  Cc: David Miller, Joao Pinto, Giuseppe CAVALLARO, lars.persson,
	rabin.vincent, netdev, CARLOS.PALMINHA, Jie.Deng1
In-Reply-To: <556353b7-c847-7549-626d-3c324063647e@gmail.com>

Hi,

Às 1:44 AM de 12/10/2016, Florian Fainelli escreveu:
> Le 12/09/16 à 16:16, Andy Shevchenko a écrit :
>> On Sat, Dec 10, 2016 at 12:52 AM, Florian Fainelli <f.fainelli@gmail.com> wrote:
>>
>>> It's kind of sad that customers of that IP (stmmac, amd-xgbe, sxgbe)
>>
>>> did
>>> actually pioneer the upstreaming effort, but it is good to see people
>>> from Synopsys willing to fix that in the future.
>>
>> Wait, you would like to tell that we have more than 2 drivers for the
>> same (okay, same vendor) IP?!
>> It's better to unify them earlier, than have n+ copies.
> 
> Unfortunately that is the case, see this email:
> 
> https://www.mail-archive.com/netdev@vger.kernel.org/msg142796.html
> 
> dwc_eth_qos and stmmac have some overlap. There seems to be work
> underway to unify these two to begin with.
> 
>>
>> P.S. Though, I don't see how sxgbe got in the list. First glance on
>> the code doesn't show similarities.
> 
> Well samsung/sxgbe looks potentially similar to amd/xgbe, but that's
> just my cursory look at the code, it may very well be something entirely
> different. The descriptor formats just look suspiciously similar.
> 

Thank you for your inputs! Renaming seems to be a hotspot. I agree that maybe
instead of renaming (breaking retro-compatibility as David and Florian
mentioned), the best is to move stmmac to synopsys/ after merging *qos* and
removing it. As Florian mentioned, git is capable of detecting folder restructuring.

@Rabin Vincent: Hi Rabin. Since Axis is more familiar with the synopsys/*qos*
driver would it be possible for you to make an initial analysis of what has to
be merged into Stmmac? This way the development would speed-up.

Thanks to all.

Joao

^ permalink raw reply

* RE: [RFC PATCH net-next v3 1/2] macb: Add 1588 support in Cadence GEM.
From: Andrei.Pistirica @ 2016-12-12 10:22 UTC (permalink / raw)
  To: richardcochran, harini.katakam, rafalo
  Cc: netdev, linux-kernel, linux-arm-kernel, davem, nicolas.ferre,
	harinikatakamlinux, punnaia, michals, anirudh, boris.brezillon,
	alexandre.belloni, tbultel
In-Reply-To: <BN3PR07MB2516992DEE883FAD6DC3ED08C9870@BN3PR07MB2516.namprd07.prod.outlook.com>



> -----Original Message-----
> From: Rafal Ozieblo [mailto:rafalo@cadence.com]
> Sent: Friday, December 09, 2016 11:20 AM
> To: Andrei Pistirica - M16132; richardcochran@gmail.com
> Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org; linux-arm-
> kernel@lists.infradead.org; davem@davemloft.net;
> nicolas.ferre@atmel.com; harinikatakamlinux@gmail.com;
> harini.katakam@xilinx.com; punnaia@xilinx.com; michals@xilinx.com;
> anirudh@xilinx.com; boris.brezillon@free-electrons.com;
> alexandre.belloni@free-electrons.com; tbultel@pixelsurmer.com
> Subject: RE: [RFC PATCH net-next v3 1/2] macb: Add 1588 support in Cadence
> GEM.
> 
> -----Original Message-----
> > From: Andrei.Pistirica@microchip.com
> > [mailto:Andrei.Pistirica@microchip.com]
> > Sent: 8 grudnia 2016 15:42
> > To: richardcochran@gmail.com
> > Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
> > linux-arm-kernel@lists.infradead.org; davem@davemloft.net;
> > nicolas.ferre@atmel.com; harinikatakamlinux@gmail.com;
> > harini.katakam@xilinx.com; punnaia@xilinx.com; michals@xilinx.com;
> > anirudh@xilinx.com; boris.brezillon@free-electrons.com;
> > alexandre.belloni@free-electrons.com; tbultel@pixelsurmer.com; Rafal
> > Ozieblo
> > Subject: RE: [RFC PATCH net-next v3 1/2] macb: Add 1588 support in
> Cadence GEM.
> >
> >
> >
> > > -----Original Message-----
> > > From: Richard Cochran [mailto:richardcochran@gmail.com]
> > > Sent: Wednesday, December 07, 2016 11:04 PM
> > > To: Andrei Pistirica - M16132
> > > Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org; linux-arm-
> > > kernel@lists.infradead.org; davem@davemloft.net;
> > > nicolas.ferre@atmel.com; harinikatakamlinux@gmail.com;
> > > harini.katakam@xilinx.com; punnaia@xilinx.com; michals@xilinx.com;
> > > anirudh@xilinx.com; boris.brezillon@free-electrons.com;
> > > alexandre.belloni@free-electrons.com; tbultel@pixelsurmer.com;
> > > rafalo@cadence.com
> > > Subject: Re: [RFC PATCH net-next v3 1/2] macb: Add 1588 support in
> > > Cadence GEM.
> > >
> > > On Wed, Dec 07, 2016 at 08:39:09PM +0100, Richard Cochran wrote:
> > > > > +static s32 gem_ptp_max_adj(unsigned int f_nom) {
> > > > > +       u64 adj;
> > > > > +
> > > > > +       /* The 48 bits of seconds for the GEM overflows every:
> > > > > +        * 2^48/(365.25 * 24 * 60 *60) =~ 8 925 512 years (~= 9 mil years),
> > > > > +        * thus the maximum adjust frequency must not overflow
> > > > > + CNS
> > > register:
> > > > > +        *
> > > > > +        * addend  = 10^9/nominal_freq
> > > > > +        * adj_max = +/- addend*ppb_max/10^9
> > > > > +        * max_ppb = (2^8-1)*nominal_freq-10^9
> > > > > +        */
> > > > > +       adj = f_nom;
> > > > > +       adj *= 0xffff;
> > > > > +       adj -= 1000000000ULL;
> > > >
> > > > What is this computation, and how does it relate to the comment?
> >
> > I considered the following simple equation: increment value at nominal
> frequency (which is 10^9/nominal frequency nsecs) + the maximum drift
> value (nsecs) <= maximum increment value at nominal frequency (which is
> 8bit:0xffff).
> > If maximum drift is written as function of nominal frequency and
> maximum ppb, then the equation above yields that the maximum ppb is:
> (2^8 - 1) *nominal_frequency - 10^9. The equation is also simplified by the
> fact that the drift is written as ppm + 16bit_fractions and the increment
> value is written as nsec + 16bit_fractions.
> >
> > Rafal said that this value is hardcoded: 0x64E6, while Harini said:
> 250000000.
> 
> To clarify a little bit. In my reference code this value (0x64E6) was taken
> from our legacy code. It was used for testing only. I know it should be
> changed to something more accurate. This is the reason why I asked how did
> you count it (250000000). According to our calculations this value depends
> on actual set period (incr_ns and incr_sub_ns) and min and max value we
> can set. The calculations were a little bit intricate, so we decided to leave it
> as it was.
> 
> >
> > I need to dig into this...
> >
> > >
> > > I am not sure what you meant, but it sounds like you are on the wrong
> track.
> > > Let me explain...
> >
> > Thanks.
> >
> > >
> > > The max_adj has nothing at all to do with the width of the time register.
> > > Rather, it should reflect the maximum possible change in the tuning
> word.
> > >
> > > For example, with a nominal 8 ns period, the tuning word is 0x80000.
> > > Looking at running the clock more slowly, the slowest possible word
> > > is 0x00001, meaning a difference of 0x7FFFF.  This implies an
> > > adjustment of
> > > 0x7FFFF/0x80000 or 999998092 ppb.  Running more quickly, we can
> > > already have 0x100000, twice as fast, or just under 2 billion ppb.
> > >
> > > You should consider the extreme cases to determine the most limited
> > > (smallest) max_adj value:
> > >
> > > Case 1 - high frequency
> > > ~~~~~~~~~~~~~~~~~~~~~~~
> > >
> > > With a nominal 1 ns period, we have the nominal tuning word 0x10000.
> > > The smallest is 0x1 for a difference of 0xFFFF.  This corresponds to
> > > an adjustment of 0xFFFF/0x10000 = .9999847412109375 or 999984741 ppb.
> > >
> > > Case 2 - low frequency
> > > ~~~~~~~~~~~~~~~~~~~~~~
> > >
> > > With a nominal 255 ns period, the nominal word is 0xFF0000, the
> > > largest 0xFFFFFF, and the difference is 0xFFFF.  This corresponds to
> > > an adjustment of 0xFFFF/0xFF0000 = .0039215087890625 or 3921508 ppb.
> > >
> > > Since 3921508 ppb is a huge adjustment, you can simply use that as a
> > > safe maximum, ignoring the actual input clock.
> > >
> > > Thanks,
> > > Richard
> > >
> > >
> >
> > Regards,
> > Andrei
> >
> 
> Best regards,
> Rafal Ozieblo   |   Firmware System Engineer,
> phone nbr.: +48 32 5085469
> www.cadence.com

Hi Guys,

Based on Richard's input, this is what I want to do for our platforms:

struct macb_ptp_info {
        void (*ptp_init)(struct net_device *ndev);
        void (*ptp_remove)(struct net_device *ndev);
+       s32 (*get_ptp_max_adj)(void);
        unsigned int (*get_tsu_rate)(struct macb *bp);
        int (*get_ts_info)(struct net_device *dev,
                           struct ethtool_ts_info *info);
       int (*get_hwtst)(struct net_device *netdev,
                         struct ifreq *ifr);
       int (*set_hwtst)(struct net_device *netdev,
                         struct ifreq *ifr, int cmd); 
};

+static s32 gem_get_ptp_max_adj(void)
+{
+       return 3921508;
+}

 static struct macb_ptp_info gem_ptp_info = {
       .ptp_init        = gem_ptp_init,
       .ptp_remove      = gem_ptp_remove,
+       .get_ptp_max_adj = gem_get_ptp_max_adj,
       .get_tsu_rate    = gem_get_tsu_rate,
       .get_ts_info     = gem_get_ts_info,
       .get_hwtst       = gem_get_hwtst,
       .set_hwtst       = gem_set_hwtst,
 };

[...]
void gem_ptp_init(struct net_device *ndev)
 {
[...]
        /* nominal frequency and maximum adjustment in ppb */
        bp->tsu_rate = bp->ptp_info->get_tsu_rate(bp);
+       bp->ptp_caps.max_adj = bp->ptp_info->get_ptp_max_adj();
[...]
}

Richard, do you agree with this?

Harini, you can fill the callback with the value for your platform. Tell me if you are ok with the function's signature.

Regards,
Andrei

^ permalink raw reply

* Re: 4.9.0-rc8: tg3 dead after resume
From: Siva Reddy Kallam @ 2016-12-12 10:23 UTC (permalink / raw)
  To: Billy Shuman; +Cc: Michael Chan, Netdev

On Fri, Dec 9, 2016 at 7:59 PM, Billy Shuman <wshuman3@gmail.com> wrote:
> On Thu, Dec 8, 2016 at 4:03 AM, Siva Reddy Kallam
> <siva.kallam@broadcom.com> wrote:
>> On Thu, Dec 8, 2016 at 12:14 AM, Billy Shuman <wshuman3@gmail.com> wrote:
>>> On Wed, Dec 7, 2016 at 12:37 PM, Michael Chan <michael.chan@broadcom.com> wrote:
>>>> On Wed, Dec 7, 2016 at 7:20 AM, Billy Shuman <wshuman3@gmail.com> wrote:
>>>>> After resume on 4.9.0-rc8 tg3 is dead.
>>>>>
>>>>> In logs I see:
>>>>> kernel: tg3 0000:44:00.0: phy probe failed, err -19
>>>>> kernel: tg3 0000:44:00.0: Problem fetching invariants of chip, aborting
>>>>
>>>> -19 is -ENODEV which means tg3 cannot read the PHY ID.
>>>>
>>>> If it's a true suspend/resume operation, the driver does not have to
>>>> go through probe during resume.  Please explain how you do
>>>> suspend/resume.
>>>>
>>>
>>> Sorry my previous message was accidentally sent too early.
>>>
>>> I used systemd (systemctl suspend) to suspend.
>>>
>> We need more information to proceed further.
>> Without suspend, Are you able to use the tg3 port?
>
> Yes the port works fine without suspend.
OK
>
>> Which Broadcom card are you having in laptop?
>
> The nic is a NetXtreme BCM57762 Gigabit Ethernet PCIe in a thunderbolt3 dock.
>
OK
>> Please provide complete tg3 specific logs in dmesg.
>>
>
> [   32.084010] tg3.c:v3.137 (May 11, 2014)
> [   32.124695] tg3 0000:44:00.0 eth0: Tigon3 [partno(BCM957762) rev
> 57766001] (PCI Express) MAC address 98:e7:f4:8b:13:19
> [   32.124698] tg3 0000:44:00.0 eth0: attached PHY is 57765
> (10/100/1000Base-T Ethernet) (WireSpeed[1], EEE[1])
> [   32.124699] tg3 0000:44:00.0 eth0: RXcsums[1] LinkChgREG[0]
> MIirq[0] ASF[0] TSOcap[1]
> [   32.124700] tg3 0000:44:00.0 eth0: dma_rwctrl[00000001] dma_mask[64-bit]
> [   32.219764] tg3 0000:44:00.0 enp68s0: renamed from eth0
> [   36.219245] tg3 0000:44:00.0 enp68s0: Link is up at 1000 Mbps, full duplex
> [   36.219250] tg3 0000:44:00.0 enp68s0: Flow control is on for TX and on for RX
> [   36.219251] tg3 0000:44:00.0 enp68s0: EEE is disabled
>
> after resume
> [   92.292838] tg3 0000:44:00.0 enp68s0: No firmware running
> [   93.521744] tg3 0000:44:00.0: tg3_abort_hw timed out,
> TX_MODE_ENABLE will not clear MAC_TX_MODE=ffffffff
> [  106.704655] tg3 0000:44:00.0 enp68s0: Link is down
> [  108.370356] tg3 0000:44:00.0: tg3_abort_hw timed out,
> TX_MODE_ENABLE will not clear MAC_TX_MODE=ffffffff
>
> after rmmod, modprobe
> [  570.933636] tg3 0000:44:00.0: tg3_abort_hw timed out,
> TX_MODE_ENABLE will not clear MAC_TX_MODE=ffffffff
> [  604.847215] tg3.c:v3.137 (May 11, 2014)
> [  605.010075] tg3 0000:44:00.0: phy probe failed, err -19
> [  605.010077] tg3 0000:44:00.0: Problem fetching invariants of chip, aborting
>
>
>
>
We will try to reproduce and update you on this.
>>>> Did this work before?  There has been very few changes to tg3 recently.
>>>>
>>>
>>> This is a new laptop for me, but the same behavior is seen on 4.4.36 and 4.8.12.
>>>
>>>>>
>>>>> rmmod and modprobe does not fix the problem only a reboot resolves the issue.
>>>>>
>>>>> Billy

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox