Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH iproute2 3/3] ss: add AF_VSOCK support
From: Stefan Hajnoczi @ 2017-10-03 17:57 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, Jorgen Hansen, Dexuan Cui, Stefan Hajnoczi
In-Reply-To: <20171003175744.24987-1-stefanha@redhat.com>

The AF_VSOCK address family is a host<->guest communications channel
supported by VMware, KVM, and Hyper-V.  Initial VMware support was
released in Linux 3.9 in 2013 and transports for other hypervisors were
added later.

AF_VSOCK addresses are <u32 cid, u32 port> tuples.  The 32-bit cid
integer is comparable to an IP address.  AF_VSOCK ports work like
TCP/UDP ports.

Both SOCK_STREAM and SOCK_DGRAM socket types are available.

This patch adds AF_VSOCK support to ss(8) so that sockets can be
observed.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 misc/ss.c     | 184 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 man/man8/ss.8 |   8 ++-
 2 files changed, 188 insertions(+), 4 deletions(-)

diff --git a/misc/ss.c b/misc/ss.c
index 12a31c90..164356a0 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -44,6 +44,7 @@
 #include <linux/packet_diag.h>
 #include <linux/netlink_diag.h>
 #include <linux/sctp.h>
+#include <linux/vm_sockets_diag.h>
 
 #define MAGIC_SEQ 123456
 
@@ -126,6 +127,8 @@ enum {
 	PACKET_R_DB,
 	NETLINK_DB,
 	SCTP_DB,
+	VSOCK_ST_DB,
+	VSOCK_DG_DB,
 	MAX_DB
 };
 
@@ -134,6 +137,7 @@ enum {
 #define ALL_DB ((1<<MAX_DB)-1)
 #define INET_L4_DBM ((1<<TCP_DB)|(1<<UDP_DB)|(1<<DCCP_DB)|(1<<SCTP_DB))
 #define INET_DBM (INET_L4_DBM | (1<<RAW_DB))
+#define VSOCK_DBM ((1<<VSOCK_ST_DB)|(1<<VSOCK_DG_DB))
 
 enum {
 	SS_UNKNOWN,
@@ -222,6 +226,14 @@ static const struct filter default_dbs[MAX_DB] = {
 		.states   = SS_CONN,
 		.families = FAMILY_MASK(AF_INET) | FAMILY_MASK(AF_INET6),
 	},
+	[VSOCK_ST_DB] = {
+		.states   = SS_CONN,
+		.families = FAMILY_MASK(AF_VSOCK),
+	},
+	[VSOCK_DG_DB] = {
+		.states   = SS_CONN,
+		.families = FAMILY_MASK(AF_VSOCK),
+	},
 };
 
 static const struct filter default_afs[AF_MAX] = {
@@ -245,6 +257,10 @@ static const struct filter default_afs[AF_MAX] = {
 		.dbs    = (1 << NETLINK_DB),
 		.states = (1 << SS_CLOSE),
 	},
+	[AF_VSOCK] = {
+		.dbs    = VSOCK_DBM,
+		.states = SS_CONN,
+	},
 };
 
 static int do_default = 1;
@@ -283,6 +299,8 @@ static void filter_default_dbs(struct filter *f)
 	filter_db_set(f, PACKET_DG_DB);
 	filter_db_set(f, NETLINK_DB);
 	filter_db_set(f, SCTP_DB);
+	filter_db_set(f, VSOCK_ST_DB);
+	filter_db_set(f, VSOCK_DG_DB);
 }
 
 static void filter_states_set(struct filter *f, int states)
@@ -791,6 +809,18 @@ static const char *proto_name(int protocol)
 	return "???";
 }
 
+static const char *vsock_netid_name(int type)
+{
+	switch (type) {
+	case SOCK_STREAM:
+		return "v_str";
+	case SOCK_DGRAM:
+		return "v_dgr";
+	default:
+		return "???";
+	}
+}
+
 static void sock_state_print(struct sockstat *s)
 {
 	const char *sock_name;
@@ -823,6 +853,9 @@ static void sock_state_print(struct sockstat *s)
 	case AF_NETLINK:
 		sock_name = "nl";
 		break;
+	case AF_VSOCK:
+		sock_name = vsock_netid_name(s->type);
+		break;
 	default:
 		sock_name = "unknown";
 	}
@@ -1149,6 +1182,8 @@ static int run_ssfilter(struct ssfilter *f, struct sockstat *s)
 			return s->lport == 0 && s->local.data[0] == 0;
 		if (s->local.family == AF_NETLINK)
 			return s->lport < 0;
+		if (s->local.family == AF_VSOCK)
+			return s->lport > 1023;
 
 		return is_ephemeral(s->lport);
 	}
@@ -1524,6 +1559,15 @@ void *parse_devcond(char *name)
 	return res;
 }
 
+static void vsock_set_inet_prefix(inet_prefix *a, __u32 cid)
+{
+	*a = (inet_prefix){
+		.bytelen = sizeof(cid),
+		.family = AF_VSOCK,
+	};
+	memcpy(a->data, &cid, sizeof(cid));
+}
+
 void *parse_hostcond(char *addr, bool is_port)
 {
 	char *port = NULL;
@@ -1598,6 +1642,37 @@ void *parse_hostcond(char *addr, bool is_port)
 		goto out;
 	}
 
+	if (fam == AF_VSOCK || strncmp(addr, "vsock:", 6) == 0) {
+		__u32 cid = ~(__u32)0;
+
+		a.addr.family = AF_VSOCK;
+		if (strncmp(addr, "vsock:", 6) == 0)
+			addr += 6;
+
+		if (is_port)
+			port = addr;
+		else {
+			port = strchr(addr, ':');
+			if (port) {
+				*port = '\0';
+				port++;
+			}
+		}
+
+		if (port && strcmp(port, "*") &&
+		    get_u32((__u32 *)&a.port, port, 0))
+			return NULL;
+
+		if (addr[0] && strcmp(addr, "*")) {
+			a.addr.bitlen = 32;
+			if (get_u32(&cid, addr, 0))
+				return NULL;
+		}
+		vsock_set_inet_prefix(&a.addr, cid);
+		fam = AF_VSOCK;
+		goto out;
+	}
+
 	if (fam == AF_INET || !strncmp(addr, "inet:", 5)) {
 		fam = AF_INET;
 		if (!strncmp(addr, "inet:", 5))
@@ -3674,6 +3749,88 @@ static int netlink_show(struct filter *f)
 	return 0;
 }
 
+static bool vsock_type_skip(struct sockstat *s, struct filter *f)
+{
+	if (s->type == SOCK_STREAM && !(f->dbs & (1 << VSOCK_ST_DB)))
+		return true;
+	if (s->type == SOCK_DGRAM && !(f->dbs & (1 << VSOCK_DG_DB)))
+		return true;
+	return false;
+}
+
+static void vsock_addr_print(inet_prefix *a, __u32 port)
+{
+	char cid_str[sizeof("4294967295")];
+	char port_str[sizeof("4294967295")];
+	__u32 cid;
+
+	memcpy(&cid, a->data, sizeof(cid));
+
+	if (cid == ~(__u32)0)
+		snprintf(cid_str, sizeof(cid_str), "*");
+	else
+		snprintf(cid_str, sizeof(cid_str), "%u", cid);
+
+	if (port == ~(__u32)0)
+		snprintf(port_str, sizeof(port_str), "*");
+	else
+		snprintf(port_str, sizeof(port_str), "%u", port);
+
+	sock_addr_print(cid_str, ":", port_str, NULL);
+}
+
+static void vsock_stats_print(struct sockstat *s, struct filter *f)
+{
+	sock_state_print(s);
+
+	vsock_addr_print(&s->local, s->lport);
+	vsock_addr_print(&s->remote, s->rport);
+
+	proc_ctx_print(s);
+
+	printf("\n");
+}
+
+static int vsock_show_sock(const struct sockaddr_nl *addr,
+			   struct nlmsghdr *nlh, void *arg)
+{
+	struct filter *f = (struct filter *)arg;
+	struct vsock_diag_msg *r = NLMSG_DATA(nlh);
+	struct sockstat stat = {
+		.type = r->vdiag_type,
+		.lport = r->vdiag_src_port,
+		.rport = r->vdiag_dst_port,
+		.state = r->vdiag_state,
+		.ino = r->vdiag_ino,
+	};
+
+	vsock_set_inet_prefix(&stat.local, r->vdiag_src_cid);
+	vsock_set_inet_prefix(&stat.remote, r->vdiag_dst_cid);
+
+	if (vsock_type_skip(&stat, f))
+		return 0;
+
+	if (f->f && run_ssfilter(f->f, &stat) == 0)
+		return 0;
+
+	vsock_stats_print(&stat, f);
+
+	return 0;
+}
+
+static int vsock_show(struct filter *f)
+{
+	DIAG_REQUEST(req, struct vsock_diag_req r);
+
+	if (!filter_af_get(f, AF_VSOCK))
+		return 0;
+
+	req.r.sdiag_family = AF_VSOCK;
+	req.r.vdiag_states = f->states;
+
+	return handle_netlink_request(f, &req.nlh, sizeof(req), vsock_show_sock);
+}
+
 struct sock_diag_msg {
 	__u8 sdiag_family;
 };
@@ -3694,6 +3851,8 @@ static int generic_show_sock(const struct sockaddr_nl *addr,
 		return packet_show_sock(addr, nlh, arg);
 	case AF_NETLINK:
 		return netlink_show_sock(addr, nlh, arg);
+	case AF_VSOCK:
+		return vsock_show_sock(addr, nlh, arg);
 	default:
 		return -1;
 	}
@@ -3921,14 +4080,15 @@ static void _usage(FILE *dest)
 "   -d, --dccp          display only DCCP sockets\n"
 "   -w, --raw           display only RAW sockets\n"
 "   -x, --unix          display only Unix domain sockets\n"
+"       --vsock         display only vsock sockets\n"
 "   -f, --family=FAMILY display sockets of type FAMILY\n"
-"       FAMILY := {inet|inet6|link|unix|netlink|help}\n"
+"       FAMILY := {inet|inet6|link|unix|netlink|vsock|help}\n"
 "\n"
 "   -K, --kill          forcibly close sockets, display what was closed\n"
 "   -H, --no-header     Suppress header line\n"
 "\n"
 "   -A, --query=QUERY, --socket=QUERY\n"
-"       QUERY := {all|inet|tcp|udp|raw|unix|unix_dgram|unix_stream|unix_seqpacket|packet|netlink}[,QUERY]\n"
+"       QUERY := {all|inet|tcp|udp|raw|unix|unix_dgram|unix_stream|unix_seqpacket|packet|netlink|vsock_stream|vsock_dgram}[,QUERY]\n"
 "\n"
 "   -D, --diag=FILE     Dump raw information about TCP sockets to FILE\n"
 "   -F, --filter=FILE   read filter information from FILE\n"
@@ -4001,6 +4161,9 @@ static int scan_state(const char *state)
 	exit(-1);
 }
 
+/* Values 'v' and 'V' are already used so a non-character is used */
+#define OPT_VSOCK 256
+
 static const struct option long_opts[] = {
 	{ "numeric", 0, 0, 'n' },
 	{ "resolve", 0, 0, 'r' },
@@ -4017,6 +4180,7 @@ static const struct option long_opts[] = {
 	{ "udp", 0, 0, 'u' },
 	{ "raw", 0, 0, 'w' },
 	{ "unix", 0, 0, 'x' },
+	{ "vsock", 0, 0, OPT_VSOCK },
 	{ "all", 0, 0, 'a' },
 	{ "listening", 0, 0, 'l' },
 	{ "ipv4", 0, 0, '4' },
@@ -4102,6 +4266,9 @@ int main(int argc, char *argv[])
 		case 'x':
 			filter_af_set(&current_filter, AF_UNIX);
 			break;
+		case OPT_VSOCK:
+			filter_af_set(&current_filter, AF_VSOCK);
+			break;
 		case 'a':
 			state_filter = SS_ALL;
 			break;
@@ -4128,6 +4295,8 @@ int main(int argc, char *argv[])
 				filter_af_set(&current_filter, AF_UNIX);
 			else if (strcmp(optarg, "netlink") == 0)
 				filter_af_set(&current_filter, AF_NETLINK);
+			else if (strcmp(optarg, "vsock") == 0)
+				filter_af_set(&current_filter, AF_VSOCK);
 			else if (strcmp(optarg, "help") == 0)
 				help();
 			else {
@@ -4193,6 +4362,15 @@ int main(int argc, char *argv[])
 					filter_db_set(&current_filter, PACKET_DG_DB);
 				} else if (strcmp(p, "netlink") == 0) {
 					filter_db_set(&current_filter, NETLINK_DB);
+				} else if (strcmp(p, "vsock") == 0) {
+					filter_db_set(&current_filter, VSOCK_ST_DB);
+					filter_db_set(&current_filter, VSOCK_DG_DB);
+				} else if (strcmp(p, "vsock_stream") == 0 ||
+					   strcmp(p, "v_str") == 0) {
+					filter_db_set(&current_filter, VSOCK_ST_DB);
+				} else if (strcmp(p, "vsock_dgram") == 0 ||
+					   strcmp(p, "v_dgr") == 0) {
+					filter_db_set(&current_filter, VSOCK_DG_DB);
 				} else {
 					fprintf(stderr, "ss: \"%s\" is illegal socket table id\n", p);
 					usage();
@@ -4408,6 +4586,8 @@ int main(int argc, char *argv[])
 		dccp_show(&current_filter);
 	if (current_filter.dbs & (1<<SCTP_DB))
 		sctp_show(&current_filter);
+	if (current_filter.dbs & VSOCK_DBM)
+		vsock_show(&current_filter);
 
 	if (show_users || show_proc_ctx || show_sock_ctx)
 		user_ent_destroy();
diff --git a/man/man8/ss.8 b/man/man8/ss.8
index 3bec97f0..3af509e9 100644
--- a/man/man8/ss.8
+++ b/man/man8/ss.8
@@ -125,14 +125,18 @@ Display Unix domain sockets (alias for -f unix).
 .B \-S, \-\-sctp
 Display SCTP sockets.
 .TP
+.B \-\-vsock
+Display vsock sockets (alias for -f vsock).
+.TP
 .B \-f FAMILY, \-\-family=FAMILY
 Display sockets of type FAMILY.
-Currently the following families are supported: unix, inet, inet6, link, netlink.
+Currently the following families are supported: unix, inet, inet6, link, netlink, vsock.
 .TP
 .B \-A QUERY, \-\-query=QUERY, \-\-socket=QUERY
 List of socket tables to dump, separated by commas. The following identifiers
 are understood: all, inet, tcp, udp, raw, unix, packet, netlink, unix_dgram,
-unix_stream, unix_seqpacket, packet_raw, packet_dgram, dccp, sctp.
+unix_stream, unix_seqpacket, packet_raw, packet_dgram, dccp, sctp,
+vsock_stream, vsock_dgram.
 .TP
 .B \-D FILE, \-\-diag=FILE
 Do not display anything, just dump raw information about TCP sockets to FILE after applying filters. If FILE is - stdout is used.
-- 
2.13.6

^ permalink raw reply related

* [PATCH iproute2 2/3] include: add <linux/vm_sockets_diag.h>
From: Stefan Hajnoczi @ 2017-10-03 17:57 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, Jorgen Hansen, Dexuan Cui, Stefan Hajnoczi
In-Reply-To: <20171003175744.24987-1-stefanha@redhat.com>

This new Linux header file defines the sock_diag interface used by
AF_VSOCK.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/linux/vm_sockets_diag.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 include/linux/vm_sockets_diag.h

diff --git a/include/linux/vm_sockets_diag.h b/include/linux/vm_sockets_diag.h
new file mode 100644
index 00000000..14cd7dc5
--- /dev/null
+++ b/include/linux/vm_sockets_diag.h
@@ -0,0 +1,33 @@
+/* AF_VSOCK sock_diag(7) interface for querying open sockets */
+
+#ifndef _UAPI__VM_SOCKETS_DIAG_H__
+#define _UAPI__VM_SOCKETS_DIAG_H__
+
+#include <linux/types.h>
+
+/* Request */
+struct vsock_diag_req {
+	__u8	sdiag_family;	/* must be AF_VSOCK */
+	__u8	sdiag_protocol;	/* must be 0 */
+	__u16	pad;		/* must be 0 */
+	__u32	vdiag_states;	/* query bitmap (e.g. 1 << TCP_LISTEN) */
+	__u32	vdiag_ino;	/* must be 0 (reserved) */
+	__u32	vdiag_show;	/* must be 0 (reserved) */
+	__u32	vdiag_cookie[2];
+};
+
+/* Response */
+struct vsock_diag_msg {
+	__u8	vdiag_family;	/* AF_VSOCK */
+	__u8	vdiag_type;	/* SOCK_STREAM or SOCK_DGRAM */
+	__u8	vdiag_state;	/* sk_state (e.g. TCP_LISTEN) */
+	__u8	vdiag_shutdown; /* local RCV_SHUTDOWN | SEND_SHUTDOWN */
+	__u32   vdiag_src_cid;
+	__u32   vdiag_src_port;
+	__u32   vdiag_dst_cid;
+	__u32   vdiag_dst_port;
+	__u32	vdiag_ino;
+	__u32	vdiag_cookie[2];
+};
+
+#endif /* _UAPI__VM_SOCKETS_DIAG_H__ */
-- 
2.13.6

^ permalink raw reply related

* [PATCH iproute2 1/3] ss: allow AF_FAMILY constants >32
From: Stefan Hajnoczi @ 2017-10-03 17:57 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, Jorgen Hansen, Dexuan Cui, Stefan Hajnoczi
In-Reply-To: <20171003175744.24987-1-stefanha@redhat.com>

Linux has more than 32 address families defined in <bits/socket.h>.  Use
a 64-bit type so all of them can be represented in the filter->families
bitmask.

It's easy to introduce bugs when using (1 << AF_FAMILY) because the
value is 32-bit.  This can produce incorrect results from bitmask
operations so introduce the FAMILY_MASK() macro to eliminate these bugs.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 misc/ss.c | 54 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/misc/ss.c b/misc/ss.c
index dd8dfaa4..12a31c90 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -170,55 +170,57 @@ enum {
 struct filter {
 	int dbs;
 	int states;
-	int families;
+	__u64 families;
 	struct ssfilter *f;
 	bool kill;
 };
 
+#define FAMILY_MASK(family) ((__u64)1 << (family))
+
 static const struct filter default_dbs[MAX_DB] = {
 	[TCP_DB] = {
 		.states   = SS_CONN,
-		.families = (1 << AF_INET) | (1 << AF_INET6),
+		.families = FAMILY_MASK(AF_INET) | FAMILY_MASK(AF_INET6),
 	},
 	[DCCP_DB] = {
 		.states   = SS_CONN,
-		.families = (1 << AF_INET) | (1 << AF_INET6),
+		.families = FAMILY_MASK(AF_INET) | FAMILY_MASK(AF_INET6),
 	},
 	[UDP_DB] = {
 		.states   = (1 << SS_ESTABLISHED),
-		.families = (1 << AF_INET) | (1 << AF_INET6),
+		.families = FAMILY_MASK(AF_INET) | FAMILY_MASK(AF_INET6),
 	},
 	[RAW_DB] = {
 		.states   = (1 << SS_ESTABLISHED),
-		.families = (1 << AF_INET) | (1 << AF_INET6),
+		.families = FAMILY_MASK(AF_INET) | FAMILY_MASK(AF_INET6),
 	},
 	[UNIX_DG_DB] = {
 		.states   = (1 << SS_CLOSE),
-		.families = (1 << AF_UNIX),
+		.families = FAMILY_MASK(AF_UNIX),
 	},
 	[UNIX_ST_DB] = {
 		.states   = SS_CONN,
-		.families = (1 << AF_UNIX),
+		.families = FAMILY_MASK(AF_UNIX),
 	},
 	[UNIX_SQ_DB] = {
 		.states   = SS_CONN,
-		.families = (1 << AF_UNIX),
+		.families = FAMILY_MASK(AF_UNIX),
 	},
 	[PACKET_DG_DB] = {
 		.states   = (1 << SS_CLOSE),
-		.families = (1 << AF_PACKET),
+		.families = FAMILY_MASK(AF_PACKET),
 	},
 	[PACKET_R_DB] = {
 		.states   = (1 << SS_CLOSE),
-		.families = (1 << AF_PACKET),
+		.families = FAMILY_MASK(AF_PACKET),
 	},
 	[NETLINK_DB] = {
 		.states   = (1 << SS_CLOSE),
-		.families = (1 << AF_NETLINK),
+		.families = FAMILY_MASK(AF_NETLINK),
 	},
 	[SCTP_DB] = {
 		.states   = SS_CONN,
-		.families = (1 << AF_INET) | (1 << AF_INET6),
+		.families = FAMILY_MASK(AF_INET) | FAMILY_MASK(AF_INET6),
 	},
 };
 
@@ -258,14 +260,14 @@ static void filter_db_set(struct filter *f, int db)
 static void filter_af_set(struct filter *f, int af)
 {
 	f->states	   |= default_afs[af].states;
-	f->families	   |= 1 << af;
+	f->families	   |= FAMILY_MASK(af);
 	do_default	    = 0;
 	preferred_family    = af;
 }
 
 static int filter_af_get(struct filter *f, int af)
 {
-	return f->families & (1 << af);
+	return !!(f->families & FAMILY_MASK(af));
 }
 
 static void filter_default_dbs(struct filter *f)
@@ -302,7 +304,7 @@ static void filter_merge_defaults(struct filter *f)
 			f->families |= default_dbs[db].families;
 	}
 	for (af = 0; af < AF_MAX; af++) {
-		if (!(f->families & (1 << af)))
+		if (!(f->families & FAMILY_MASK(af)))
 			continue;
 
 		if (!(default_afs[af].dbs & f->dbs))
@@ -2608,7 +2610,7 @@ static int show_one_inet_sock(const struct sockaddr_nl *addr,
 	struct inet_diag_msg *r = NLMSG_DATA(h);
 	struct sockstat s = {};
 
-	if (!(diag_arg->f->families & (1 << r->idiag_family)))
+	if (!(diag_arg->f->families & FAMILY_MASK(r->idiag_family)))
 		return 0;
 
 	parse_diag_msg(h, &s);
@@ -2802,7 +2804,7 @@ static int tcp_show(struct filter *f)
 		return -1;
 	}
 
-	if (f->families & (1<<AF_INET)) {
+	if (f->families & FAMILY_MASK(AF_INET)) {
 		if ((fp = net_tcp_open()) == NULL)
 			goto outerr;
 
@@ -2812,7 +2814,7 @@ static int tcp_show(struct filter *f)
 		fclose(fp);
 	}
 
-	if ((f->families & (1<<AF_INET6)) &&
+	if ((f->families & FAMILY_MASK(AF_INET6)) &&
 	    (fp = net_tcp6_open()) != NULL) {
 		setbuffer(fp, buf, bufsize);
 		if (generic_record_read(fp, tcp_show_line, f, AF_INET6))
@@ -2911,7 +2913,7 @@ static int udp_show(struct filter *f)
 	    && inet_show_netlink(f, NULL, IPPROTO_UDP) == 0)
 		return 0;
 
-	if (f->families&(1<<AF_INET)) {
+	if (f->families&FAMILY_MASK(AF_INET)) {
 		if ((fp = net_udp_open()) == NULL)
 			goto outerr;
 		if (generic_record_read(fp, dgram_show_line, f, AF_INET))
@@ -2919,7 +2921,7 @@ static int udp_show(struct filter *f)
 		fclose(fp);
 	}
 
-	if ((f->families&(1<<AF_INET6)) &&
+	if ((f->families&FAMILY_MASK(AF_INET6)) &&
 	    (fp = net_udp6_open()) != NULL) {
 		if (generic_record_read(fp, dgram_show_line, f, AF_INET6))
 			goto outerr;
@@ -2951,7 +2953,7 @@ static int raw_show(struct filter *f)
 	    inet_show_netlink(f, NULL, IPPROTO_RAW) == 0)
 		return 0;
 
-	if (f->families&(1<<AF_INET)) {
+	if (f->families&FAMILY_MASK(AF_INET)) {
 		if ((fp = net_raw_open()) == NULL)
 			goto outerr;
 		if (generic_record_read(fp, dgram_show_line, f, AF_INET))
@@ -2959,7 +2961,7 @@ static int raw_show(struct filter *f)
 		fclose(fp);
 	}
 
-	if ((f->families&(1<<AF_INET6)) &&
+	if ((f->families&FAMILY_MASK(AF_INET6)) &&
 	    (fp = net_raw6_open()) != NULL) {
 		if (generic_record_read(fp, dgram_show_line, f, AF_INET6))
 			goto outerr;
@@ -3703,13 +3705,13 @@ static int handle_follow_request(struct filter *f)
 	int groups = 0;
 	struct rtnl_handle rth;
 
-	if (f->families & (1 << AF_INET) && f->dbs & (1 << TCP_DB))
+	if (f->families & FAMILY_MASK(AF_INET) && f->dbs & (1 << TCP_DB))
 		groups |= 1 << (SKNLGRP_INET_TCP_DESTROY - 1);
-	if (f->families & (1 << AF_INET) && f->dbs & (1 << UDP_DB))
+	if (f->families & FAMILY_MASK(AF_INET) && f->dbs & (1 << UDP_DB))
 		groups |= 1 << (SKNLGRP_INET_UDP_DESTROY - 1);
-	if (f->families & (1 << AF_INET6) && f->dbs & (1 << TCP_DB))
+	if (f->families & FAMILY_MASK(AF_INET6) && f->dbs & (1 << TCP_DB))
 		groups |= 1 << (SKNLGRP_INET6_TCP_DESTROY - 1);
-	if (f->families & (1 << AF_INET6) && f->dbs & (1 << UDP_DB))
+	if (f->families & FAMILY_MASK(AF_INET6) && f->dbs & (1 << UDP_DB))
 		groups |= 1 << (SKNLGRP_INET6_UDP_DESTROY - 1);
 
 	if (groups == 0)
-- 
2.13.6

^ permalink raw reply related

* [RFC 2/2] tools: bpftool: use the kernel's instruction printer
From: Jakub Kicinski @ 2017-10-03 17:57 UTC (permalink / raw)
  To: daniel, dsahern, alexei.starovoitov
  Cc: netdev, oss-drivers, david.beckett, Jakub Kicinski
In-Reply-To: <20171003175746.30145-1-jakub.kicinski@netronome.com>

Compile the instruction printer from kernel/bpf and use it
for disassembling "translated" eBPF code.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 tools/bpf/bpftool/Documentation/bpftool-prog.txt | 11 +++---
 tools/bpf/bpftool/Makefile                       |  7 ++--
 tools/bpf/bpftool/main.h                         | 10 ++----
 tools/bpf/bpftool/prog.c                         | 44 +++++++++++++++++++-----
 4 files changed, 51 insertions(+), 21 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.txt b/tools/bpf/bpftool/Documentation/bpftool-prog.txt
index 79791bf11e4d..7f33260f5683 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.txt
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.txt
@@ -11,7 +11,7 @@ SYNOPSIS
 ========
 
 |	**bpftool** prog show [*PROG*]
-|	**bpftool** prog dump xlated *PROG*  file *FILE*
+|	**bpftool** prog dump xlated *PROG* [file *FILE*] [opcodes]
 |	**bpftool** prog dump jited  *PROG* [file *FILE*] [opcodes]
 |	**bpftool** prog pin *PROG* *FILE*
 |	**bpftool** prog help
@@ -28,9 +28,12 @@ DESCRIPTION
 		  Output will start with program ID followed by program type and
 		  zero or more named attributes (depending on kernel version).
 
-	**bpftool prog dump xlated** *PROG*  **file** *FILE*
-		  Dump eBPF instructions of the program from the kernel to a
-		  file.
+	**bpftool prog dump xlated** *PROG* [**file** *FILE*] [**opcodes**]
+		  Dump eBPF instructions of the program from the kernel.
+		  If *FILE* is specified image will be written to a file,
+		  otherwise it will be disassembled and printed to stdout.
+
+		  **opcodes** controls if raw opcodes will be printed.
 
 	**bpftool prog dump jited**  *PROG* [**file** *FILE*] [**opcodes**]
 		  Dump jited image (host machine code) of the program.
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 8705ee44664d..4f339824ca57 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -51,7 +51,7 @@ CC = gcc
 
 CFLAGS += -O2
 CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow
-CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf
+CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/
 LIBS = -lelf -lbfd -lopcodes $(LIBBPF)
 
 include $(wildcard *.d)
@@ -59,7 +59,10 @@ include $(wildcard *.d)
 all: $(OUTPUT)bpftool
 
 SRCS=$(wildcard *.c)
-OBJS=$(patsubst %.c,$(OUTPUT)%.o,$(SRCS))
+OBJS=$(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
+
+$(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
+	$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
 
 $(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
 	$(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 85d2d7870a58..d7dd9965f40a 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -36,11 +36,12 @@
 #ifndef __BPF_TOOL_H
 #define __BPF_TOOL_H
 
+/* BDF and kernel.h both define GCC_VERSION, differently */
+#undef GCC_VERSION
 #include <stdbool.h>
 #include <stdio.h>
 #include <linux/bpf.h>
-
-#define ARRAY_SIZE(a)	(sizeof(a) / sizeof(a[0]))
+#include <linux/kernel.h>
 
 #define err(msg...)	fprintf(stderr, "Error: " msg)
 #define warn(msg...)	fprintf(stderr, "Warning: " msg)
@@ -48,11 +49,6 @@
 
 #define ptr_to_u64(ptr)	((__u64)(unsigned long)(ptr))
 
-#define min(a, b)							\
-	({ typeof(a) _a = (a); typeof(b) _b = (b); _a > _b ? _b : _a; })
-#define max(a, b)							\
-	({ typeof(a) _a = (a); typeof(b) _b = (b); _a < _b ? _b : _a; })
-
 #define NEXT_ARG()	({ argc--; argv++; if (argc < 0) usage(); })
 #define NEXT_ARGP()	({ (*argc)--; (*argv)++; if (*argc < 0) usage(); })
 #define BAD_ARG()	({ err("what is '%s'?\n", *argv); -1; })
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 421ba89ce86a..2cdc8f41809f 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -35,6 +35,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -46,6 +47,7 @@
 #include <bpf.h>
 
 #include "main.h"
+#include "disasm.h"
 
 static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_UNSPEC]		= "unspec",
@@ -297,11 +299,39 @@ static int do_show(int argc, char **argv)
 	return 0;
 }
 
+static void print_insn(const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vprintf(fmt, args);
+	va_end(args);
+}
+
+static void dump_xlated(void *buf, unsigned int len, bool opcodes)
+{
+	struct bpf_insn *insn = buf;
+	unsigned int i;
+
+	for (i = 0; i < len / sizeof(*insn); i++) {
+		printf("% 4d: ", i);
+		print_bpf_insn(print_insn, insn + i, true);
+
+		if (opcodes) {
+			printf("       ");
+			print_hex(insn + i, 8, " ");
+			printf("\n");
+		}
+
+		if (insn[i].code == (BPF_LD | BPF_IMM | BPF_DW))
+			i++;
+	}
+}
+
 static int do_dump(int argc, char **argv)
 {
 	struct bpf_prog_info info = {};
 	__u32 len = sizeof(info);
-	bool can_disasm = false;
 	unsigned int buf_size;
 	char *filepath = NULL;
 	bool opcodes = false;
@@ -315,7 +345,6 @@ static int do_dump(int argc, char **argv)
 	if (is_prefix(*argv, "jited")) {
 		member_len = &info.jited_prog_len;
 		member_ptr = &info.jited_prog_insns;
-		can_disasm = true;
 	} else if (is_prefix(*argv, "xlated")) {
 		member_len = &info.xlated_prog_len;
 		member_ptr = &info.xlated_prog_insns;
@@ -346,10 +375,6 @@ static int do_dump(int argc, char **argv)
 		NEXT_ARG();
 	}
 
-	if (!filepath && !can_disasm) {
-		err("expected 'file' got %s\n", *argv);
-		return -1;
-	}
 	if (argc) {
 		usage();
 		return -1;
@@ -409,7 +434,10 @@ static int do_dump(int argc, char **argv)
 			goto err_free;
 		}
 	} else {
-		disasm_print_insn(buf, *member_len, opcodes);
+		if (member_len == &info.jited_prog_len)
+			disasm_print_insn(buf, *member_len, opcodes);
+		else
+			dump_xlated(buf, *member_len, opcodes);
 	}
 
 	free(buf);
@@ -430,7 +458,7 @@ static int do_help(int argc, char **argv)
 {
 	fprintf(stderr,
 		"Usage: %s %s show [PROG]\n"
-		"       %s %s dump xlated PROG  file FILE\n"
+		"       %s %s dump xlated PROG [file FILE] [opcodes]\n"
 		"       %s %s dump jited  PROG [file FILE] [opcodes]\n"
 		"       %s %s pin   PROG FILE\n"
 		"       %s %s help\n"
-- 
2.14.1

^ permalink raw reply related

* [RFC 1/2] bpf: move instruction printing into a separate file
From: Jakub Kicinski @ 2017-10-03 17:57 UTC (permalink / raw)
  To: daniel, dsahern, alexei.starovoitov
  Cc: netdev, oss-drivers, david.beckett, Jakub Kicinski
In-Reply-To: <59D3B456.4020209@iogearbox.net>

Separate the instruction printing into a standalone source file.
This way sneaky code from tools/ can use it directly.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
Like this?

 kernel/bpf/Makefile   |   1 +
 kernel/bpf/disasm.c   | 212 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/disasm.h   |  29 +++++++
 kernel/bpf/verifier.c | 200 +----------------------------------------------
 4 files changed, 245 insertions(+), 197 deletions(-)
 create mode 100644 kernel/bpf/disasm.c
 create mode 100644 kernel/bpf/disasm.h

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 897daa005b23..53fb09f92e3f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,6 +2,7 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
new file mode 100644
index 000000000000..f2194fa442a4
--- /dev/null
+++ b/kernel/bpf/disasm.c
@@ -0,0 +1,212 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+
+#include "disasm.h"
+
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+	__BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+const char *func_id_name(int id)
+{
+	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+		return func_id_str[id];
+	else
+		return "unknown";
+}
+
+const char *const bpf_class_string[8] = {
+	[BPF_LD]    = "ld",
+	[BPF_LDX]   = "ldx",
+	[BPF_ST]    = "st",
+	[BPF_STX]   = "stx",
+	[BPF_ALU]   = "alu",
+	[BPF_JMP]   = "jmp",
+	[BPF_RET]   = "BUG",
+	[BPF_ALU64] = "alu64",
+};
+
+const char *const bpf_alu_string[16] = {
+	[BPF_ADD >> 4]  = "+=",
+	[BPF_SUB >> 4]  = "-=",
+	[BPF_MUL >> 4]  = "*=",
+	[BPF_DIV >> 4]  = "/=",
+	[BPF_OR  >> 4]  = "|=",
+	[BPF_AND >> 4]  = "&=",
+	[BPF_LSH >> 4]  = "<<=",
+	[BPF_RSH >> 4]  = ">>=",
+	[BPF_NEG >> 4]  = "neg",
+	[BPF_MOD >> 4]  = "%=",
+	[BPF_XOR >> 4]  = "^=",
+	[BPF_MOV >> 4]  = "=",
+	[BPF_ARSH >> 4] = "s>>=",
+	[BPF_END >> 4]  = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+	[BPF_W >> 3]  = "u32",
+	[BPF_H >> 3]  = "u16",
+	[BPF_B >> 3]  = "u8",
+	[BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[16] = {
+	[BPF_JA >> 4]   = "jmp",
+	[BPF_JEQ >> 4]  = "==",
+	[BPF_JGT >> 4]  = ">",
+	[BPF_JLT >> 4]  = "<",
+	[BPF_JGE >> 4]  = ">=",
+	[BPF_JLE >> 4]  = "<=",
+	[BPF_JSET >> 4] = "&",
+	[BPF_JNE >> 4]  = "!=",
+	[BPF_JSGT >> 4] = "s>",
+	[BPF_JSLT >> 4] = "s<",
+	[BPF_JSGE >> 4] = "s>=",
+	[BPF_JSLE >> 4] = "s<=",
+	[BPF_CALL >> 4] = "call",
+	[BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_end_insn(void (*verbose)(const char *, ...),
+			       const struct bpf_insn *insn)
+{
+	verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+		insn->imm, insn->dst_reg);
+}
+
+void print_bpf_insn(void (*verbose)(const char *, ...),
+		    const struct bpf_insn *insn, bool allow_ptr_leaks)
+{
+	u8 class = BPF_CLASS(insn->code);
+
+	if (class == BPF_ALU || class == BPF_ALU64) {
+		if (BPF_OP(insn->code) == BPF_END) {
+			if (class == BPF_ALU64)
+				verbose("BUG_alu64_%02x\n", insn->code);
+			else
+				print_bpf_end_insn(verbose, insn);
+		} else if (BPF_OP(insn->code) == BPF_NEG) {
+			verbose("(%02x) r%d = %s-r%d\n",
+				insn->code, insn->dst_reg,
+				class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
+			verbose("(%02x) %sr%d %s %sr%d\n",
+				insn->code, class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg,
+				bpf_alu_string[BPF_OP(insn->code) >> 4],
+				class == BPF_ALU ? "(u32) " : "",
+				insn->src_reg);
+		} else {
+			verbose("(%02x) %sr%d %s %s%d\n",
+				insn->code, class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg,
+				bpf_alu_string[BPF_OP(insn->code) >> 4],
+				class == BPF_ALU ? "(u32) " : "",
+				insn->imm);
+		}
+	} else if (class == BPF_STX) {
+		if (BPF_MODE(insn->code) == BPF_MEM)
+			verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg,
+				insn->off, insn->src_reg);
+		else if (BPF_MODE(insn->code) == BPF_XADD)
+			verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg, insn->off,
+				insn->src_reg);
+		else
+			verbose("BUG_%02x\n", insn->code);
+	} else if (class == BPF_ST) {
+		if (BPF_MODE(insn->code) != BPF_MEM) {
+			verbose("BUG_st_%02x\n", insn->code);
+			return;
+		}
+		verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+			insn->code,
+			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+			insn->dst_reg,
+			insn->off, insn->imm);
+	} else if (class == BPF_LDX) {
+		if (BPF_MODE(insn->code) != BPF_MEM) {
+			verbose("BUG_ldx_%02x\n", insn->code);
+			return;
+		}
+		verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+			insn->code, insn->dst_reg,
+			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+			insn->src_reg, insn->off);
+	} else if (class == BPF_LD) {
+		if (BPF_MODE(insn->code) == BPF_ABS) {
+			verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->imm);
+		} else if (BPF_MODE(insn->code) == BPF_IND) {
+			verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->src_reg, insn->imm);
+		} else if (BPF_MODE(insn->code) == BPF_IMM &&
+			   BPF_SIZE(insn->code) == BPF_DW) {
+			/* At this point, we already made sure that the second
+			 * part of the ldimm64 insn is accessible.
+			 */
+			u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+			bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+			if (map_ptr && allow_ptr_leaks)
+				imm = 0;
+
+			verbose("(%02x) r%d = 0x%llx\n", insn->code,
+				insn->dst_reg, (unsigned long long)imm);
+		} else {
+			verbose("BUG_ld_%02x\n", insn->code);
+			return;
+		}
+	} else if (class == BPF_JMP) {
+		u8 opcode = BPF_OP(insn->code);
+
+		if (opcode == BPF_CALL) {
+			verbose("(%02x) call %s#%d\n", insn->code,
+				func_id_name(insn->imm), insn->imm);
+		} else if (insn->code == (BPF_JMP | BPF_JA)) {
+			verbose("(%02x) goto pc%+d\n",
+				insn->code, insn->off);
+		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+			verbose("(%02x) exit\n", insn->code);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
+			verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+				insn->code, insn->dst_reg,
+				bpf_jmp_string[BPF_OP(insn->code) >> 4],
+				insn->src_reg, insn->off);
+		} else {
+			verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+				insn->code, insn->dst_reg,
+				bpf_jmp_string[BPF_OP(insn->code) >> 4],
+				insn->imm, insn->off);
+		}
+	} else {
+		verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+	}
+}
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
new file mode 100644
index 000000000000..fa5637334bb1
--- /dev/null
+++ b/kernel/bpf/disasm.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+void print_bpf_insn(void (*verbose)(const char *, ...),
+		    const struct bpf_insn *insn, bool allow_ptr_leaks);
+
+#endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4cf9b72c59a0..1e40b5d41b9e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21,6 +21,8 @@
 #include <linux/vmalloc.h>
 #include <linux/stringify.h>
 
+#include "disasm.h"
+
 /* bpf_check() is a static code analyzer that walks eBPF program
  * instruction by instruction and updates register/stack state.
  * All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -197,22 +199,6 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
-#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
-static const char * const func_id_str[] = {
-	__BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
-};
-#undef __BPF_FUNC_STR_FN
-
-static const char *func_id_name(int id)
-{
-	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
-
-	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
-		return func_id_str[id];
-	else
-		return "unknown";
-}
-
 static void print_verifier_state(struct bpf_verifier_state *state)
 {
 	struct bpf_reg_state *reg;
@@ -280,186 +266,6 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 	verbose("\n");
 }
 
-static const char *const bpf_class_string[] = {
-	[BPF_LD]    = "ld",
-	[BPF_LDX]   = "ldx",
-	[BPF_ST]    = "st",
-	[BPF_STX]   = "stx",
-	[BPF_ALU]   = "alu",
-	[BPF_JMP]   = "jmp",
-	[BPF_RET]   = "BUG",
-	[BPF_ALU64] = "alu64",
-};
-
-static const char *const bpf_alu_string[16] = {
-	[BPF_ADD >> 4]  = "+=",
-	[BPF_SUB >> 4]  = "-=",
-	[BPF_MUL >> 4]  = "*=",
-	[BPF_DIV >> 4]  = "/=",
-	[BPF_OR  >> 4]  = "|=",
-	[BPF_AND >> 4]  = "&=",
-	[BPF_LSH >> 4]  = "<<=",
-	[BPF_RSH >> 4]  = ">>=",
-	[BPF_NEG >> 4]  = "neg",
-	[BPF_MOD >> 4]  = "%=",
-	[BPF_XOR >> 4]  = "^=",
-	[BPF_MOV >> 4]  = "=",
-	[BPF_ARSH >> 4] = "s>>=",
-	[BPF_END >> 4]  = "endian",
-};
-
-static const char *const bpf_ldst_string[] = {
-	[BPF_W >> 3]  = "u32",
-	[BPF_H >> 3]  = "u16",
-	[BPF_B >> 3]  = "u8",
-	[BPF_DW >> 3] = "u64",
-};
-
-static const char *const bpf_jmp_string[16] = {
-	[BPF_JA >> 4]   = "jmp",
-	[BPF_JEQ >> 4]  = "==",
-	[BPF_JGT >> 4]  = ">",
-	[BPF_JLT >> 4]  = "<",
-	[BPF_JGE >> 4]  = ">=",
-	[BPF_JLE >> 4]  = "<=",
-	[BPF_JSET >> 4] = "&",
-	[BPF_JNE >> 4]  = "!=",
-	[BPF_JSGT >> 4] = "s>",
-	[BPF_JSLT >> 4] = "s<",
-	[BPF_JSGE >> 4] = "s>=",
-	[BPF_JSLE >> 4] = "s<=",
-	[BPF_CALL >> 4] = "call",
-	[BPF_EXIT >> 4] = "exit",
-};
-
-static void print_bpf_end_insn(const struct bpf_verifier_env *env,
-			       const struct bpf_insn *insn)
-{
-	verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
-		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
-		insn->imm, insn->dst_reg);
-}
-
-static void print_bpf_insn(const struct bpf_verifier_env *env,
-			   const struct bpf_insn *insn)
-{
-	u8 class = BPF_CLASS(insn->code);
-
-	if (class == BPF_ALU || class == BPF_ALU64) {
-		if (BPF_OP(insn->code) == BPF_END) {
-			if (class == BPF_ALU64)
-				verbose("BUG_alu64_%02x\n", insn->code);
-			else
-				print_bpf_end_insn(env, insn);
-		} else if (BPF_OP(insn->code) == BPF_NEG) {
-			verbose("(%02x) r%d = %s-r%d\n",
-				insn->code, insn->dst_reg,
-				class == BPF_ALU ? "(u32) " : "",
-				insn->dst_reg);
-		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose("(%02x) %sr%d %s %sr%d\n",
-				insn->code, class == BPF_ALU ? "(u32) " : "",
-				insn->dst_reg,
-				bpf_alu_string[BPF_OP(insn->code) >> 4],
-				class == BPF_ALU ? "(u32) " : "",
-				insn->src_reg);
-		} else {
-			verbose("(%02x) %sr%d %s %s%d\n",
-				insn->code, class == BPF_ALU ? "(u32) " : "",
-				insn->dst_reg,
-				bpf_alu_string[BPF_OP(insn->code) >> 4],
-				class == BPF_ALU ? "(u32) " : "",
-				insn->imm);
-		}
-	} else if (class == BPF_STX) {
-		if (BPF_MODE(insn->code) == BPF_MEM)
-			verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->dst_reg,
-				insn->off, insn->src_reg);
-		else if (BPF_MODE(insn->code) == BPF_XADD)
-			verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->dst_reg, insn->off,
-				insn->src_reg);
-		else
-			verbose("BUG_%02x\n", insn->code);
-	} else if (class == BPF_ST) {
-		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose("BUG_st_%02x\n", insn->code);
-			return;
-		}
-		verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
-			insn->code,
-			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-			insn->dst_reg,
-			insn->off, insn->imm);
-	} else if (class == BPF_LDX) {
-		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose("BUG_ldx_%02x\n", insn->code);
-			return;
-		}
-		verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
-			insn->code, insn->dst_reg,
-			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-			insn->src_reg, insn->off);
-	} else if (class == BPF_LD) {
-		if (BPF_MODE(insn->code) == BPF_ABS) {
-			verbose("(%02x) r0 = *(%s *)skb[%d]\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->imm);
-		} else if (BPF_MODE(insn->code) == BPF_IND) {
-			verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->src_reg, insn->imm);
-		} else if (BPF_MODE(insn->code) == BPF_IMM &&
-			   BPF_SIZE(insn->code) == BPF_DW) {
-			/* At this point, we already made sure that the second
-			 * part of the ldimm64 insn is accessible.
-			 */
-			u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
-			bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
-
-			if (map_ptr && !env->allow_ptr_leaks)
-				imm = 0;
-
-			verbose("(%02x) r%d = 0x%llx\n", insn->code,
-				insn->dst_reg, (unsigned long long)imm);
-		} else {
-			verbose("BUG_ld_%02x\n", insn->code);
-			return;
-		}
-	} else if (class == BPF_JMP) {
-		u8 opcode = BPF_OP(insn->code);
-
-		if (opcode == BPF_CALL) {
-			verbose("(%02x) call %s#%d\n", insn->code,
-				func_id_name(insn->imm), insn->imm);
-		} else if (insn->code == (BPF_JMP | BPF_JA)) {
-			verbose("(%02x) goto pc%+d\n",
-				insn->code, insn->off);
-		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
-			verbose("(%02x) exit\n", insn->code);
-		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose("(%02x) if r%d %s r%d goto pc%+d\n",
-				insn->code, insn->dst_reg,
-				bpf_jmp_string[BPF_OP(insn->code) >> 4],
-				insn->src_reg, insn->off);
-		} else {
-			verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
-				insn->code, insn->dst_reg,
-				bpf_jmp_string[BPF_OP(insn->code) >> 4],
-				insn->imm, insn->off);
-		}
-	} else {
-		verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
-	}
-}
-
 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
 {
 	struct bpf_verifier_stack_elem *elem;
@@ -3693,7 +3499,7 @@ static int do_check(struct bpf_verifier_env *env)
 
 		if (log_level) {
 			verbose("%d: ", insn_idx);
-			print_bpf_insn(env, insn);
+			print_bpf_insn(verbose, insn, env->allow_ptr_leaks);
 		}
 
 		err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
-- 
2.14.1

^ permalink raw reply related

* [PATCH iproute2 0/3] ss: add AF_VSOCK support
From: Stefan Hajnoczi @ 2017-10-03 17:57 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, Jorgen Hansen, Dexuan Cui, Stefan Hajnoczi

This patch series adds AF_VSOCK support to ss(8).  AF_VSOCK is a host<->guest
communications channel supported by VMware, KVM (virtio-vsock), and Hyper-V.

To dump AF_VSOCK sockets:

  $ ss --vsock

The vsock_diag.ko kernel module for Linux was posted in "[PATCH 0/5] VSOCK: add
sock_diag interface".  That patch series also adds the
<linux/vm_sockets_diag.h> header file that this patch series relies on.  The
Linux patches are also available here:

  https://github.com/stefanha/linux/commits/vsock-diag

Stefan Hajnoczi (3):
  ss: allow AF_FAMILY constants >32
  include: add <linux/vm_sockets_diag.h>
  ss: add AF_VSOCK support

 include/linux/vm_sockets_diag.h |  33 ++++++
 misc/ss.c                       | 238 +++++++++++++++++++++++++++++++++++-----
 man/man8/ss.8                   |   8 +-
 3 files changed, 249 insertions(+), 30 deletions(-)
 create mode 100644 include/linux/vm_sockets_diag.h

-- 
2.13.6

^ permalink raw reply

* RE: [PATCH] net: phy: DP83822 initial driver submission
From: Woojung.Huh @ 2017-10-03 17:43 UTC (permalink / raw)
  To: dmurphy, andrew, f.fainelli; +Cc: netdev
In-Reply-To: <20171003155316.12312-1-dmurphy@ti.com>

> +static int dp83822_suspend(struct phy_device *phydev)
> +{
> +	int value;
> +
> +	mutex_lock(&phydev->lock);
> +
> +	value = phy_read_mmd(phydev, DP83822_DEVADDR,
> MII_DP83822_WOL_CFG);
> +	if (~value & DP83822_WOL_EN) {
Same result, but how about " if (!(value & DP83822_WOL_EN))" ?

> +		value = phy_read(phydev, MII_BMCR);
> +		phy_write(phydev, MII_BMCR, value | BMCR_PDOWN);
> +	}
> +
> +	mutex_unlock(&phydev->lock);
> +
> +	return 0;
> +}

^ permalink raw reply

* Re: [PATCH 2/2 net-next] mlxsw: spectrum: Add missing error code on allocation failure
From: David Miller @ 2017-10-03 17:27 UTC (permalink / raw)
  To: dan.carpenter; +Cc: jiri, yotamg, idosch, netdev, kernel-janitors
In-Reply-To: <20171003105340.llwk5oajgrohbksu@mwanda>

From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 3 Oct 2017 13:53:41 +0300

> We accidentally return success if the kmalloc_array() call fails.
> 
> Fixes: 0e14c7777acb ("mlxsw: spectrum: Add the multicast routing hardware logic")
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

Applied.

^ permalink raw reply

* Re: [PATCH 1/2 net-next] mlxsw: spectrum: Fix check for IS_ERR() instead of NULL
From: David Miller @ 2017-10-03 17:27 UTC (permalink / raw)
  To: dan.carpenter; +Cc: jiri, yotamg, idosch, netdev, kernel-janitors
In-Reply-To: <20171003105303.u7yrzxknddmmerol@mwanda>

From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 3 Oct 2017 13:53:03 +0300

> mlxsw_afa_block_create() doesn't return error pointers, it returns NULL
> on error.
> 
> Fixes: 0e14c7777acb ("mlxsw: spectrum: Add the multicast routing hardware logic")
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

Applied.

^ permalink raw reply

* Re: [PATCH] net: dsa: mt7530: make functions mt7530_phy_write static
From: David Miller @ 2017-10-03 17:20 UTC (permalink / raw)
  To: colin.king
  Cc: andrew, vivien.didelot, f.fainelli, netdev, kernel-janitors,
	linux-kernel
In-Reply-To: <20171003104633.27151-1-colin.king@canonical.com>

From: Colin King <colin.king@canonical.com>
Date: Tue,  3 Oct 2017 11:46:33 +0100

> From: Colin Ian King <colin.king@canonical.com>
> 
> The function mt7530_phy_write is local to the source and does not need to
> be in global scope, so make it static.
> 
> Cleans up sparse warnings:
> symbol 'mt7530_phy_write' was not declared. Should it be static?
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Applied.

^ permalink raw reply

* Re: [PATCH] net: dsa: lan9303: make functions lan9303_mdio_phy_{read|write} static
From: David Miller @ 2017-10-03 17:20 UTC (permalink / raw)
  To: colin.king
  Cc: andrew, vivien.didelot, f.fainelli, netdev, kernel-janitors,
	linux-kernel
In-Reply-To: <20171003103918.26934-1-colin.king@canonical.com>

From: Colin King <colin.king@canonical.com>
Date: Tue,  3 Oct 2017 11:39:18 +0100

> From: Colin Ian King <colin.king@canonical.com>
> 
> The functions lan9303_mdio_phy_write and lan9303_mdio_phy_read are local
> to the source and do not need to be in global scope, so make them static.
> 
> Cleans up sparse warnings:
> symbol 'lan9303_mdio_phy_write' was not declared. Should it be static?
> symbol 'lan9303_mdio_phy_read' was not declared. Should it be static?
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Applied.

^ permalink raw reply

* Re: [PATCH net] net: rtnetlink: fix info leak in RTM_GETSTATS call
From: David Miller @ 2017-10-03 17:19 UTC (permalink / raw)
  To: nikolay; +Cc: netdev, keescook, dvyukov, andreyknvl, kcc, roopa, glider,
	edumazet
In-Reply-To: <1507026048-13734-1-git-send-email-nikolay@cumulusnetworks.com>

From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue,  3 Oct 2017 13:20:48 +0300

> When RTM_GETSTATS was added the fields of its header struct were not all
> initialized when returning the result thus leaking 4 bytes of information
> to user-space per rtnl_fill_statsinfo call, so initialize them now. Thanks
> to Alexander Potapenko for the detailed report and bisection.
> 
> Reported-by: Alexander Potapenko <glider@google.com>
> Fixes: 10c9ead9f3c6 ("rtnetlink: add new RTM_GETSTATS message to dump link stats")
> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH v2 net-next 06/12] qed: Add LL2 slowpath handling
From: David Miller @ 2017-10-03 17:17 UTC (permalink / raw)
  To: Michal.Kalderon-YGCgFSpz5w/QT0dZR+AlfA
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	dledford-H+wXaHxf7aLQT0dZR+AlfA,
	Ariel.Elior-YGCgFSpz5w/QT0dZR+AlfA
In-Reply-To: <1507020902-4952-7-git-send-email-Michal.Kalderon-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>

From: Michal Kalderon <Michal.Kalderon-YGCgFSpz5w/QT0dZR+AlfA@public.gmane.org>
Date: Tue, 3 Oct 2017 11:54:56 +0300

> @@ -423,6 +423,41 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn,
>  }
>  
>  static int
> +qed_ll2_handle_slowpath(struct qed_hwfn *p_hwfn,
> +			struct qed_ll2_info *p_ll2_conn,
> +			union core_rx_cqe_union *p_cqe,
> +			unsigned long *p_lock_flags)
> +{
 ...
> +	spin_unlock_irqrestore(&p_rx->lock, *p_lock_flags);
> +

You can't drop this lock.

Another thread can enter the loop of our caller and process RX queue
entries, then we would return from here and try to process the same
entries again.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] net: phy: DP83822 initial driver submission
From: Florian Fainelli @ 2017-10-03 17:15 UTC (permalink / raw)
  To: Dan Murphy, andrew; +Cc: netdev
In-Reply-To: <20171003155316.12312-1-dmurphy@ti.com>

On 10/03/2017 08:53 AM, Dan Murphy wrote:
> Add support for the TI  DP83822 10/100Mbit ethernet phy.
> 
> The DP83822 provides flexibility to connect to a MAC through a
> standard MII, RMII or RGMII interface.
> 
> Datasheet:
> http://www.ti.com/product/DP83822I/datasheet

This looks pretty good, just a few nits below.

[snip]

> +static int dp83822_set_wol(struct phy_device *phydev,
> +			   struct ethtool_wolinfo *wol)
> +{
> +	struct net_device *ndev = phydev->attached_dev;
> +	u16 value;
> +	const u8 *mac;
> +
> +	if (wol->wolopts & (WAKE_MAGIC | WAKE_MAGICSECURE)) {
> +		mac = (const u8 *)ndev->dev_addr;
> +
> +		if (!is_valid_ether_addr(mac))
> +			return -EFAULT;

-EINVAL maybe?

> +
> +		/* MAC addresses start with byte 5, but stored in mac[0].
> +		 * 822 PHYs store bytes 4|5, 2|3, 0|1
> +		 */
> +		phy_write_mmd(phydev, DP83822_DEVADDR,
> +			      MII_DP83822_WOL_DA1, (mac[1] << 8) | mac[0]);
> +		phy_write_mmd(phydev, DP83822_DEVADDR,
> +			      MII_DP83822_WOL_DA2, (mac[3] << 8) | mac[2]);
> +		phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_DA3,
> +			      (mac[5] << 8) | mac[4]);
> +
> +		value = phy_read_mmd(phydev, DP83822_DEVADDR,
> +				     MII_DP83822_WOL_CFG);
> +		if (wol->wolopts & WAKE_MAGIC)
> +			value |= DP83822_WOL_MAGIC_EN;
> +		else
> +			value &= ~DP83822_WOL_MAGIC_EN;
> +
> +		if (wol->wolopts & WAKE_MAGICSECURE) {
> +			value |= DP83822_WOL_SECURE_ON;

Just in case any of the writes below fail, you would probably want to
set this bit last, thus indicating that the password was successfully set.

> +			phy_write_mmd(phydev, DP83822_DEVADDR,
> +				      MII_DP83822_RXSOP1,
> +				      (wol->sopass[1] << 8) | wol->sopass[0]);
> +			phy_write_mmd(phydev, DP83822_DEVADDR,
> +				      MII_DP83822_RXSOP2,
> +				      (wol->sopass[3] << 8) | wol->sopass[2]);
> +			phy_write_mmd(phydev, DP83822_DEVADDR,
> +				      MII_DP83822_RXSOP3,
> +				      (wol->sopass[5] << 8) | wol->sopass[4]);

In the else clause, you don't appear to be clearing the MagicPacket
SecureOn password, but your get_wol function does not check for
DP83822_WOL_SECURE_ON before returning the secure password, so either
one of these two should be fixed. I would go with fixing the get_wol
function see below.

> +		} else {
> +			value &= ~DP83822_WOL_SECURE_ON;
> +		}
> +
> +		value |= (DP83822_WOL_EN | DP83822_WOL_CLR_INDICATION |
> +			  DP83822_WOL_CLR_INDICATION);

The extra parenthesis should not be required here.

> +		phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG,
> +			      value);
> +	} else {
> +		value =
> +		    phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG);
> +		value &= (~DP83822_WOL_EN);

Same here, parenthesis should not be needed.

> +		phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG,
> +			      value);
> +	}
> +
> +	return 0;
> +}
> +
> +static void dp83822_get_wol(struct phy_device *phydev,
> +			    struct ethtool_wolinfo *wol)
> +{
> +	int value;
> +
> +	wol->supported = (WAKE_MAGIC | WAKE_MAGICSECURE);
> +	wol->wolopts = 0;
> +
> +	value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG);
> +	if (value & DP83822_WOL_MAGIC_EN)
> +		wol->wolopts |= WAKE_MAGIC;
> +
> +	if (value & DP83822_WOL_SECURE_ON)
> +		wol->wolopts |= WAKE_MAGICSECURE;
> +
> +	if (~value & DP83822_WOL_CLR_INDICATION)
> +		wol->wolopts = 0;
> +
> +	wol->sopass[0] = (phy_read_mmd(phydev,
> +				       DP83822_DEVADDR,
> +				       MII_DP83822_RXSOP1) & 0xFF);
> +	wol->sopass[1] =
> +	    (phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RXSOP1) >> 8);

You can save about twice the amount of reads by using a temporary
variable to hold the 16-bit register ;)

> +	wol->sopass[2] =
> +	    (phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RXSOP2) & 0xFF);
> +	wol->sopass[3] =
> +	    (phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RXSOP2) >> 8);
> +	wol->sopass[4] =
> +	    (phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RXSOP3) & 0xFF);
> +	wol->sopass[5] =
> +	    (phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RXSOP3) >> 8);

Unless DP83822_WOL_SECURE_ON is set, you probably should not try reading
the password at all, because there is no guarantee it has been correctly
set.

> +}

> +static int dp83822_phy_reset(struct phy_device *phydev)
> +{
> +	int err;
> +
> +	err = phy_write(phydev, MII_DP83822_RESET_CTRL, DP83822_HW_RESET);
> +	if (err < 0)
> +		return err;
> +
> +	return 0;
> +}
> +
> +static int dp83822_suspend(struct phy_device *phydev)
> +{
> +	int value;
> +
> +	mutex_lock(&phydev->lock);
> +
> +	value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG);
> +	if (~value & DP83822_WOL_EN) {
> +		value = phy_read(phydev, MII_BMCR);
> +		phy_write(phydev, MII_BMCR, value | BMCR_PDOWN);
> +	}

Can you use genphy_suspend() here along with careful locking of course?

> +
> +	mutex_unlock(&phydev->lock);
> +
> +	return 0;
> +}
> +
> +static int dp83822_resume(struct phy_device *phydev)
> +{
> +	int value;
> +
> +	mutex_lock(&phydev->lock);
> +
> +	value = phy_read(phydev, MII_BMCR);
> +	phy_write(phydev, MII_BMCR, value & ~BMCR_PDOWN);

And genphy_resume() here as well?

> +
> +	value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG);
> +
> +	phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG, value |
> +		      DP83822_WOL_CLR_INDICATION);
> +
> +	mutex_unlock(&phydev->lock);
> +
> +	return 0;
> +}
> +
> +static struct phy_driver dp83822_driver[] = {
> +	{
> +	 .phy_id = DP83822_PHY_ID,
> +	 .phy_id_mask = 0xfffffff0,
> +	 .name = "TI DP83822",
> +	 .features = PHY_BASIC_FEATURES,
> +	 .flags = PHY_HAS_INTERRUPT,
> +
> +	 .config_init = genphy_config_init,
> +	 .soft_reset = dp83822_phy_reset,
> +
> +	 .get_wol = dp83822_get_wol,
> +	 .set_wol = dp83822_set_wol,
> +
> +	 /* IRQ related */
> +	 .ack_interrupt = dp83822_ack_interrupt,
> +	 .config_intr = dp83822_config_intr,
> +
> +	 .config_aneg = genphy_config_aneg,
> +	 .read_status = genphy_read_status,
> +	 .suspend = dp83822_suspend,
> +	 .resume = dp83822_resume,
> +	 },

I would omit newlines between definitions of callbacks, but this is
really a personal preference. Unless you are planning on adding new IDs,
you could also avoid using an array of 1 element and just a plain
phy_driver structure, but that's not a big deal either.
-- 
Florian

^ permalink raw reply

* Re: [patch net-next v2 0/7] mlxsw: Add support for partial multicast route offload
From: David Miller @ 2017-10-03 17:07 UTC (permalink / raw)
  To: jiri
  Cc: netdev, yotamg, idosch, mlxsw, nikolay, andrew, dsa, edumazet,
	willemb, johannes.berg, dcaratti, pabeni, daniel, f.fainelli, fw,
	gfree.wind
In-Reply-To: <20171003075812.1540-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@resnulli.us>
Date: Tue,  3 Oct 2017 09:58:05 +0200

> From: Jiri Pirko <jiri@mellanox.com>
> 
> Yotam says:
> 
> Previous patchset introduced support for offloading multicast MFC routes to
> the Spectrum hardware. As described in that patchset, no partial offloading
> is supported, i.e if a route has one output interface which is not a valid
> offloadable device (e.g. pimreg device, dummy device, management NIC), the
> route is trapped to the CPU and the forwarding is done in slow-path.
> 
> Add support for partial offloading of multicast routes, by letting the
> hardware to forward the packet to all the in-hardware devices, while the
> kernel ipmr module will continue forwarding to all other interfaces.
 ...

Series applied, thanks.

^ permalink raw reply

* Re: [v2] cdc-ether: divorce initialisation with a filter reset and a generic method
From: David Miller @ 2017-10-03 16:59 UTC (permalink / raw)
  To: vigneshr; +Cc: oneukum, netdev, bjorn
In-Reply-To: <fbe82c0f-c3d5-4dd6-62a7-b8fd2ea83302@ti.com>

From: Vignesh R <vigneshr@ti.com>
Date: Tue, 3 Oct 2017 13:39:18 +0530

> Hi Dave,
> 
> On Monday 22 May 2017 06:20 PM, Oliver Neukum wrote:
>> Some devices need their multicast filter reset but others are crashed by that.
>> So the methods need to be separated.
>> 
>> Signed-off-by: Oliver Neukum <oneukum@suse.com>
>> Reported-by: "Ridgway, Keith" <kridgway@harris.com>
>> ---
> 
> I see this patch was merged and said to be queued for -stable, but I
> don't see this fix in any of the stable kernels. Would you please ask
> for this patch to be backported to -stable kernels?

I submit networking -stable patches at a time of my own choosing.
Usually 1 to 2 weeks after it gets applied to my tree.

Please be patient.

^ permalink raw reply

* [PATCH v3 2/2] ARM: sun8i: h2+: specify wifi interrupts for Orange Pi Zero
From: Icenowy Zheng @ 2017-10-03 16:59 UTC (permalink / raw)
  To: Kalle Valo, Rob Herring, Maxime Ripard, Chen-Yu Tsai
  Cc: linux-wireless, netdev, devicetree, linux-kernel,
	linux-arm-kernel, linux-sunxi, Sergey Matyukevich, Icenowy Zheng
In-Reply-To: <20171003165944.13056-1-icenowy@aosc.io>

From: Sergey Matyukevich <geomatsi@gmail.com>

The Orange Pi Zero board has Allwinner XR819 SDIO wifi chip. The board
dts file provides a node enabling mmc1 controller, and a out-of-band
interrupt line of the chip is also connected, although the chip also
supports in-band interrupt.

The current out-of-tree driver is hardcoded to use out-of-band interrupt
as default, and it needs to be modified to use the in-band interrupt.

This commit adds the out-of-band interrupt line into the device tree.

Signed-off-by: Sergey Matyukevich <geomatsi@gmail.com>
[Icenowy: Changed vendor prefix to allwinner and modify commit message]
Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
---
Changes in v3 by Icenowy:
- Change the compatible string vendor prefix to "allwinner".
- Modify the commit message.
Changes in v2 by Sergey:
- Adds the compatible string.

 arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts b/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts
index b1502df7b509..6595617204b3 100644
--- a/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts
+++ b/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts
@@ -127,6 +127,9 @@
 	 */
 	xr819: sdio_wifi@1 {
 		reg = <1>;
+		compatible = "allwinner,xr819";
+		interrupt-parent = <&pio>;
+		interrupts = <6 10 IRQ_TYPE_EDGE_RISING>;
 	};
 };
 
-- 
2.13.5

^ permalink raw reply related

* [PATCH v3 1/2] dt-bindings: add device tree binding for Allwinner XR819 SDIO Wi-Fi
From: Icenowy Zheng @ 2017-10-03 16:59 UTC (permalink / raw)
  To: Kalle Valo, Rob Herring, Maxime Ripard, Chen-Yu Tsai
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-sunxi-/JYPxA39Uh5TLH3MbocFFw, Icenowy Zheng
In-Reply-To: <20171003165944.13056-1-icenowy-h8G6r0blFSE@public.gmane.org>

Allwinner XR819 is a SDIO Wi-Fi chip, which has the functionality to use
an out-of-band interrupt pin instead of SDIO in-band interrupt.

Add the device tree binding of this chip, in order to make it possible
to add this interrupt pin to device trees.

Signed-off-by: Icenowy Zheng <icenowy-h8G6r0blFSE@public.gmane.org>
Acked-by: Rob Herring <robh-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
Changes in v3:
- Renames the node name.
- Adds ACK from Rob.
Changes in v2:
- Removed status property in example.
- Added required property reg.

 .../bindings/net/wireless/allwinner,xr819.txt      | 38 ++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/wireless/allwinner,xr819.txt

diff --git a/Documentation/devicetree/bindings/net/wireless/allwinner,xr819.txt b/Documentation/devicetree/bindings/net/wireless/allwinner,xr819.txt
new file mode 100644
index 000000000000..7ae40441e343
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/wireless/allwinner,xr819.txt
@@ -0,0 +1,38 @@
+Allwinner XRadio wireless SDIO devices
+
+This node provides properties for controlling the XRadio wireless device. The
+node is expected to be specified as a child node to the SDIO controller that
+connects the device to the system.
+
+Required properties:
+
+ - reg : The SDIO function number, see "Use of function subnodes" in
+	../../mmc/mmc.txt.
+ - compatible : Should be "allwinner,xr819".
+
+Optional properties:
+ - interrupt-parent : the phandle for the interrupt controller to which the
+	device interrupts are connected.
+ - interrupts : specifies attributes for the out-of-band interrupt (host-wake).
+	When not specified the device will use in-band SDIO interrupts.
+
+Example:
+
+mmc1: mmc@01c10000 {
+	#address-cells = <1>;
+	#size-cells = <0>;
+
+	pinctrl-names = "default";
+	pinctrl-0 = <&mmc1_pins_a>;
+	vmmc-supply = <&reg_vcc_wifi>;
+	mmc-pwrseq = <&wifi_pwrseq>;
+	bus-width = <4>;
+	non-removable;
+
+	xr819: wifi@1 {
+		reg = <1>;
+		compatible = "allwinner,xr819";
+		interrupt-parent = <&pio>;
+		interrupts = <6 10 IRQ_TYPE_EDGE_RISING>;
+	};
+};
-- 
2.13.5

^ permalink raw reply related

* [PATCH v3 0/2] Allwinner XR819 SDIO Wi-Fi DT binding and OPi Zero XR819 IRQ
From: Icenowy Zheng @ 2017-10-03 16:59 UTC (permalink / raw)
  To: Kalle Valo, Rob Herring, Maxime Ripard, Chen-Yu Tsai
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-sunxi-/JYPxA39Uh5TLH3MbocFFw, Icenowy Zheng

The Allwinner XR819 SDIO Wi-Fi chip supports an out-of-band interrupt line,
and the in-band interrupt is also supported.

However the current out-of-tree driver uses the out-of-band interrupt by
default.

This patchset adds the device tree binding for the chip as well as the
out-of-band interrupt, then adds the interrupt to the device tree of
Orange Pi Zero.

Icenowy Zheng (1):
  dt-bindings: add device tree binding for Allwinner XR819 SDIO Wi-Fi

Sergey Matyukevich (1):
  ARM: sun8i: h2+: specify wifi interrupts for Orange Pi Zero

 .../bindings/net/wireless/allwinner,xr819.txt      | 38 ++++++++++++++++++++++
 arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts  |  3 ++
 2 files changed, 41 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/wireless/allwinner,xr819.txt

-- 
2.13.5

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next] cxgb4: Update comment for min_mtu
From: David Miller @ 2017-10-03 16:56 UTC (permalink / raw)
  To: ganeshgr; +Cc: netdev, nirranjan, indranil, venkatesh, arjun
In-Reply-To: <1507011185-7447-1-git-send-email-ganeshgr@chelsio.com>

From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Tue,  3 Oct 2017 11:43:05 +0530

> From: Arjun Vynipadath <arjun@chelsio.com>
> 
> We have lost a comment for minimum mtu value set for netdevice with
> 'commit d894be57ca92 ("ethernet: use net core MTU range checking in
> more drivers"). Updating it accordingly.
> 
> Signed-off-by: Arjun Vynipadath <arjun@chelsio.com>
> Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>

Applied.

^ permalink raw reply

* Re: [PATCH V2] Fix a sleep-in-atomic bug in shash_setkey_unaligned
From: Andy Lutomirski @ 2017-10-03 16:46 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Andy Lutomirski, Jia-Ju Bai, David S. Miller, Neil Horman,
	vyasevich, Kalle Valo, Linux Crypto Mailing List,
	Network Development, linux-sctp, Linux Wireless List
In-Reply-To: <20171003052643.GB22750@gondor.apana.org.au>

On Mon, Oct 2, 2017 at 10:26 PM, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Mon, Oct 02, 2017 at 09:18:24PM -0700, Andy Lutomirski wrote:
>> > On Oct 2, 2017, at 7:25 PM, Jia-Ju Bai <baijiaju1990@163.com> wrote:
>> >
>> > The SCTP program may sleep under a spinlock, and the function call path is:
>> > sctp_generate_t3_rtx_event (acquire the spinlock)
>> >  sctp_do_sm
>> >    sctp_side_effects
>> >      sctp_cmd_interpreter
>> >        sctp_make_init_ack
>> >          sctp_pack_cookie
>> >            crypto_shash_setkey
>> >              shash_setkey_unaligned
>> >                kmalloc(GFP_KERNEL)
>>
>> I'm going to go out on a limb here: why on Earth is out crypto API so
>> full of indirection that we allocate memory at all here?
>
> The crypto API operates on a one key per-tfm basis.  So normally
> tfm allocation and key setting is done once only and not done on
> the data path.
>
> I have looked at the SCTP code and it appears to fit this paradigm.
> That is, we should be able to allocate the tfm and set the key when
> the key is actually generated via get_random_bytes, rather than every
> time the key is used which is not only a waste but as you see runs
> into API issues.

It's a waste because it loses a pre-computation advantage.

The fact that it has memory allocation issues is crypto API's fault,
full stop.  There is no legit reason to need to allocate anything.

^ permalink raw reply

* Re: [PATCH net] net: br: Fix igmp snooping offload with CONFIG_BRIDGE_VLAN_FILTERING
From: Ido Schimmel @ 2017-10-03 16:42 UTC (permalink / raw)
  To: Vivien Didelot
  Cc: Andrew Lunn, Toshiaki Makita, Toshiaki Makita, David Miller,
	netdev
In-Reply-To: <87infwtd0b.fsf@weeman.i-did-not-set--mail-host-address--so-tickle-me>

On Tue, Oct 03, 2017 at 12:25:08PM -0400, Vivien Didelot wrote:
> Andrew Lunn <andrew@lunn.ch> writes:
> 
> >> The vlan will be effective only when vlan_filtering is enabled.
> >> When vlan_filtering is disabled, vlan information is still kept in the
> >> bridge and gets effective later when vlan_filtering becomes enable.
> >
> > O.K, so things are starting to get clearer.
> >
> > So when vlan filtering is disabled, the hardware should just ignore
> > the requests to add the vlan to the hardware?
> >
> > When vlan_filtering is enabled, are all the vlans in the software
> > bridge again offloaded? Or do we need to remember all the vlans which
> > we ignored while vlan filtering was disabled? The average switch has
> > nowhere to store these disabled vlans. It can only store active vlans.
> 
> When vlan_filtering is enabled on the bridge, the bridge code does
> propagates the default_pvid again if I recall correctly.
> 
> In my opinion the hardware mustn't ignore the VLAN requests, because we
> seem to agree that vlan_filtering disabled means that the target ports
> should not care yet about 802.1Q. So having some unused hardware VLAN
> entries and some ports with disabled 802.1Q mode must work together.
> 
> That being said we still have the wrong hardware FDB populated when
> CONFIG_BRIDGE_VLAN_FILTERING is enabled but not vlan_filtering...

The driver can make sure it's able to handle the configured
`vlan_filtering` state during port enslavement to the bridge and also
forbid it from being toggled once it's enslaved.

^ permalink raw reply

* Re: BUG in free_netdev() on ppp link deletion
From: Guillaume Nault @ 2017-10-03 16:40 UTC (permalink / raw)
  To: Beniamino Galvani
  Cc: linux-ppp, netdev, Paul Mackerras, David Ahern, Gao Feng
In-Reply-To: <20171003074413.GA26158@tp>

On Tue, Oct 03, 2017 at 09:44:14AM +0200, Beniamino Galvani wrote:
>  Call Trace:
>   ppp_destroy_interface+0xd8/0xe0 [ppp_generic]
>   ppp_disconnect_channel+0xda/0x110 [ppp_generic]
>   ppp_unregister_channel+0x5e/0x110 [ppp_generic]
>   pppox_unbind_sock+0x23/0x30 [pppox]
>   pppoe_connect+0x130/0x440 [pppoe]
>   SYSC_connect+0x98/0x110
>   ? do_fcntl+0x2c0/0x5d0
>   SyS_connect+0xe/0x10
>   entry_SYSCALL_64_fastpath+0x1a/0xa5
>  RIP: 0033:0x7fa71f4af840
>  RSP: 002b:00007ffe4ea40bf8 EFLAGS: 00000246 ORIG_RAX: 000000000000002a
>  RAX: ffffffffffffffda RBX: 0000556d37ae0538 RCX: 00007fa71f4af840
>  RDX: 000000000000001e RSI: 00007ffe4ea40c00 RDI: 0000000000000008
>  RBP: 0000556d37b2a1b0 R08: 0000556d396e95b0 R09: 0000000000000008
>  R10: 00000000aaaaaaab R11: 0000000000000246 R12: 0000556d37adc008
>  R13: 0000556d37adc004 R14: 0000556d37b2a1a4 R15: 0000000000000000
>  Code: 04 00 00 04 e8 cb 52 e3 ff 5b 41 5c 41 5d 5d c3 41 0f b7 84 24 32 02 00 00 4c 89 e7 48 29 c7 e8 80 8b aa ff 5b 41 5c 41 5d 5d c3 <0f> 0b 0f 1f 80 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 57 41 
>  RIP: free_netdev+0x107/0x110 RSP: ffffc28a40573d88
>  ---[ end trace ed294ff0cc40eeff ]---
> 
> To reproduce this, establish a PPP connection through pppd, then bring
> down and delete the ppp interface:
> 
>  # pppd nodetach lock user client plugin rp-pppoe.so ens11 noauth nodeflate password password &
>  Plugin rp-pppoe.so loaded.
>  RP-PPPoE plugin version 3.8p compiled against pppd 2.4.7
>  PPP session is 16
>  Connected to fe:54:00:5f:04:13 via interface ens11
>  Using interface ppp0
>  Connect: ppp0 <--> ens11
>  CHAP authentication succeeded: Access granted
>  CHAP authentication succeeded
>  peer from calling number FE:54:00:5F:04:13 authorized
>  local  IP address 3.1.1.10
>  remote IP address 3.1.1.1
> 
>  # ip l set ppp0 down
>  # ip l del ppp0
> 
> It does not happen every time but only when ppp_destroy_interface() is
> called with dev->reg_state = UNREGISTERING, set by the concurrent
> rtnl_delete_link().
> 
Indeed, we have a race here: ppp_destroy_interface() can be called before
netdev_run_todo() completes. I'm working on it.

^ permalink raw reply

* Re: v4.14-rc2/arm64 kernel BUG at net/core/skbuff.c:2626
From: Eric Dumazet @ 2017-10-03 16:36 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Mark Rutland, LKML, netdev, linux-arm-kernel, syzkaller,
	David S. Miller, Willem de Bruijn
In-Reply-To: <CACT4Y+azruiH-oEXVFen8WiDHhz0PS3DF7OdrkSCJYVRmkhoiQ@mail.gmail.com>

On Tue, Oct 3, 2017 at 9:06 AM, Dmitry Vyukov <dvyukov@google.com> wrote:
> On Tue, Oct 3, 2017 at 5:38 PM, 'Eric Dumazet' via syzkaller
> <syzkaller@googlegroups.com> wrote:
>> On Tue, Oct 3, 2017 at 8:19 AM, Dmitry Vyukov <dvyukov@google.com> wrote:
>>> On Mon, Oct 2, 2017 at 4:42 PM, 'Eric Dumazet' via syzkaller
>>> <syzkaller@googlegroups.com> wrote:
>>>> On Mon, Oct 2, 2017 at 7:21 AM, Mark Rutland <mark.rutland@arm.com> wrote:
>>>>> Hi Eric,
>>>>>
>>>>> On Mon, Oct 02, 2017 at 06:36:32AM -0700, Eric Dumazet wrote:
>>>>>> On Mon, Oct 2, 2017 at 3:49 AM, Mark Rutland <mark.rutland@arm.com> wrote:
>>>>>> > I hit the below splat at net/core/skbuff.c:2626 while fuzzing v4.14-rc2
>>>>>> > on arm64 with Syzkaller. This is the BUG_ON(len) at the end of
>>>>>> > skb_copy_and_csum_bits().
>>>>>
>>>>>> > kernel BUG at net/core/skbuff.c:2626!
>>>>>
>>>>>> > [<ffff200009e03214>] skb_copy_and_csum_bits+0x8dc/0xae0 net/core/skbuff.c:2626
>>>>>> > [<ffff20000a01d244>] icmp_glue_bits+0xa4/0x2a0 net/ipv4/icmp.c:357
>>>>>> > [<ffff200009f3f0d4>] __ip_append_data+0x10e4/0x20a8 net/ipv4/ip_output.c:1018
>>>>>> > [<ffff200009f41a88>] ip_append_data.part.3+0xe8/0x1a0 net/ipv4/ip_output.c:1170
>>>>>> > [<ffff200009f46e74>] ip_append_data+0xa4/0xb0 net/ipv4/ip_output.c:1173
>>>>>> > [<ffff20000a01ccc8>] icmp_push_reply+0x1b8/0x690 net/ipv4/icmp.c:375
>>>>>> > [<ffff20000a0211b0>] icmp_send+0x1070/0x1890 net/ipv4/icmp.c:741
>>>>>> > [<ffff200009f41d48>] ip_fragment.constprop.4+0x208/0x340 net/ipv4/ip_output.c:552
>>>>>> > [<ffff200009f42228>] ip_finish_output+0x3a8/0xab0 net/ipv4/ip_output.c:315
>>>>>> > [<ffff200009f468c4>] NF_HOOK_COND include/linux/netfilter.h:238 [inline]
>>>>>> > [<ffff200009f468c4>] ip_output+0x284/0x790 net/ipv4/ip_output.c:405
>>>>>> > [<ffff200009f43204>] dst_output include/net/dst.h:458 [inline]
>>>>>> > [<ffff200009f43204>] ip_local_out+0x9c/0x1b8 net/ipv4/ip_output.c:124
>>>>>> > [<ffff200009f445e8>] ip_queue_xmit+0x850/0x18e0 net/ipv4/ip_output.c:504
>>>>>> > [<ffff200009fb091c>] tcp_transmit_skb+0x107c/0x3338 net/ipv4/tcp_output.c:1123
>>>>>> > [<ffff200009fbbcc4>] __tcp_retransmit_skb+0x614/0x1d18 net/ipv4/tcp_output.c:2847
>>>>>> > [<ffff200009fbd840>] tcp_send_loss_probe+0x478/0x7d0 net/ipv4/tcp_output.c:2457
>>>>>> > [<ffff200009fc707c>] tcp_write_timer_handler+0x50c/0x7e8 net/ipv4/tcp_timer.c:557
>>>>>> > [<ffff200009fc73d0>] tcp_write_timer+0x78/0x170 net/ipv4/tcp_timer.c:579
>>>>>> > [<ffff2000082f8980>] call_timer_fn+0x1b8/0x430 kernel/time/timer.c:1281
>>>>>> > [<ffff2000082f8dcc>] expire_timers+0x1d4/0x320 kernel/time/timer.c:1320
>>>>>> > [<ffff2000082f912c>] __run_timers kernel/time/timer.c:1620 [inline]
>>>>>> > [<ffff2000082f912c>] run_timer_softirq+0x214/0x5f0 kernel/time/timer.c:1646
>>>>>> > [<ffff2000080826c0>] __do_softirq+0x350/0xc0c kernel/softirq.c:284
>>>>>> > [<ffff200008170af4>] do_softirq_own_stack include/linux/interrupt.h:498 [inline]
>>>>>> > [<ffff200008170af4>] invoke_softirq kernel/softirq.c:371 [inline]
>>>>>> > [<ffff200008170af4>] irq_exit+0x1dc/0x2f8 kernel/softirq.c:405
>>>>>> > [<ffff2000082a95bc>] __handle_domain_irq+0xdc/0x230 kernel/irq/irqdesc.c:647
>>>>>> > [<ffff2000080820ac>] handle_domain_irq include/linux/irqdesc.h:175 [inline]
>>>>>> > [<ffff2000080820ac>] gic_handle_irq+0x6c/0xe0 drivers/irqchip/irq-gic.c:367
>>>>>
>>>>>> This is most likely a bug caused by syzkaller setting a ridiculous MTU
>>>>>> on loopback device, below minimum size of ipv4 MTU.
>>>>>
>>>>>> I tried to track it in August [1], but it seems hard to find all the
>>>>>> issues with this.
>>>>>>
>>>>>> commit c780a049f9bf442314335372c9abc4548bfe3e44
>>>>>> Author: Eric Dumazet <edumazet@google.com>
>>>>>> Date:   Wed Aug 16 11:09:12 2017 -0700
>>>>>>
>>>>>>     ipv4: better IP_MAX_MTU enforcement
>>>>>>
>>>>>>     While working on yet another syzkaller report, I found
>>>>>>     that our IP_MAX_MTU enforcements were not properly done.
>>>>>>
>>>>>>     gcc seems to reload dev->mtu for min(dev->mtu, IP_MAX_MTU), and
>>>>>>     final result can be bigger than IP_MAX_MTU :/
>>>>>>
>>>>>>     This is a problem because device mtu can be changed on other cpus or
>>>>>>     threads.
>>>>>>
>>>>>>     While this patch does not fix the issue I am working on, it is
>>>>>>     probably worth addressing it.
>>>>>
>>>>> Just to check I've understood correctly, are you suggesting that the
>>>>> IPv4 code should also check the dev->mtu against a IP_MIN_MTU (which
>>>>> doesn't seem to exist today)?
>>>>
>>>> We have plenty of places this is checked.
>>>>
>>>> For example, trying to set MTU < 68 usually removes IPv4 addresses and routes.
>>>>
>>>> Problem is : these checks are not fool proof yet.
>>>>
>>>> ( Only the admin was supposed to play these games )
>>>>
>>>>>
>>>>> Otherwise, I do spot another potential issue. The writer side (e.g. most
>>>>> net_device::ndo_change_mtu implementations and the __dev_set_mtu()
>>>>> fallback) doesn't use WRITE_ONCE().
>>>>
>>>> It does not matter how many strange values can be observed by the reader :
>>>> We must be fool proof anyway from reader point of view, so the
>>>> WRITE_ONCE() is not strictly needed.
>>>
>>>
>>> Note if writer stores some temporal garbage there (which C language
>>> perfectly allows), it does not matter what we do on reader side --
>>> reader won't get correct data anyway. Say mtu changes from 1000 to
>>> 2000, but writer temporary stores 1 there, reader can observe 1 while
>>> it must not. Synchronization is always a game of two.
>>
>> Since we have no sync here, a reader _must_ cope with any MTU value.
>>
>> We need to care of any value, so we do not care how dummy writers can be.
>>
>> Sure, a WRITE_ONCE() will help avoiding some strange values being written,
>>  but since we _allow_ writers to write such strange values,
>> there is really no point pretending to be safe here.
>>
>> Adding a WRITE_ONCE() will not fix the bug.
>
>
> Reader must cope with any value. But there is an additional
> requirement that it must behave correctly.

As soon as we allow admin to change MTU on a device, we _are_ dropping packets.
This is absolutely normal so far.

On most NIC , an MTU change closes and opens the port, typically
blocking the device for several seconds.

Networking is best effort, there is no 'requirement' that networking
stack is supposed to cope with any events,
including silly admins.

The following is still legal

while :
do
 ifconfig lo mu random
done

And nothing asks us to deliver any packet while this loop is running.

We only have to prevent crashes ;)

> If mtu was 1000 and then
> reset to 2000 once (and not other manipulations with mtu), then
> correct behavior is either (1) sending packets with mtu 1000 or (2)
> sending packets with mtu 2000 (after mtu change) and nothing else.
> Sending packets with mtu 500, dropping packets because mtu is observed
> to be 1, or formatting hard drive are all incorrect behaviors and must
> not happen.
>
> What you say is valid for communication with user-space
> (copy_form_user, etc). Because there we don't control write side and
> racy writes are indistinguishable from intentional writes that do the
> same.

This is absolutely not relevant to the bug we are looking at at this
moment, even if true.

Unless you found the root cause, I will kindly ask you to open another
thread on lkml
to not pollute this one.

Thank you !

^ permalink raw reply

* Re: [PATCH iproute2] iproute: build more easily on Android
From: Lorenzo Colitti @ 2017-10-03 16:35 UTC (permalink / raw)
  To: enh; +Cc: Stephen Hemminger, netdev@vger.kernel.org
In-Reply-To: <CAJgzZor7VMuTWgRUvVp_q=DHEm5bdgHNWxBJ3wG+X9GWTEwB7Q@mail.gmail.com>

On Tue, Oct 3, 2017 at 5:23 AM, enh <enh@google.com> wrote:
>> Rather than moving everything, why not make kernel headers directory
>> configurable as part of the configure script setup process.
>
> the problem is that C libraries with their our own uapi headers still
> need your app-specific headers. to build iproute2 we need to put
> iproute2's include/ on our include path, but then the fact that your
> different uapi headers are *under* that directory causes the conflict.

Right - when building iproute2 we must have .../iproute2/include in
the include paths.

So when, say, ip/link_iptnl.c does #include <linux/in.h>, that file is
in two places in the path - the C library includes and the iproute
includes, and those two files conflict with each other.

There's no way to tell the compiler "use external/iproute2/include but
not external/iproute2/include/linux". But if the iproute2 files are in
uapi/ , then a simple #include <linux/in.h> won't find the UAPI copy
and the files won't conflict.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox