Netdev List

Netdev List
 help / color / mirror / Atom feed

* [iproute PATCH v2] ip-route: Fix segfault with many nexthops
From: Phil Sutter @ 2018-09-06 13:31 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev

It was possible to crash ip-route by adding an IPv6 route with 37
nexthop statements. A simple reproducer is:

| for i in `seq 37`; do
| 	nhs="nexthop via 1111::$i "$nhs
| done
| ip -6 route add 3333::/64 $nhs

The related code was broken in multiple ways:

* parse_one_nh() assumed that rta points to 4kB of storage but caller
  provided just 1kB. Fixed by passing 'len' parameter with the correct
  value.

* Error checking of rta_addattr*() calls in parse_one_nh() and called
  functions was completely absent, so with the above fix in place an
  output flood would occur due to the parser looping forever.

While at it, increase message buffer sizes to 4k. This allows for
at most 144 nexthops.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
Changes since v1:
- Remove accidentally added 'return 0' line from parse_nexthops().
- Increase buffer sizes.
---
 ip/iproute.c          |  43 ++++++++++-------
 ip/iproute_lwtunnel.c | 108 +++++++++++++++++++++++++-----------------
 2 files changed, 91 insertions(+), 60 deletions(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index 30833414a3f7f..398322fd1f4ff 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -941,7 +941,7 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 }
 
 static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
-			struct rtattr *rta, struct rtnexthop *rtnh,
+			struct rtattr *rta, size_t len, struct rtnexthop *rtnh,
 			int *argcp, char ***argvp)
 {
 	int argc = *argcp;
@@ -962,11 +962,16 @@ static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
 			if (r->rtm_family == AF_UNSPEC)
 				r->rtm_family = addr.family;
 			if (addr.family == r->rtm_family) {
-				rta_addattr_l(rta, 4096, RTA_GATEWAY, &addr.data, addr.bytelen);
-				rtnh->rtnh_len += sizeof(struct rtattr) + addr.bytelen;
+				if (rta_addattr_l(rta, len, RTA_GATEWAY,
+						  &addr.data, addr.bytelen))
+					return -1;
+				rtnh->rtnh_len += sizeof(struct rtattr)
+						  + addr.bytelen;
 			} else {
-				rta_addattr_l(rta, 4096, RTA_VIA, &addr.family, addr.bytelen+2);
-				rtnh->rtnh_len += RTA_SPACE(addr.bytelen+2);
+				if (rta_addattr_l(rta, len, RTA_VIA,
+						  &addr.family, addr.bytelen + 2))
+					return -1;
+				rtnh->rtnh_len += RTA_SPACE(addr.bytelen + 2);
 			}
 		} else if (strcmp(*argv, "dev") == 0) {
 			NEXT_ARG();
@@ -988,13 +993,15 @@ static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
 			NEXT_ARG();
 			if (get_rt_realms_or_raw(&realm, *argv))
 				invarg("\"realm\" value is invalid\n", *argv);
-			rta_addattr32(rta, 4096, RTA_FLOW, realm);
+			if (rta_addattr32(rta, len, RTA_FLOW, realm))
+				return -1;
 			rtnh->rtnh_len += sizeof(struct rtattr) + 4;
 		} else if (strcmp(*argv, "encap") == 0) {
-			int len = rta->rta_len;
+			int old_len = rta->rta_len;
 
-			lwt_parse_encap(rta, 4096, &argc, &argv);
-			rtnh->rtnh_len += rta->rta_len - len;
+			if (lwt_parse_encap(rta, len, &argc, &argv))
+				return -1;
+			rtnh->rtnh_len += rta->rta_len - old_len;
 		} else if (strcmp(*argv, "as") == 0) {
 			inet_prefix addr;
 
@@ -1002,8 +1009,9 @@ static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
 			if (strcmp(*argv, "to") == 0)
 				NEXT_ARG();
 			get_addr(&addr, *argv, r->rtm_family);
-			rta_addattr_l(rta, 4096, RTA_NEWDST, &addr.data,
-				      addr.bytelen);
+			if (rta_addattr_l(rta, len, RTA_NEWDST,
+					  &addr.data, addr.bytelen))
+				return -1;
 			rtnh->rtnh_len += sizeof(struct rtattr) + addr.bytelen;
 		} else
 			break;
@@ -1016,7 +1024,7 @@ static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
 static int parse_nexthops(struct nlmsghdr *n, struct rtmsg *r,
 			  int argc, char **argv)
 {
-	char buf[1024];
+	char buf[4096];
 	struct rtattr *rta = (void *)buf;
 	struct rtnexthop *rtnh;
 
@@ -1036,7 +1044,7 @@ static int parse_nexthops(struct nlmsghdr *n, struct rtmsg *r,
 		memset(rtnh, 0, sizeof(*rtnh));
 		rtnh->rtnh_len = sizeof(*rtnh);
 		rta->rta_len += rtnh->rtnh_len;
-		if (parse_one_nh(n, r, rta, rtnh, &argc, &argv)) {
+		if (parse_one_nh(n, r, rta, 4096, rtnh, &argc, &argv)) {
 			fprintf(stderr, "Error: cannot parse nexthop\n");
 			exit(-1);
 		}
@@ -1044,7 +1052,8 @@ static int parse_nexthops(struct nlmsghdr *n, struct rtmsg *r,
 	}
 
 	if (rta->rta_len > RTA_LENGTH(0))
-		addattr_l(n, 1024, RTA_MULTIPATH, RTA_DATA(rta), RTA_PAYLOAD(rta));
+		return addattr_l(n, 4096, RTA_MULTIPATH,
+				 RTA_DATA(rta), RTA_PAYLOAD(rta));
 	return 0;
 }
 
@@ -1053,7 +1062,7 @@ static int iproute_modify(int cmd, unsigned int flags, int argc, char **argv)
 	struct {
 		struct nlmsghdr	n;
 		struct rtmsg		r;
-		char			buf[1024];
+		char			buf[4096];
 	} req = {
 		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
 		.n.nlmsg_flags = NLM_F_REQUEST | flags,
@@ -1484,8 +1493,8 @@ static int iproute_modify(int cmd, unsigned int flags, int argc, char **argv)
 		addattr_l(&req.n, sizeof(req), RTA_METRICS, RTA_DATA(mxrta), RTA_PAYLOAD(mxrta));
 	}
 
-	if (nhs_ok)
-		parse_nexthops(&req.n, &req.r, argc, argv);
+	if (nhs_ok && parse_nexthops(&req.n, &req.r, argc, argv))
+		return -1;
 
 	if (req.r.rtm_family == AF_UNSPEC)
 		req.r.rtm_family = AF_INET;
diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c
index e604481142ec1..969a4763df71d 100644
--- a/ip/iproute_lwtunnel.c
+++ b/ip/iproute_lwtunnel.c
@@ -538,8 +538,9 @@ static int parse_encap_seg6(struct rtattr *rta, size_t len, int *argcp,
 
 	memcpy(tuninfo->srh, srh, srhlen);
 
-	rta_addattr_l(rta, len, SEG6_IPTUNNEL_SRH, tuninfo,
-		      sizeof(*tuninfo) + srhlen);
+	if (rta_addattr_l(rta, len, SEG6_IPTUNNEL_SRH, tuninfo,
+			  sizeof(*tuninfo) + srhlen))
+		return -1;
 
 	free(tuninfo);
 	free(srh);
@@ -611,6 +612,7 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 	char segbuf[1024];
 	inet_prefix addr;
 	__u32 hmac = 0;
+	int ret = 0;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "action") == 0) {
@@ -620,27 +622,28 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 			action = read_action_type(*argv);
 			if (!action)
 				invarg("\"action\" value is invalid\n", *argv);
-			rta_addattr32(rta, len, SEG6_LOCAL_ACTION, action);
+			ret = rta_addattr32(rta, len, SEG6_LOCAL_ACTION,
+					    action);
 		} else if (strcmp(*argv, "table") == 0) {
 			NEXT_ARG();
 			if (table_ok++)
 				duparg2("table", *argv);
 			get_u32(&table, *argv, 0);
-			rta_addattr32(rta, len, SEG6_LOCAL_TABLE, table);
+			ret = rta_addattr32(rta, len, SEG6_LOCAL_TABLE, table);
 		} else if (strcmp(*argv, "nh4") == 0) {
 			NEXT_ARG();
 			if (nh4_ok++)
 				duparg2("nh4", *argv);
 			get_addr(&addr, *argv, AF_INET);
-			rta_addattr_l(rta, len, SEG6_LOCAL_NH4, &addr.data,
-				      addr.bytelen);
+			ret = rta_addattr_l(rta, len, SEG6_LOCAL_NH4,
+					    &addr.data, addr.bytelen);
 		} else if (strcmp(*argv, "nh6") == 0) {
 			NEXT_ARG();
 			if (nh6_ok++)
 				duparg2("nh6", *argv);
 			get_addr(&addr, *argv, AF_INET6);
-			rta_addattr_l(rta, len, SEG6_LOCAL_NH6, &addr.data,
-				      addr.bytelen);
+			ret = rta_addattr_l(rta, len, SEG6_LOCAL_NH6,
+					    &addr.data, addr.bytelen);
 		} else if (strcmp(*argv, "iif") == 0) {
 			NEXT_ARG();
 			if (iif_ok++)
@@ -648,7 +651,7 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 			iif = ll_name_to_index(*argv);
 			if (!iif)
 				exit(nodev(*argv));
-			rta_addattr32(rta, len, SEG6_LOCAL_IIF, iif);
+			ret = rta_addattr32(rta, len, SEG6_LOCAL_IIF, iif);
 		} else if (strcmp(*argv, "oif") == 0) {
 			NEXT_ARG();
 			if (oif_ok++)
@@ -656,7 +659,7 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 			oif = ll_name_to_index(*argv);
 			if (!oif)
 				exit(nodev(*argv));
-			rta_addattr32(rta, len, SEG6_LOCAL_OIF, oif);
+			ret = rta_addattr32(rta, len, SEG6_LOCAL_OIF, oif);
 		} else if (strcmp(*argv, "srh") == 0) {
 			NEXT_ARG();
 			if (srh_ok++)
@@ -691,6 +694,8 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 		} else {
 			break;
 		}
+		if (ret)
+			return ret;
 		argc--; argv++;
 	}
 
@@ -705,14 +710,14 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 		srh = parse_srh(segbuf, hmac,
 				action == SEG6_LOCAL_ACTION_END_B6_ENCAP);
 		srhlen = (srh->hdrlen + 1) << 3;
-		rta_addattr_l(rta, len, SEG6_LOCAL_SRH, srh, srhlen);
+		ret = rta_addattr_l(rta, len, SEG6_LOCAL_SRH, srh, srhlen);
 		free(srh);
 	}
 
 	*argcp = argc + 1;
 	*argvp = argv - 1;
 
-	return 0;
+	return ret;
 }
 
 static int parse_encap_mpls(struct rtattr *rta, size_t len,
@@ -730,8 +735,9 @@ static int parse_encap_mpls(struct rtattr *rta, size_t len,
 		exit(1);
 	}
 
-	rta_addattr_l(rta, len, MPLS_IPTUNNEL_DST, &addr.data,
-		      addr.bytelen);
+	if (rta_addattr_l(rta, len, MPLS_IPTUNNEL_DST,
+			  &addr.data, addr.bytelen))
+		return -1;
 
 	argc--;
 	argv++;
@@ -745,7 +751,8 @@ static int parse_encap_mpls(struct rtattr *rta, size_t len,
 				duparg2("ttl", *argv);
 			if (get_u8(&ttl, *argv, 0))
 				invarg("\"ttl\" value is invalid\n", *argv);
-			rta_addattr8(rta, len, MPLS_IPTUNNEL_TTL, ttl);
+			if (rta_addattr8(rta, len, MPLS_IPTUNNEL_TTL, ttl))
+				return -1;
 		} else {
 			break;
 		}
@@ -768,6 +775,7 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 	int id_ok = 0, dst_ok = 0, tos_ok = 0, ttl_ok = 0;
 	char **argv = *argvp;
 	int argc = *argcp;
+	int ret = 0;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "id") == 0) {
@@ -778,7 +786,7 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 				duparg2("id", *argv);
 			if (get_be64(&id, *argv, 0))
 				invarg("\"id\" value is invalid\n", *argv);
-			rta_addattr64(rta, len, LWTUNNEL_IP_ID, id);
+			ret = rta_addattr64(rta, len, LWTUNNEL_IP_ID, id);
 		} else if (strcmp(*argv, "dst") == 0) {
 			inet_prefix addr;
 
@@ -786,8 +794,8 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 			if (dst_ok++)
 				duparg2("dst", *argv);
 			get_addr(&addr, *argv, AF_INET);
-			rta_addattr_l(rta, len, LWTUNNEL_IP_DST,
-				      &addr.data, addr.bytelen);
+			ret = rta_addattr_l(rta, len, LWTUNNEL_IP_DST,
+					    &addr.data, addr.bytelen);
 		} else if (strcmp(*argv, "tos") == 0) {
 			__u32 tos;
 
@@ -796,7 +804,7 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 				duparg2("tos", *argv);
 			if (rtnl_dsfield_a2n(&tos, *argv))
 				invarg("\"tos\" value is invalid\n", *argv);
-			rta_addattr8(rta, len, LWTUNNEL_IP_TOS, tos);
+			ret = rta_addattr8(rta, len, LWTUNNEL_IP_TOS, tos);
 		} else if (strcmp(*argv, "ttl") == 0) {
 			__u8 ttl;
 
@@ -805,10 +813,12 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 				duparg2("ttl", *argv);
 			if (get_u8(&ttl, *argv, 0))
 				invarg("\"ttl\" value is invalid\n", *argv);
-			rta_addattr8(rta, len, LWTUNNEL_IP_TTL, ttl);
+			ret = rta_addattr8(rta, len, LWTUNNEL_IP_TTL, ttl);
 		} else {
 			break;
 		}
+		if (ret)
+			break;
 		argc--; argv++;
 	}
 
@@ -819,7 +829,7 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 	*argcp = argc + 1;
 	*argvp = argv - 1;
 
-	return 0;
+	return ret;
 }
 
 static int parse_encap_ila(struct rtattr *rta, size_t len,
@@ -828,6 +838,7 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 	__u64 locator;
 	int argc = *argcp;
 	char **argv = *argvp;
+	int ret = 0;
 
 	if (get_addr64(&locator, *argv) < 0) {
 		fprintf(stderr, "Bad locator: %s\n", *argv);
@@ -836,7 +847,8 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 
 	argc--; argv++;
 
-	rta_addattr64(rta, 1024, ILA_ATTR_LOCATOR, locator);
+	if (rta_addattr64(rta, 1024, ILA_ATTR_LOCATOR, locator))
+		return -1;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "csum-mode") == 0) {
@@ -849,8 +861,8 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 				invarg("\"csum-mode\" value is invalid\n",
 				       *argv);
 
-			rta_addattr8(rta, 1024, ILA_ATTR_CSUM_MODE,
-				     (__u8)csum_mode);
+			ret = rta_addattr8(rta, 1024, ILA_ATTR_CSUM_MODE,
+					   (__u8)csum_mode);
 
 			argc--; argv++;
 		} else if (strcmp(*argv, "ident-type") == 0) {
@@ -863,8 +875,8 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 				invarg("\"ident-type\" value is invalid\n",
 				       *argv);
 
-			rta_addattr8(rta, 1024, ILA_ATTR_IDENT_TYPE,
-				     (__u8)ident_type);
+			ret = rta_addattr8(rta, 1024, ILA_ATTR_IDENT_TYPE,
+					   (__u8)ident_type);
 
 			argc--; argv++;
 		} else if (strcmp(*argv, "hook-type") == 0) {
@@ -877,13 +889,15 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 				invarg("\"hook-type\" value is invalid\n",
 				       *argv);
 
-			rta_addattr8(rta, 1024, ILA_ATTR_HOOK_TYPE,
-				     (__u8)hook_type);
+			ret = rta_addattr8(rta, 1024, ILA_ATTR_HOOK_TYPE,
+					   (__u8)hook_type);
 
 			argc--; argv++;
 		} else {
 			break;
 		}
+		if (ret)
+			break;
 	}
 
 	/* argv is currently the first unparsed argument,
@@ -893,7 +907,7 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 	*argcp = argc + 1;
 	*argvp = argv - 1;
 
-	return 0;
+	return ret;
 }
 
 static int parse_encap_ip6(struct rtattr *rta, size_t len,
@@ -902,6 +916,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 	int id_ok = 0, dst_ok = 0, tos_ok = 0, ttl_ok = 0;
 	char **argv = *argvp;
 	int argc = *argcp;
+	int ret = 0;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "id") == 0) {
@@ -912,7 +927,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 				duparg2("id", *argv);
 			if (get_be64(&id, *argv, 0))
 				invarg("\"id\" value is invalid\n", *argv);
-			rta_addattr64(rta, len, LWTUNNEL_IP6_ID, id);
+			ret = rta_addattr64(rta, len, LWTUNNEL_IP6_ID, id);
 		} else if (strcmp(*argv, "dst") == 0) {
 			inet_prefix addr;
 
@@ -920,8 +935,8 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 			if (dst_ok++)
 				duparg2("dst", *argv);
 			get_addr(&addr, *argv, AF_INET6);
-			rta_addattr_l(rta, len, LWTUNNEL_IP6_DST,
-				      &addr.data, addr.bytelen);
+			ret = rta_addattr_l(rta, len, LWTUNNEL_IP6_DST,
+					    &addr.data, addr.bytelen);
 		} else if (strcmp(*argv, "tc") == 0) {
 			__u32 tc;
 
@@ -930,7 +945,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 				duparg2("tc", *argv);
 			if (rtnl_dsfield_a2n(&tc, *argv))
 				invarg("\"tc\" value is invalid\n", *argv);
-			rta_addattr8(rta, len, LWTUNNEL_IP6_TC, tc);
+			ret = rta_addattr8(rta, len, LWTUNNEL_IP6_TC, tc);
 		} else if (strcmp(*argv, "hoplimit") == 0) {
 			__u8 hoplimit;
 
@@ -940,10 +955,13 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 			if (get_u8(&hoplimit, *argv, 0))
 				invarg("\"hoplimit\" value is invalid\n",
 				       *argv);
-			rta_addattr8(rta, len, LWTUNNEL_IP6_HOPLIMIT, hoplimit);
+			ret = rta_addattr8(rta, len, LWTUNNEL_IP6_HOPLIMIT,
+					   hoplimit);
 		} else {
 			break;
 		}
+		if (ret)
+			break;
 		argc--; argv++;
 	}
 
@@ -954,7 +972,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 	*argcp = argc + 1;
 	*argvp = argv - 1;
 
-	return 0;
+	return ret;
 }
 
 static void lwt_bpf_usage(void)
@@ -1021,6 +1039,7 @@ int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp)
 	int argc = *argcp;
 	char **argv = *argvp;
 	__u16 type;
+	int ret = 0;
 
 	NEXT_ARG();
 	type = read_encap_type(*argv);
@@ -1037,37 +1056,40 @@ int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp)
 	nest = rta_nest(rta, 1024, RTA_ENCAP);
 	switch (type) {
 	case LWTUNNEL_ENCAP_MPLS:
-		parse_encap_mpls(rta, len, &argc, &argv);
+		ret = parse_encap_mpls(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_IP:
-		parse_encap_ip(rta, len, &argc, &argv);
+		ret = parse_encap_ip(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_ILA:
-		parse_encap_ila(rta, len, &argc, &argv);
+		ret = parse_encap_ila(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_IP6:
-		parse_encap_ip6(rta, len, &argc, &argv);
+		ret = parse_encap_ip6(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_BPF:
 		if (parse_encap_bpf(rta, len, &argc, &argv) < 0)
 			exit(-1);
 		break;
 	case LWTUNNEL_ENCAP_SEG6:
-		parse_encap_seg6(rta, len, &argc, &argv);
+		ret = parse_encap_seg6(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_SEG6_LOCAL:
-		parse_encap_seg6local(rta, len, &argc, &argv);
+		ret = parse_encap_seg6local(rta, len, &argc, &argv);
 		break;
 	default:
 		fprintf(stderr, "Error: unsupported encap type\n");
 		break;
 	}
+	if (ret)
+		return ret;
+
 	rta_nest_end(rta, nest);
 
-	rta_addattr16(rta, 1024, RTA_ENCAP_TYPE, type);
+	ret = rta_addattr16(rta, 1024, RTA_ENCAP_TYPE, type);
 
 	*argcp = argc;
 	*argvp = argv;
 
-	return 0;
+	return ret;
 }
-- 
2.18.0

^ permalink raw reply related

* Re: [PATCH net-next v2 3/7] net: aquantia: implement WOL support
From: Andrew Lunn @ 2018-09-06 13:34 UTC (permalink / raw)
  To: Igor Russkikh; +Cc: David S . Miller, netdev, Yana Esina, Nikita Danilov
In-Reply-To: <66632a61b9d43ffe804de8abf3a09ec825ff2754.1536233536.git.igor.russkikh@aquantia.com>

On Thu, Sep 06, 2018 at 04:05:58PM +0300, Igor Russkikh wrote:
> From: Yana Esina <yana.esina@aquantia.com>
> 
> Add WOL support. Currently only magic packet
> (ethtool -s <ethX> wol g) feature is implemented.
> 
> Remove hw_set_power and move that to FW_OPS set_power:
> because WOL configuration behaves differently on 1x and 2x
> firmwares

Hi Igor, Yana

It looks like the set_power refactor could have been a patch of its own.
Then add the WOL support as a second patch.

In the future, try to make a patch do one thing, and only one
thing. It makes them easier to review.

   Andrew

^ permalink raw reply

* RE: [PATCH v2 1/2] net: ethernet: i40e: fix build error
From: Wyborny, Carolyn @ 2018-09-06 18:15 UTC (permalink / raw)
  To: Andrew Lunn, Keller, Jacob E
  Cc: Wang, Dongsheng, Kirsher, Jeffrey T,
	sergei.shtylyov@cogentembedded.com, davem@davemloft.net,
	intel-wired-lan@lists.osuosl.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20180906180316.GB26997@lunn.ch>

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of Andrew Lunn
> Sent: Thursday, September 06, 2018 11:03 AM
> To: Keller, Jacob E <jacob.e.keller@intel.com>
> Cc: Wang, Dongsheng <dongsheng.wang@hxt-semitech.com>; Kirsher,
> Jeffrey T <jeffrey.t.kirsher@intel.com>;
> sergei.shtylyov@cogentembedded.com; davem@davemloft.net; intel-
> wired-lan@lists.osuosl.org; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] net: ethernet: i40e: fix build error
> 
[..]
> You have a few options here.
> 
> 1) A library module, containing shared code. Use EXPORT_SYMBOL_GPL()
> in the library module, and the kernel runtime linker will link the
> calls into the library. Also, modprobe will ensure the library module
> is loaded first, before the driver module.
> 
> 2) Build time sharing of code. Place the shared code into a .o file,
> and link it to both modules.
> 
> There is nothing particularly difficult here; this is all done lots of
> times within the kernel. Just look around and see how others do it.

Thanks Andrew,

Yes, I agree and we do have a team working on doing this.

Carolyn

Carolyn Wyborny 
Linux Development 
Networking Division 
Intel Corporation 

^ permalink raw reply

* Re: [PATCH net-next v2 5/7] net: aquantia: whitespace changes
From: Andrew Lunn @ 2018-09-06 13:42 UTC (permalink / raw)
  To: Igor Russkikh; +Cc: David S . Miller, netdev, Nikita Danilov
In-Reply-To: <5bf4e7aed7f6d93dcfd67e6aabc55561a28e0f70.1536233536.git.igor.russkikh@aquantia.com>

On Thu, Sep 06, 2018 at 04:06:00PM +0300, Igor Russkikh wrote:
> From: Nikita Danilov <nikita.danilov@aquantia.com>
> 
> Removed extra spaces, corrected alignment.
> 
> Signed-off-by: Nikita Danilov <nikita.danilov@aquantia.com>
> Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH net-next v2 6/7] net: aquantia: renaming for better visibility
From: Andrew Lunn @ 2018-09-06 13:44 UTC (permalink / raw)
  To: Igor Russkikh; +Cc: David S . Miller, netdev, Nikita Danilov
In-Reply-To: <12f20049c9ca52bacef91a2d2e6f71f05a2fbbb2.1536233536.git.igor.russkikh@aquantia.com>

On Thu, Sep 06, 2018 at 04:06:01PM +0300, Igor Russkikh wrote:
> From: Nikita Danilov <nikita.danilov@aquantia.com>
> 
> Removed extra characters from the names of structures to unify prefixes
> used through the driver code (we normally use hw_atl for hw specifics).
> HW_ATL_B0_ and HW_ATL_A0_ are the same and useless copies.
> 
> Signed-off-by: Nikita Danilov <nikita.danilov@aquantia.com>
> Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH net-next v2 3/7] net: aquantia: implement WOL support
From: Andrew Lunn @ 2018-09-06 13:44 UTC (permalink / raw)
  To: Igor Russkikh; +Cc: David S . Miller, netdev, Yana Esina, Nikita Danilov
In-Reply-To: <66632a61b9d43ffe804de8abf3a09ec825ff2754.1536233536.git.igor.russkikh@aquantia.com>

On Thu, Sep 06, 2018 at 04:05:58PM +0300, Igor Russkikh wrote:
> From: Yana Esina <yana.esina@aquantia.com>
> 
> Add WOL support. Currently only magic packet
> (ethtool -s <ethX> wol g) feature is implemented.
> 
> Remove hw_set_power and move that to FW_OPS set_power:
> because WOL configuration behaves differently on 1x and 2x
> firmwares
> 
> Signed-off-by: Yana Esina <yana.esina@aquantia.com>
> Signed-off-by: Nikita Danilov <nikita.danilov@aquantia.com>
> Tested-by: Nikita Danilov <nikita.danilov@aquantia.com>
> Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* [PATCH] tcp: really ignore MSG_ZEROCOPY if no SO_ZEROCOPY
From: Vincent Whitchurch @ 2018-09-06 13:54 UTC (permalink / raw)
  To: davem; +Cc: netdev, willemb, Vincent Whitchurch

According to the documentation in msg_zerocopy.rst, the SO_ZEROCOPY
flag was introduced because send(2) ignores unknown message flags and
any legacy application which was accidentally passing the equivalent of
MSG_ZEROCOPY earlier should not see any new behaviour.

Before commit f214f915e7db ("tcp: enable MSG_ZEROCOPY"), a send(2) call
which passed the equivalent of MSG_ZEROCOPY without setting SO_ZEROCOPY
would succeed.  However, after that commit, it fails with -ENOBUFS.  So
it appears that the SO_ZEROCOPY flag fails to fulfill its intended
purpose.  Fix it.

Fixes: f214f915e7db ("tcp: enable MSG_ZEROCOPY")
Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
---
 net/core/skbuff.c | 3 ---
 net/ipv4/tcp.c    | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c996c09d095f..b2c807f67aba 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -939,9 +939,6 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)

 	WARN_ON_ONCE(!in_task());

-	if (!sock_flag(sk, SOCK_ZEROCOPY))
-		return NULL;
-
 	skb = sock_omalloc(sk, 0, GFP_KERNEL);
 	if (!skb)
 		return NULL;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b8af2fec5ad5..10c6246396cc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1185,7 +1185,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)

 	flags = msg->msg_flags;

-	if (flags & MSG_ZEROCOPY && size) {
+	if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
 		if (sk->sk_state != TCP_ESTABLISHED) {
 			err = -EINVAL;
 			goto out_err;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next] net: dsa: b53: Fix build with B53_SRAB enabled and not B53_SERDES
From: Florian Fainelli @ 2018-09-06 18:42 UTC (permalink / raw)
  To: netdev
  Cc: Florian Fainelli, Andrew Lunn, Vivien Didelot, David S. Miller,
	open list

In case B53_SRAB is enabled, but not B53_SERDES, we can get the
following linking error:

ERROR: "b53_serdes_init" [drivers/net/dsa/b53/b53_srab.ko] undefined!

We also need to ifdef out b53_srab_serdes_map_lane() since it would
not be used when B53_SERDES is disabled, and an unused static function
would produce a compiler warning.

Fixes: 0e01491de646 ("net: dsa: b53: Add SerDes support")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/b53/b53_serdes.h | 7 +++++++
 drivers/net/dsa/b53/b53_srab.c   | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/drivers/net/dsa/b53/b53_serdes.h b/drivers/net/dsa/b53/b53_serdes.h
index e0674aa0167f..eed7c9357091 100644
--- a/drivers/net/dsa/b53/b53_serdes.h
+++ b/drivers/net/dsa/b53/b53_serdes.h
@@ -118,4 +118,11 @@ void b53_serdes_link_set(struct b53_device *dev, int port, unsigned int mode,
 void b53_serdes_phylink_validate(struct b53_device *dev, int port,
 				unsigned long *supported,
 				struct phylink_link_state *state);
+#if IS_ENABLED(CONFIG_B53_SERDES)
 int b53_serdes_init(struct b53_device *dev, int port);
+#else
+static inline int b53_serdes_init(struct b53_device *dev, int port)
+{
+	return -ENODEV;
+}
+#endif
diff --git a/drivers/net/dsa/b53/b53_srab.c b/drivers/net/dsa/b53/b53_srab.c
index 149788697fd6..b0ed81876bae 100644
--- a/drivers/net/dsa/b53/b53_srab.c
+++ b/drivers/net/dsa/b53/b53_srab.c
@@ -390,6 +390,7 @@ static irqreturn_t b53_srab_port_isr(int irq, void *dev_id)
 	return IRQ_WAKE_THREAD;
 }
 
+#if IS_ENABLED(CONFIG_B53_SERDES)
 static u8 b53_srab_serdes_map_lane(struct b53_device *dev, int port)
 {
 	struct b53_srab_priv *priv = dev->priv;
@@ -407,6 +408,7 @@ static u8 b53_srab_serdes_map_lane(struct b53_device *dev, int port)
 		return B53_INVALID_LANE;
 	}
 }
+#endif
 
 static int b53_srab_irq_enable(struct b53_device *dev, int port)
 {
-- 
2.17.1

^ permalink raw reply related

* [PATCH v2 net-next 0/4] net: batched receive in GRO path
From: Edward Cree @ 2018-09-06 14:24 UTC (permalink / raw)
  To: davem; +Cc: linux-net-drivers, netdev

This series listifies part of GRO processing, in a manner which allows those
 packets which are not GROed (i.e. for which dev_gro_receive returns
 GRO_NORMAL) to be passed on to the listified regular receive path.
I have not listified dev_gro_receive() itself, or the per-protocol GRO
 callback, since GRO's need to hold packets on lists under napi->gro_hash
 makes keeping the packets on other lists awkward, and since the GRO control
 block state of held skbs can refer only to one 'new' skb at a time.
 Nonetheless the batching of the calling code yields some performance gains
 in the GRO case as well.

Herewith the performance figures obtained in a NetPerf TCP stream test (with
 four streams, and irqs bound to a single core):
net-next: 7.166 Gbit/s (sigma 0.435)
after #2: 7.715 Gbit/s (sigma 0.145) = datum + 7.7%
after #4: 7.890 Gbit/s (sigma 0.217) = datum + 10.1%
(Note that the 'net-next' results were distinctly bimodal, with two results
 of about 8 Gbit/s and the remaining ten around 7 Gbit/s.  I don't have a
 good explanation for this.)
And with GRO disabled through ethtool -K (thus simulating traffic which is
 not GRO-able but, being TCP, is still passed to the GRO entry point):
net-next: 4.756 Gbit/s (sigma 0.240)
after #4: 5.355 Gbit/s (sigma 0.232) = datum + 12.6%

v2: Rebased on latest net-next.  Removed RFC tags.  Otherwise unchanged
 owing to lack of comments on v1.

Edward Cree (4):
  net: introduce list entry point for GRO
  sfc: use batched receive for GRO
  net: make listified RX functions return number of good packets
  net/core: handle GRO_NORMAL skbs as a list in napi_gro_receive_list

 drivers/net/ethernet/sfc/efx.c        |  11 +++-
 drivers/net/ethernet/sfc/net_driver.h |   1 +
 drivers/net/ethernet/sfc/rx.c         |  16 +++++-
 include/linux/netdevice.h             |   6 +-
 include/net/ip.h                      |   4 +-
 include/net/ipv6.h                    |   4 +-
 net/core/dev.c                        | 104 ++++++++++++++++++++++++++--------
 net/ipv4/ip_input.c                   |  39 ++++++++-----
 net/ipv6/ip6_input.c                  |  37 +++++++-----
 9 files changed, 157 insertions(+), 65 deletions(-)

^ permalink raw reply

* [PATCH v2 net-next 1/4] net: introduce list entry point for GRO
From: Edward Cree @ 2018-09-06 14:26 UTC (permalink / raw)
  To: davem; +Cc: linux-net-drivers, netdev
In-Reply-To: <c1e79c86-56ae-98c6-8dc0-c227f91ee9bc@solarflare.com>

Also export napi_frags_skb() so that drivers using the napi_gro_frags()
 interface can prepare their SKBs properly for submitting on such a list.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 28 +++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e2b3bd750c98..2b53536b1d99 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3548,8 +3548,10 @@ int netif_receive_skb(struct sk_buff *skb);
 int netif_receive_skb_core(struct sk_buff *skb);
 void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
+int napi_gro_receive_list(struct napi_struct *napi, struct list_head *head);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
+struct sk_buff *napi_frags_skb(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 struct packet_offload *gro_find_receive_by_type(__be16 type);
 struct packet_offload *gro_find_complete_by_type(__be16 type);
diff --git a/net/core/dev.c b/net/core/dev.c
index ca78dc5a79a3..8df39ded77bd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5598,6 +5598,31 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(napi_gro_receive);
 
+/* Returns the number of SKBs on the list successfully received */
+int napi_gro_receive_list(struct napi_struct *napi, struct list_head *head)
+{
+	struct sk_buff *skb, *next;
+	gro_result_t result;
+	int kept = 0;
+
+	list_for_each_entry(skb, head, list) {
+		skb_mark_napi_id(skb, napi);
+		trace_napi_gro_receive_entry(skb);
+		skb_gro_reset_offset(skb);
+	}
+
+	list_for_each_entry_safe(skb, next, head, list) {
+		list_del(&skb->list);
+		skb->next = NULL;
+		result = dev_gro_receive(napi, skb);
+		result = napi_skb_finish(result, skb);
+		if (result != GRO_DROP)
+			kept++;
+	}
+	return kept;
+}
+EXPORT_SYMBOL(napi_gro_receive_list);
+
 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 {
 	if (unlikely(skb->pfmemalloc)) {
@@ -5669,7 +5694,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
  * Drivers could call both napi_gro_frags() and napi_gro_receive()
  * We copy ethernet header into skb->data to have a common layout.
  */
-static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 {
 	struct sk_buff *skb = napi->skb;
 	const struct ethhdr *eth;
@@ -5705,6 +5730,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 
 	return skb;
 }
+EXPORT_SYMBOL(napi_frags_skb);
 
 gro_result_t napi_gro_frags(struct napi_struct *napi)
 {

^ permalink raw reply related

* [PATCH v2 net-next 2/4] sfc: use batched receive for GRO
From: Edward Cree @ 2018-09-06 14:26 UTC (permalink / raw)
  To: davem; +Cc: linux-net-drivers, netdev
In-Reply-To: <c1e79c86-56ae-98c6-8dc0-c227f91ee9bc@solarflare.com>

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c        | 11 +++++++++--
 drivers/net/ethernet/sfc/net_driver.h |  1 +
 drivers/net/ethernet/sfc/rx.c         | 16 +++++++++++++---
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 330233286e78..dba13a28014c 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -263,9 +263,9 @@ static int efx_check_disabled(struct efx_nic *efx)
  */
 static int efx_process_channel(struct efx_channel *channel, int budget)
 {
+	struct list_head rx_list, gro_list;
 	struct efx_tx_queue *tx_queue;
-	struct list_head rx_list;
-	int spent;
+	int spent, gro_count;
 
 	if (unlikely(!channel->enabled))
 		return 0;
@@ -275,6 +275,10 @@ static int efx_process_channel(struct efx_channel *channel, int budget)
 	INIT_LIST_HEAD(&rx_list);
 	channel->rx_list = &rx_list;
 
+	EFX_WARN_ON_PARANOID(channel->gro_list != NULL);
+	INIT_LIST_HEAD(&gro_list);
+	channel->gro_list = &gro_list;
+
 	efx_for_each_channel_tx_queue(tx_queue, channel) {
 		tx_queue->pkts_compl = 0;
 		tx_queue->bytes_compl = 0;
@@ -300,6 +304,9 @@ static int efx_process_channel(struct efx_channel *channel, int budget)
 	/* Receive any packets we queued up */
 	netif_receive_skb_list(channel->rx_list);
 	channel->rx_list = NULL;
+	gro_count = napi_gro_receive_list(&channel->napi_str, channel->gro_list);
+	channel->irq_mod_score += gro_count * 2;
+	channel->gro_list = NULL;
 
 	return spent;
 }
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 961b92979640..72addac7a84a 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -502,6 +502,7 @@ struct efx_channel {
 	unsigned int rx_pkt_index;
 
 	struct list_head *rx_list;
+	struct list_head *gro_list;
 
 	struct efx_rx_queue rx_queue;
 	struct efx_tx_queue tx_queue[EFX_TXQ_TYPES];
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 396ff01298cd..0534a54048c6 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -453,9 +453,19 @@ efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf,
 
 	skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
-	gro_result = napi_gro_frags(napi);
-	if (gro_result != GRO_DROP)
-		channel->irq_mod_score += 2;
+	/* Pass the packet up */
+	if (channel->gro_list != NULL) {
+		/* Clear napi->skb and prepare skb for GRO */
+		skb = napi_frags_skb(napi);
+		if (skb)
+			/* Add to list, will pass up later */
+			list_add_tail(&skb->list, channel->gro_list);
+	} else {
+		/* No list, so pass it up now */
+		gro_result = napi_gro_frags(napi);
+		if (gro_result != GRO_DROP)
+			channel->irq_mod_score += 2;
+	}
 }
 
 /* Allocate and construct an SKB around page fragments */

^ permalink raw reply related

* [PATCH v2 net-next 3/4] net: make listified RX functions return number of good packets
From: Edward Cree @ 2018-09-06 14:26 UTC (permalink / raw)
  To: davem; +Cc: linux-net-drivers, netdev
In-Reply-To: <c1e79c86-56ae-98c6-8dc0-c227f91ee9bc@solarflare.com>

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 include/linux/netdevice.h |  4 +--
 include/net/ip.h          |  4 +--
 include/net/ipv6.h        |  4 +--
 net/core/dev.c            | 63 +++++++++++++++++++++++++++++------------------
 net/ipv4/ip_input.c       | 39 ++++++++++++++++++-----------
 net/ipv6/ip6_input.c      | 37 +++++++++++++++++-----------
 6 files changed, 92 insertions(+), 59 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2b53536b1d99..9b3fc5944ba5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2349,7 +2349,7 @@ struct packet_type {
 					 struct net_device *,
 					 struct packet_type *,
 					 struct net_device *);
-	void			(*list_func) (struct list_head *,
+	int			(*list_func) (struct list_head *,
 					      struct packet_type *,
 					      struct net_device *);
 	bool			(*id_match)(struct packet_type *ptype,
@@ -3546,7 +3546,7 @@ int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
 int netif_receive_skb_core(struct sk_buff *skb);
-void netif_receive_skb_list(struct list_head *head);
+int netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 int napi_gro_receive_list(struct napi_struct *napi, struct list_head *head);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
diff --git a/include/net/ip.h b/include/net/ip.h
index e44b1a44f67a..aab1f7eea1e1 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -152,8 +152,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
 			  struct ip_options_rcu *opt);
 int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	   struct net_device *orig_dev);
-void ip_list_rcv(struct list_head *head, struct packet_type *pt,
-		 struct net_device *orig_dev);
+int ip_list_rcv(struct list_head *head, struct packet_type *pt,
+		struct net_device *orig_dev);
 int ip_local_deliver(struct sk_buff *skb);
 int ip_mr_input(struct sk_buff *skb);
 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index ff33f498c137..f15651eabfe0 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -914,8 +914,8 @@ static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6)
 
 int ipv6_rcv(struct sk_buff *skb, struct net_device *dev,
 	     struct packet_type *pt, struct net_device *orig_dev);
-void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
-		   struct net_device *orig_dev);
+int ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
+		  struct net_device *orig_dev);
 
 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 8df39ded77bd..69e2819994e4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4922,24 +4922,27 @@ int netif_receive_skb_core(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_receive_skb_core);
 
-static inline void __netif_receive_skb_list_ptype(struct list_head *head,
-						  struct packet_type *pt_prev,
-						  struct net_device *orig_dev)
+static inline int __netif_receive_skb_list_ptype(struct list_head *head,
+						 struct packet_type *pt_prev,
+						 struct net_device *orig_dev)
 {
 	struct sk_buff *skb, *next;
+	int kept = 0;
 
 	if (!pt_prev)
-		return;
+		return 0;
 	if (list_empty(head))
-		return;
+		return 0;
 	if (pt_prev->list_func != NULL)
-		pt_prev->list_func(head, pt_prev, orig_dev);
+		kept = pt_prev->list_func(head, pt_prev, orig_dev);
 	else
 		list_for_each_entry_safe(skb, next, head, list)
-			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+			if (pt_prev->func(skb, skb->dev, pt_prev, orig_dev) == NET_RX_SUCCESS)
+				kept++;
+	return kept;
 }
 
-static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
+static int __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 {
 	/* Fast-path assumptions:
 	 * - There is no RX handler.
@@ -4956,6 +4959,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
 	struct net_device *od_curr = NULL;
 	struct list_head sublist;
 	struct sk_buff *skb, *next;
+	int kept = 0, ret;
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
@@ -4963,12 +4967,15 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
 		struct packet_type *pt_prev = NULL;
 
 		list_del(&skb->list);
-		__netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
-		if (!pt_prev)
+		ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+		if (!pt_prev) {
+			if (ret == NET_RX_SUCCESS)
+				kept++;
 			continue;
+		}
 		if (pt_curr != pt_prev || od_curr != orig_dev) {
 			/* dispatch old sublist */
-			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+			kept += __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 			/* start new sublist */
 			INIT_LIST_HEAD(&sublist);
 			pt_curr = pt_prev;
@@ -4978,7 +4985,8 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
 	}
 
 	/* dispatch final sublist */
-	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+	kept += __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+	return kept;
 }
 
 static int __netif_receive_skb(struct sk_buff *skb)
@@ -5006,11 +5014,12 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	return ret;
 }
 
-static void __netif_receive_skb_list(struct list_head *head)
+static int __netif_receive_skb_list(struct list_head *head)
 {
 	unsigned long noreclaim_flag = 0;
 	struct sk_buff *skb, *next;
 	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
+	int kept = 0;
 
 	list_for_each_entry_safe(skb, next, head, list) {
 		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
@@ -5019,7 +5028,7 @@ static void __netif_receive_skb_list(struct list_head *head)
 			/* Handle the previous sublist */
 			list_cut_before(&sublist, head, &skb->list);
 			if (!list_empty(&sublist))
-				__netif_receive_skb_list_core(&sublist, pfmemalloc);
+				kept += __netif_receive_skb_list_core(&sublist, pfmemalloc);
 			pfmemalloc = !pfmemalloc;
 			/* See comments in __netif_receive_skb */
 			if (pfmemalloc)
@@ -5030,10 +5039,11 @@ static void __netif_receive_skb_list(struct list_head *head)
 	}
 	/* Handle the remaining sublist */
 	if (!list_empty(head))
-		__netif_receive_skb_list_core(head, pfmemalloc);
+		kept += __netif_receive_skb_list_core(head, pfmemalloc);
 	/* Restore pflags */
 	if (pfmemalloc)
 		memalloc_noreclaim_restore(noreclaim_flag);
+	return kept;
 }
 
 static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
@@ -5109,17 +5119,20 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
 	return ret;
 }
 
-static void netif_receive_skb_list_internal(struct list_head *head)
+static int netif_receive_skb_list_internal(struct list_head *head)
 {
 	struct bpf_prog *xdp_prog = NULL;
 	struct sk_buff *skb, *next;
 	struct list_head sublist;
+	int kept = 0;
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
 		net_timestamp_check(netdev_tstamp_prequeue, skb);
 		list_del(&skb->list);
-		if (!skb_defer_rx_timestamp(skb))
+		if (skb_defer_rx_timestamp(skb))
+			kept++;
+		else
 			list_add_tail(&skb->list, &sublist);
 	}
 	list_splice_init(&sublist, head);
@@ -5149,13 +5162,15 @@ static void netif_receive_skb_list_internal(struct list_head *head)
 			if (cpu >= 0) {
 				/* Will be handled, remove from list */
 				list_del(&skb->list);
-				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+				if (enqueue_to_backlog(skb, cpu, &rflow->last_qtail) == NET_RX_SUCCESS)
+					kept++;
 			}
 		}
 	}
 #endif
-	__netif_receive_skb_list(head);
+	kept += __netif_receive_skb_list(head);
 	rcu_read_unlock();
+	return kept;
 }
 
 /**
@@ -5185,21 +5200,21 @@ EXPORT_SYMBOL(netif_receive_skb);
  *	netif_receive_skb_list - process many receive buffers from network
  *	@head: list of skbs to process.
  *
- *	Since return value of netif_receive_skb() is normally ignored, and
- *	wouldn't be meaningful for a list, this function returns void.
+ *	Returns the number of skbs for which netif_receive_skb() would have
+ *	returned %NET_RX_SUCCESS.
  *
  *	This function may only be called from softirq context and interrupts
  *	should be enabled.
  */
-void netif_receive_skb_list(struct list_head *head)
+int netif_receive_skb_list(struct list_head *head)
 {
 	struct sk_buff *skb;
 
 	if (list_empty(head))
-		return;
+		return 0;
 	list_for_each_entry(skb, head, list)
 		trace_netif_receive_skb_list_entry(skb);
-	netif_receive_skb_list_internal(head);
+	return netif_receive_skb_list_internal(head);
 }
 EXPORT_SYMBOL(netif_receive_skb_list);
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3196cf58f418..75cc5a6ef9b8 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -526,9 +526,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 		       ip_rcv_finish);
 }
 
-static void ip_sublist_rcv_finish(struct list_head *head)
+static int ip_sublist_rcv_finish(struct list_head *head)
 {
 	struct sk_buff *skb, *next;
+	int kept = 0;
 
 	list_for_each_entry_safe(skb, next, head, list) {
 		list_del(&skb->list);
@@ -536,16 +537,19 @@ static void ip_sublist_rcv_finish(struct list_head *head)
 		 * another kind of SKB-list usage (see validate_xmit_skb_list)
 		 */
 		skb->next = NULL;
-		dst_input(skb);
+		if (dst_input(skb) == NET_RX_SUCCESS)
+			kept++;
 	}
+	return kept;
 }
 
-static void ip_list_rcv_finish(struct net *net, struct sock *sk,
-			       struct list_head *head)
+static int ip_list_rcv_finish(struct net *net, struct sock *sk,
+			      struct list_head *head)
 {
 	struct dst_entry *curr_dst = NULL;
 	struct sk_buff *skb, *next;
 	struct list_head sublist;
+	int kept = 0;
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
@@ -556,8 +560,10 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 		 * skb to its handler for processing
 		 */
 		skb = l3mdev_ip_rcv(skb);
-		if (!skb)
+		if (!skb) {
+			kept++;
 			continue;
+		}
 		if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP)
 			continue;
 
@@ -565,7 +571,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 		if (curr_dst != dst) {
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
-				ip_sublist_rcv_finish(&sublist);
+				kept += ip_sublist_rcv_finish(&sublist);
 			/* start new sublist */
 			INIT_LIST_HEAD(&sublist);
 			curr_dst = dst;
@@ -573,25 +579,27 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 		list_add_tail(&skb->list, &sublist);
 	}
 	/* dispatch final sublist */
-	ip_sublist_rcv_finish(&sublist);
+	kept += ip_sublist_rcv_finish(&sublist);
+	return kept;
 }
 
-static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
-			   struct net *net)
+static int ip_sublist_rcv(struct list_head *head, struct net_device *dev,
+			  struct net *net)
 {
 	NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
 		     head, dev, NULL, ip_rcv_finish);
-	ip_list_rcv_finish(net, NULL, head);
+	return ip_list_rcv_finish(net, NULL, head);
 }
 
-/* Receive a list of IP packets */
-void ip_list_rcv(struct list_head *head, struct packet_type *pt,
-		 struct net_device *orig_dev)
+/* Receive a list of IP packets; return number of successful receives */
+int ip_list_rcv(struct list_head *head, struct packet_type *pt,
+		struct net_device *orig_dev)
 {
 	struct net_device *curr_dev = NULL;
 	struct net *curr_net = NULL;
 	struct sk_buff *skb, *next;
 	struct list_head sublist;
+	int kept = 0;
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
@@ -606,7 +614,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
 		if (curr_dev != dev || curr_net != net) {
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
-				ip_sublist_rcv(&sublist, curr_dev, curr_net);
+				kept += ip_sublist_rcv(&sublist, curr_dev, curr_net);
 			/* start new sublist */
 			INIT_LIST_HEAD(&sublist);
 			curr_dev = dev;
@@ -615,5 +623,6 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
 		list_add_tail(&skb->list, &sublist);
 	}
 	/* dispatch final sublist */
-	ip_sublist_rcv(&sublist, curr_dev, curr_net);
+	kept += ip_sublist_rcv(&sublist, curr_dev, curr_net);
+	return kept;
 }
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6242682be876..e64b830c9f0f 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -76,20 +76,24 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	return dst_input(skb);
 }
 
-static void ip6_sublist_rcv_finish(struct list_head *head)
+static int ip6_sublist_rcv_finish(struct list_head *head)
 {
 	struct sk_buff *skb, *next;
+	int kept = 0;
 
 	list_for_each_entry_safe(skb, next, head, list)
-		dst_input(skb);
+		if (dst_input(skb) == NET_RX_SUCCESS)
+			kept++;
+	return kept;
 }
 
-static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
-				struct list_head *head)
+static int ip6_list_rcv_finish(struct net *net, struct sock *sk,
+			       struct list_head *head)
 {
 	struct dst_entry *curr_dst = NULL;
 	struct sk_buff *skb, *next;
 	struct list_head sublist;
+	int kept = 0;
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
@@ -100,14 +104,16 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
 		 * skb to its handler for processing
 		 */
 		skb = l3mdev_ip6_rcv(skb);
-		if (!skb)
+		if (!skb) {
+			kept++;
 			continue;
+		}
 		ip6_rcv_finish_core(net, sk, skb);
 		dst = skb_dst(skb);
 		if (curr_dst != dst) {
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
-				ip6_sublist_rcv_finish(&sublist);
+				kept += ip6_sublist_rcv_finish(&sublist);
 			/* start new sublist */
 			INIT_LIST_HEAD(&sublist);
 			curr_dst = dst;
@@ -115,7 +121,8 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
 		list_add_tail(&skb->list, &sublist);
 	}
 	/* dispatch final sublist */
-	ip6_sublist_rcv_finish(&sublist);
+	kept += ip6_sublist_rcv_finish(&sublist);
+	return kept;
 }
 
 static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
@@ -273,22 +280,23 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 		       ip6_rcv_finish);
 }
 
-static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev,
-			    struct net *net)
+static int ip6_sublist_rcv(struct list_head *head, struct net_device *dev,
+			   struct net *net)
 {
 	NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL,
 		     head, dev, NULL, ip6_rcv_finish);
-	ip6_list_rcv_finish(net, NULL, head);
+	return ip6_list_rcv_finish(net, NULL, head);
 }
 
 /* Receive a list of IPv6 packets */
-void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
-		   struct net_device *orig_dev)
+int ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
+		  struct net_device *orig_dev)
 {
 	struct net_device *curr_dev = NULL;
 	struct net *curr_net = NULL;
 	struct sk_buff *skb, *next;
 	struct list_head sublist;
+	int kept = 0;
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
@@ -303,7 +311,7 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
 		if (curr_dev != dev || curr_net != net) {
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
-				ip6_sublist_rcv(&sublist, curr_dev, curr_net);
+				kept += ip6_sublist_rcv(&sublist, curr_dev, curr_net);
 			/* start new sublist */
 			INIT_LIST_HEAD(&sublist);
 			curr_dev = dev;
@@ -312,7 +320,8 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
 		list_add_tail(&skb->list, &sublist);
 	}
 	/* dispatch final sublist */
-	ip6_sublist_rcv(&sublist, curr_dev, curr_net);
+	kept += ip6_sublist_rcv(&sublist, curr_dev, curr_net);
+	return kept;
 }
 
 /*

^ permalink raw reply related

* [PATCH v2 net-next 4/4] net/core: handle GRO_NORMAL skbs as a list in napi_gro_receive_list
From: Edward Cree @ 2018-09-06 14:26 UTC (permalink / raw)
  To: davem; +Cc: linux-net-drivers, netdev
In-Reply-To: <c1e79c86-56ae-98c6-8dc0-c227f91ee9bc@solarflare.com>

Allows GRO-using drivers to get the benefits of batching for non-GROable
 traffic.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 net/core/dev.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 69e2819994e4..9a937d2ac83b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5617,6 +5617,7 @@ EXPORT_SYMBOL(napi_gro_receive);
 int napi_gro_receive_list(struct napi_struct *napi, struct list_head *head)
 {
 	struct sk_buff *skb, *next;
+	struct list_head sublist;
 	gro_result_t result;
 	int kept = 0;
 
@@ -5626,14 +5627,26 @@ int napi_gro_receive_list(struct napi_struct *napi, struct list_head *head)
 		skb_gro_reset_offset(skb);
 	}
 
+	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
 		list_del(&skb->list);
 		skb->next = NULL;
 		result = dev_gro_receive(napi, skb);
-		result = napi_skb_finish(result, skb);
-		if (result != GRO_DROP)
-			kept++;
+		if (result == GRO_NORMAL) {
+			list_add_tail(&skb->list, &sublist);
+			continue;
+		} else {
+			if (!list_empty(&sublist)) {
+				/* Handle the GRO_NORMAL skbs to prevent OoO */
+				kept += netif_receive_skb_list_internal(&sublist);
+				INIT_LIST_HEAD(&sublist);
+			}
+			result = napi_skb_finish(result, skb);
+			if (result != GRO_DROP)
+				kept++;
+		}
 	}
+	kept += netif_receive_skb_list_internal(&sublist);
 	return kept;
 }
 EXPORT_SYMBOL(napi_gro_receive_list);

^ permalink raw reply related

* Re: [PATCH net] net/ipv6: fix incorrect fib6 gateway info after do redirect
From: David Ahern @ 2018-09-06 14:35 UTC (permalink / raw)
  To: Hangbin Liu, netdev; +Cc: David S. Miller
In-Reply-To: <1536238666-5307-1-git-send-email-liuhangbin@gmail.com>

On 9/6/18 6:57 AM, Hangbin Liu wrote:
> When receive a redirect message and call rt6_do_redirect(), we allocate
> a new rt6_info and set new flags and gateway info, but not update these
> info to fib6_info.
> 
> Then if a user try to get the route info via `ip route get`, he will still
> get the old default gateway, because inet6_rtm_getroute() get gateway info
> from fib6_info.
> 
> Fixes: 23fb93a4d3f11 ("net/ipv6: Cleanup exception and cache route handling")
> Reported-by: Jianlin Shi <jishi@redhat.com>
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> ---
>  net/ipv6/route.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 18e00ce..3d367c9 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -3446,6 +3446,10 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
>  		goto out;
>  	}
>  
> +	/* Update fib6_info from rt6_info */
> +	from->fib6_flags = rt->rt6i_flags;
> +	from->fib6_nh.nh_gw = rt->rt6i_gateway;
> +
>  	netevent.old = &rt->dst;
>  	netevent.new = &nrt->dst;
>  	netevent.daddr = &msg->dest;
> 

Only an exception should be inserted - and it is. The original route
should not be updated.

The code prior to the fib6_info did not update the actual FIB entry, and
the IPv4 code does not update the original route.

^ permalink raw reply

* Re: KASAN: slab-out-of-bounds Read in _decode_session6
From: Dmitry Vyukov @ 2018-09-06 19:17 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric Dumazet, syzbot, Alexei Starovoitov, Daniel Borkmann,
	David Miller, Herbert Xu, Alexey Kuznetsov, LKML, netdev,
	Steffen Klassert, syzkaller-bugs, Hideaki YOSHIFUJI
In-Reply-To: <20180906172713.cxjoazoo7asqggb3@ast-mbp.dhcp.thefacebook.com>

On Thu, Sep 6, 2018 at 7:27 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Thu, Sep 06, 2018 at 12:00:26AM -0700, Eric Dumazet wrote:
>>
>>
>> On 09/05/2018 08:17 PM, syzbot wrote:
>> > syzbot has found a reproducer for the following crash on:
>> >
>> > HEAD commit:    b36fdc6853a3 Merge tag 'gpio-v4.19-2' of git://git.kernel...
>> > git tree:       upstream
>> > console output: https://syzkaller.appspot.com/x/log.txt?x=164938d1400000
>> > kernel config:  https://syzkaller.appspot.com/x/.config?x=4c7e83258d6e0156
>> > dashboard link: https://syzkaller.appspot.com/bug?extid=acffccec848dc13fe459
>> > compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
>> > syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=115f172e400000
>> > C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16399be1400000
>> >
>> > IMPORTANT: if you fix the bug, please add the following tag to the commit:
>> > Reported-by: syzbot+acffccec848dc13fe459@syzkaller.appspotmail.com
>> >
>> > IPv6: ADDRCONF(NETDEV_UP): veth1: link is not ready
>> > IPv6: ADDRCONF(NETDEV_CHANGE): veth1: link becomes ready
>> > IPv6: ADDRCONF(NETDEV_CHANGE): veth0: link becomes ready
>> > 8021q: adding VLAN 0 to HW filter on device team0
>> > ==================================================================
>> > BUG: KASAN: slab-out-of-bounds in _decode_session6+0x1331/0x14e0 net/ipv6/xfrm6_policy.c:161
>> > Read of size 1 at addr ffff8801d4a67f07 by task syz-executor092/4673
>> >
>> > CPU: 1 PID: 4673 Comm: syz-executor092 Not tainted 4.19.0-rc2+ #223
>> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
>> > Call Trace:
>> >  __dump_stack lib/dump_stack.c:77 [inline]
>> >  dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
>> >  print_address_description+0x6c/0x20b mm/kasan/report.c:256
>> >  kasan_report_error mm/kasan/report.c:354 [inline]
>> >  kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412
>> >  __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430
>> >  _decode_session6+0x1331/0x14e0 net/ipv6/xfrm6_policy.c:161
>> >  __xfrm_decode_session+0x71/0x140 net/xfrm/xfrm_policy.c:2299
>> >  xfrm_decode_session include/net/xfrm.h:1232 [inline]
>> >  vti6_tnl_xmit+0x3fc/0x1bb1 net/ipv6/ip6_vti.c:542
>> >  __netdev_start_xmit include/linux/netdevice.h:4287 [inline]
>> >  netdev_start_xmit include/linux/netdevice.h:4296 [inline]
>> >  xmit_one net/core/dev.c:3216 [inline]
>> >  dev_hard_start_xmit+0x272/0xc10 net/core/dev.c:3232
>> >  __dev_queue_xmit+0x2ab2/0x3870 net/core/dev.c:3802
>> >  dev_queue_xmit+0x17/0x20 net/core/dev.c:3835
>> >  __bpf_tx_skb net/core/filter.c:2012 [inline]
>> >  __bpf_redirect_common net/core/filter.c:2050 [inline]
>> >  __bpf_redirect+0x5b7/0xae0 net/core/filter.c:2057
>> >  ____bpf_clone_redirect net/core/filter.c:2090 [inline]
>> >  bpf_clone_redirect+0x2f6/0x490 net/core/filter.c:2062
>> >  bpf_prog_c39d1ba309a769f7+0xe9e/0x1000
>> >
>> > Allocated by task 4673:
>> >  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>> >  set_track mm/kasan/kasan.c:460 [inline]
>> >  kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
>> >  __do_kmalloc_node mm/slab.c:3682 [inline]
>> >  __kmalloc_node_track_caller+0x47/0x70 mm/slab.c:3696
>> >  __kmalloc_reserve.isra.41+0x3a/0xe0 net/core/skbuff.c:137
>> >  pskb_expand_head+0x230/0x10e0 net/core/skbuff.c:1463
>> >  skb_ensure_writable+0x3dd/0x640 net/core/skbuff.c:5129
>> >  __bpf_try_make_writable net/core/filter.c:1633 [inline]
>> >  bpf_try_make_writable net/core/filter.c:1639 [inline]
>> >  bpf_try_make_head_writable net/core/filter.c:1647 [inline]
>> >  ____bpf_clone_redirect net/core/filter.c:2084 [inline]
>> >  bpf_clone_redirect+0x14a/0x490 net/core/filter.c:2062
>> >  bpf_prog_c39d1ba309a769f7+0xe9e/0x1000
>> >
>> > Freed by task 3286:
>> >  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>> >  set_track mm/kasan/kasan.c:460 [inline]
>> >  __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
>> >  kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
>> >  __cache_free mm/slab.c:3498 [inline]
>> >  kfree+0xd9/0x210 mm/slab.c:3813
>> >  load_elf_binary+0x2569/0x5610 fs/binfmt_elf.c:1118
>> >  search_binary_handler+0x17d/0x570 fs/exec.c:1653
>> >  exec_binprm fs/exec.c:1695 [inline]
>> >  __do_execve_file.isra.35+0x15ff/0x2460 fs/exec.c:1819
>> >  do_execveat_common fs/exec.c:1866 [inline]
>> >  do_execve fs/exec.c:1883 [inline]
>> >  __do_sys_execve fs/exec.c:1964 [inline]
>> >  __se_sys_execve fs/exec.c:1959 [inline]
>> >  __x64_sys_execve+0x8f/0xc0 fs/exec.c:1959
>> >  do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
>> >  entry_SYSCALL_64_after_hwframe+0x49/0xbe
>> >
>> > The buggy address belongs to the object at ffff8801d4a67d00
>> >  which belongs to the cache kmalloc-512 of size 512
>> > The buggy address is located 7 bytes to the right of
>> >  512-byte region [ffff8801d4a67d00, ffff8801d4a67f00)
>> > The buggy address belongs to the page:
>> > page:ffffea00075299c0 count:1 mapcount:0 mapping:ffff8801dac00940 index:0x0
>> > flags: 0x2fffc0000000100(slab)
>> > raw: 02fffc0000000100 ffffea0007529988 ffffea0007529a48 ffff8801dac00940
>> > raw: 0000000000000000 ffff8801d4a67080 0000000100000006 0000000000000000
>> > page dumped because: kasan: bad access detected
>> >
>> > Memory state around the buggy address:
>> >  ffff8801d4a67e00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> >  ffff8801d4a67e80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> >> ffff8801d4a67f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>> >                    ^
>> >  ffff8801d4a67f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>> >  ffff8801d4a68000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>> > ==================================================================
>> >
>>
>>
>> What about :
>>
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index aecdeba052d3f0ff3d4f0a33ec36891f9738052c..a662f59786bd0677850c1c60a2c92faa6fb6c5bb 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -2081,7 +2081,7 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
>>          * here, we need to free the just generated clone to unclone once
>>          * again.
>>          */
>> -       ret = bpf_try_make_head_writable(skb);
>> +       ret = bpf_try_make_head_writable(clone);
>
> This part is fine. I think the bug is in _decode_session6,

Eric, you arrived to roughly the same conclusion, right?

> but I have a hard time reproducing the issue, so will appreciate
> if somebody can test the following patch:

syzbot can:
https://github.com/google/syzkaller/blob/master/docs/syzbot.md#testing-patches


> From 291f80f212461670d1e0140d06eee3071cf3e1ee Mon Sep 17 00:00:00 2001
> From: Alexei Starovoitov <ast@kernel.org>
> Date: Thu, 6 Sep 2018 10:23:29 -0700
> Subject: [PATCH] net/xfrm: fix out-of-bounds packet access
>
> BUG: KASAN: slab-out-of-bounds in _decode_session6+0x1331/0x14e0
> net/ipv6/xfrm6_policy.c:161
> Read of size 1 at addr ffff8801d882eec7 by task syz-executor1/6667
> Call Trace:
>   __dump_stack lib/dump_stack.c:77 [inline]
>   dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
>   print_address_description+0x6c/0x20b mm/kasan/report.c:256
>   kasan_report_error mm/kasan/report.c:354 [inline]
>   kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412
>   __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430
>   _decode_session6+0x1331/0x14e0 net/ipv6/xfrm6_policy.c:161
>   __xfrm_decode_session+0x71/0x140 net/xfrm/xfrm_policy.c:2299
>   xfrm_decode_session include/net/xfrm.h:1232 [inline]
>   vti6_tnl_xmit+0x3c3/0x1bc1 net/ipv6/ip6_vti.c:542
>   __netdev_start_xmit include/linux/netdevice.h:4313 [inline]
>   netdev_start_xmit include/linux/netdevice.h:4322 [inline]
>   xmit_one net/core/dev.c:3217 [inline]
>   dev_hard_start_xmit+0x272/0xc10 net/core/dev.c:3233
>   __dev_queue_xmit+0x2ab2/0x3870 net/core/dev.c:3803
>   dev_queue_xmit+0x17/0x20 net/core/dev.c:3836
>
> Reported-by: syzbot+acffccec848dc13fe459@syzkaller.appspotmail.com
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  net/ipv6/xfrm6_policy.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
> index ef3defaf43b9..d35bcf92969c 100644
> --- a/net/ipv6/xfrm6_policy.c
> +++ b/net/ipv6/xfrm6_policy.c
> @@ -146,8 +146,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
>         fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
>         fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
>
> -       while (nh + offset + 1 < skb->data ||
> -              pskb_may_pull(skb, nh + offset + 1 - skb->data)) {
> +       while (nh + offset + sizeof(*exthdr) < skb->data ||
> +              pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) {
>                 nh = skb_network_header(skb);
>                 exthdr = (struct ipv6_opt_hdr *)(nh + offset);
>
> --
> 2.17.1
>
> --
> You received this message because you are subscribed to the Google Groups "syzkaller-bugs" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to syzkaller-bugs+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/syzkaller-bugs/20180906172713.cxjoazoo7asqggb3%40ast-mbp.dhcp.thefacebook.com.
> For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply

* [PATCH] net/sock: move memory_allocated over to percpu_counter variables
From: Olof Johansson @ 2018-09-06 19:20 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller
  Cc: Neil Horman, Marcelo Ricardo Leitner, Vlad Yasevich, Herbert Xu,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, linux-crypto, linux-kernel,
	linux-sctp, netdev, linux-decnet-user, kernel-team,
	Olof Johansson

Today these are all global shared variables per protocol, and in
particular tcp_memory_allocated can get hot on a system with
large number of CPUs and a substantial number of connections.

Moving it over to a per-cpu variable makes it significantly cheaper,
and the added overhead when summing up the percpu copies is still smaller
than the cost of having a hot cacheline bouncing around.

Signed-off-by: Olof Johansson <olof@lixom.net>
---
 crypto/af_alg.c         | 10 ++++++++--
 include/net/sctp/sctp.h |  3 ++-
 include/net/sock.h      | 12 ++++++------
 include/net/tcp.h       |  2 +-
 include/net/udp.h       |  2 +-
 net/core/sock.c         |  5 ++++-
 net/decnet/af_decnet.c  |  3 ++-
 net/ipv4/tcp.c          |  3 ++-
 net/ipv4/udp.c          |  4 +++-
 net/sctp/protocol.c     |  6 ++++++
 net/sctp/socket.c       |  2 +-
 11 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index b053179e0bc5..1fd75a709d7b 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -29,7 +29,7 @@ struct alg_type_list {
 	struct list_head list;
 };
 
-static atomic_long_t alg_memory_allocated;
+static struct percpu_counter alg_memory_allocated;
 
 static struct proto alg_proto = {
 	.name			= "ALG",
@@ -1183,13 +1183,19 @@ static int __init af_alg_init(void)
 	if (err)
 		goto out;
 
-	err = sock_register(&alg_family);
+	err = percpu_counter_init(&alg_memory_allocated, 0, GFP_KERNEL);
 	if (err != 0)
 		goto out_unregister_proto;
 
+	err = sock_register(&alg_family);
+	if (err != 0)
+		goto out_free_percpu;
+
 out:
 	return err;
 
+out_free_percpu:
+	percpu_counter_destroy(&alg_memory_allocated);
 out_unregister_proto:
 	proto_unregister(&alg_proto);
 	goto out;
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 8c2caa370e0f..270579cf310b 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -36,7 +36,7 @@
  *    Sridhar Samudrala     <sri@us.ibm.com>
  *    Ardelle Fan           <ardelle.fan@intel.com>
  *    Ryan Layer            <rmlayer@us.ibm.com>
- *    Kevin Gao             <kevin.gao@intel.com> 
+ *    Kevin Gao             <kevin.gao@intel.com>
  */
 
 #ifndef __net_sctp_h__
@@ -114,6 +114,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock,
 void sctp_sock_rfree(struct sk_buff *skb);
 void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 		    struct sctp_association *asoc);
+extern struct percpu_counter sctp_memory_allocated;
 extern struct percpu_counter sctp_sockets_allocated;
 int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
 struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *);
diff --git a/include/net/sock.h b/include/net/sock.h
index 433f45fc2d68..45aed5e84b5d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1114,7 +1114,7 @@ struct proto {
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
 	void			(*leave_memory_pressure)(struct sock *sk);
-	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
+	struct percpu_counter	*memory_allocated;	/* Current allocated memory. */
 	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
 	/*
 	 * Pressure flag: try to collapse.
@@ -1237,19 +1237,19 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
-	return atomic_long_read(sk->sk_prot->memory_allocated);
+	return percpu_counter_sum_positive(sk->sk_prot->memory_allocated);
 }
 
-static inline long
+static inline void
 sk_memory_allocated_add(struct sock *sk, int amt)
 {
-	return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
+	percpu_counter_add(sk->sk_prot->memory_allocated, amt);
 }
 
 static inline void
 sk_memory_allocated_sub(struct sock *sk, int amt)
 {
-	atomic_long_sub(amt, sk->sk_prot->memory_allocated);
+	percpu_counter_sub(sk->sk_prot->memory_allocated, amt);
 }
 
 static inline void sk_sockets_allocated_dec(struct sock *sk)
@@ -1277,7 +1277,7 @@ proto_sockets_allocated_sum_positive(struct proto *prot)
 static inline long
 proto_memory_allocated(struct proto *prot)
 {
-	return atomic_long_read(prot->memory_allocated);
+	return percpu_counter_sum_positive(prot->memory_allocated);
 }
 
 static inline bool
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 770917d0caa7..2df1754cf3ab 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -248,7 +248,7 @@ extern long sysctl_tcp_mem[3];
 #define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
 #define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */
 
-extern atomic_long_t tcp_memory_allocated;
+extern struct percpu_counter tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
 extern unsigned long tcp_memory_pressure;
 
diff --git a/include/net/udp.h b/include/net/udp.h
index 8482a990b0bb..9e0d9f7091a0 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -97,7 +97,7 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 
 extern struct proto udp_prot;
 
-extern atomic_long_t udp_memory_allocated;
+extern struct percpu_counter udp_memory_allocated;
 
 /* sysctl variables for udp */
 extern long sysctl_udp_mem[3];
diff --git a/net/core/sock.c b/net/core/sock.c
index 3730eb855095..0a755f6c8942 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2394,9 +2394,12 @@ EXPORT_SYMBOL(sk_wait_data);
 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 {
 	struct proto *prot = sk->sk_prot;
-	long allocated = sk_memory_allocated_add(sk, amt);
+	long allocated;
 	bool charged = true;
 
+	sk_memory_allocated_add(sk, amt);
+	allocated = sk_memory_allocated(sk);
+
 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
 		goto suppress_allocation;
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 7d6ff983ba2c..f88af9ae4474 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -156,7 +156,7 @@ static const struct proto_ops dn_proto_ops;
 static DEFINE_RWLOCK(dn_hash_lock);
 static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
 static struct hlist_head dn_wild_sk;
-static atomic_long_t decnet_memory_allocated;
+static struct percpu_counter decnet_memory_allocated;
 
 static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags);
 static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags);
@@ -2356,6 +2356,7 @@ static int __init decnet_init(void)
 	int rc;
 
 	printk(banner);
+	percpu_counter_init(&decnet_memory_allocated, 0, GFP_KERNEL);
 
 	rc = proto_register(&dn_proto, 1);
 	if (rc != 0)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8c4235c098fd..eb6531ba6bd3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(tcp_orphan_count);
 long sysctl_tcp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_tcp_mem);
 
-atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
+struct percpu_counter tcp_memory_allocated;	/* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
 
 #if IS_ENABLED(CONFIG_SMC)
@@ -3834,6 +3834,7 @@ void __init tcp_init(void)
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
 		     FIELD_SIZEOF(struct sk_buff, cb));
 
+	percpu_counter_init(&tcp_memory_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
 	inet_hashinfo_init(&tcp_hashinfo);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f4e35b2ff8b8..6ec5d2f68ae7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -122,7 +122,7 @@ EXPORT_SYMBOL(udp_table);
 long sysctl_udp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_mem);
 
-atomic_long_t udp_memory_allocated;
+struct percpu_counter udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
 #define MAX_UDP_PORTS 65536
@@ -2923,6 +2923,8 @@ void __init udp_init(void)
 
 	__udp_sysctl_init(&init_net);
 
+	percpu_counter_init(&udp_memory_allocated, 0, GFP_KERNEL);
+
 	/* 16 spinlocks per cpu */
 	udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
 	udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index e948db29ab53..ca59ca0dc740 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1391,6 +1391,10 @@ static __init int sctp_init(void)
 	if (!sctp_chunk_cachep)
 		goto err_chunk_cachep;
 
+	status = percpu_counter_init(&sctp_memory_allocated, 0, GFP_KERNEL);
+	if (status)
+		goto err_percpu_memory_init;
+
 	status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL);
 	if (status)
 		goto err_percpu_counter_init;
@@ -1559,6 +1563,8 @@ static __init int sctp_init(void)
 err_ehash_alloc:
 	percpu_counter_destroy(&sctp_sockets_allocated);
 err_percpu_counter_init:
+	percpu_counter_destroy(&sctp_memory_allocated);
+err_percpu_memory_init:
 	kmem_cache_destroy(sctp_chunk_cachep);
 err_chunk_cachep:
 	kmem_cache_destroy(sctp_bucket_cachep);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f73e9d38d5ba..60d55573baa5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -107,7 +107,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 			      enum sctp_socket_type type);
 
 static unsigned long sctp_memory_pressure;
-static atomic_long_t sctp_memory_allocated;
+struct percpu_counter sctp_memory_allocated;
 struct percpu_counter sctp_sockets_allocated;
 
 static void sctp_enter_memory_pressure(struct sock *sk)
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH net-next] qed*: Utilize FW 8.37.7.0
From: David Miller @ 2018-09-06 14:44 UTC (permalink / raw)
  To: denis.bolotin; +Cc: netdev, ariel.elior
In-Reply-To: <20180905153555.2661-1-denis.bolotin@cavium.com>

From: Denis Bolotin <denis.bolotin@cavium.com>
Date: Wed, 5 Sep 2018 18:35:55 +0300

> This patch adds a new qed firmware with fixes and support for new features.
> 
> Fixes:
> - Fix a rare case of device crash with iWARP, iSCSI or FCoE offload.
> - Fix GRE tunneled traffic when iWARP offload is enabled.
> - Fix RoCE failure in ib_send_bw when using inline data.
> - Fix latency optimization flow for inline WQEs.
> - BigBear 100G fix
> 
> RDMA:
> - Reduce task context size.
> - Application page sizes above 2GB support.
> - Performance improvements.
> 
> ETH:
> - Tenant DCB support.
> - Replace RSS indirection table update interface.
> 
> Misc:
> - Debug Tools changes.
> 
> Signed-off-by: Denis Bolotin <denis.bolotin@cavium.com>
> Signed-off-by: Ariel Elior <ariel.elior@cavium.com>

Applied, thanks.

^ permalink raw reply

* [PATCH v4 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace
From: Arseny Maslennikov @ 2018-09-06 14:51 UTC (permalink / raw)
  To: linux-rdma; +Cc: Arseny Maslennikov, Doug Ledford, Jason Gunthorpe, netdev
In-Reply-To: <20180906145112.29245-1-ar@cs.msu.ru>

Some tools may currently be using only the deprecated attribute;
let's print an elaborate and clear deprecation notice to kmsg.

To do that, we have to replace the whole sysfs file, since we inherit
the original one from netdev.

Signed-off-by: Arseny Maslennikov <ar@cs.msu.ru>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 31 +++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 30f840f874b3..74732726ec6f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -2386,6 +2386,35 @@ int ipoib_add_pkey_attr(struct net_device *dev)
 	return device_create_file(&dev->dev, &dev_attr_pkey);
 }
 
+/*
+ * We erroneously exposed the iface's port number in the dev_id
+ * sysfs field long after dev_port was introduced for that purpose[1],
+ * and we need to stop everyone from relying on that.
+ * Let's override the show routine for the dev_id file here
+ * to gently bring the issue up.
+ *
+ * [1] https://www.spinics.net/lists/netdev/msg272123.html
+ */
+static ssize_t dev_id_show(struct device *dev,
+			   struct device_attribute *attr, char *buf)
+{
+	struct net_device *ndev = to_net_dev(dev);
+
+	if (ndev->dev_id == ndev->dev_port)
+		netdev_info_once(ndev,
+			"\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n",
+			current->comm);
+
+	return sprintf(buf, "%#x\n", ndev->dev_id);
+}
+static DEVICE_ATTR_RO(dev_id);
+
+int ipoib_intercept_dev_id_attr(struct net_device *dev)
+{
+	device_remove_file(&dev->dev, &dev_attr_dev_id);
+	return device_create_file(&dev->dev, &dev_attr_dev_id);
+}
+
 static struct net_device *ipoib_add_port(const char *format,
 					 struct ib_device *hca, u8 port)
 {
@@ -2427,6 +2456,8 @@ static struct net_device *ipoib_add_port(const char *format,
 	 */
 	ndev->priv_destructor = ipoib_intf_free;
 
+	if (ipoib_intercept_dev_id_attr(ndev))
+		goto sysfs_failed;
 	if (ipoib_cm_add_mode_attr(ndev))
 		goto sysfs_failed;
 	if (ipoib_add_pkey_attr(ndev))
-- 
2.19.0.rc2

^ permalink raw reply related

* [PATCH v4 0/3] IB/ipoib: Use dev_port to disambiguate port numbers
From: Arseny Maslennikov @ 2018-09-06 14:51 UTC (permalink / raw)
  To: linux-rdma; +Cc: Arseny Maslennikov, Doug Ledford, Jason Gunthorpe, netdev

Pre-3.15 userspace had trouble distinguishing different ports
of a NIC on a single PCI bus/device/function. To solve this,
a sysfs field `dev_port' was introduced quite a while ago
(commit v3.14-rc3-739-g3f85944fe207), and some relevant device
drivers were fixed to use it, but not in case of IPoIB.

The convention for some reason never got documented in the kernel, but
was immediately adopted by userspace (notably udev[1][2], biosdevname[3])

1/3 documents the sysfs field — that's why I'm CC-ing netdev.

This series was tested on and applies to 4.19-rc2.

[1] https://lists.freedesktop.org/archives/systemd-devel/2014-June/020788.html
[2] https://lists.freedesktop.org/archives/systemd-devel/2014-July/020804.html
[3] https://github.com/CloudAutomationNTools/biosdevname/blob/c795d51dd93a5309652f0d635f12a3ecfabfaa72/src/eths.c#L38

v1->v2: replace a line instead of inserting and then removing.
v2->v3: restore both attributes, output a notice of deprecation to kmsg.
v3->v4: style adjustments, join the deprecation notice to single line.

Arseny Maslennikov (3):
  Documentation/ABI: document /sys/class/net/*/dev_port
  IB/ipoib: Use dev_port to expose network interface port numbers
  IB/ipoib: Log sysfs 'dev_id' accesses from userspace

 Documentation/ABI/testing/sysfs-class-net | 18 +++++++++++++
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 33 +++++++++++++++++++++++
 2 files changed, 51 insertions(+)

-- 
2.19.0.rc2

^ permalink raw reply

* [PATCH v4 1/3] Documentation/ABI: document /sys/class/net/*/dev_port
From: Arseny Maslennikov @ 2018-09-06 14:51 UTC (permalink / raw)
  To: linux-rdma; +Cc: Arseny Maslennikov, Doug Ledford, Jason Gunthorpe, netdev
In-Reply-To: <20180906145112.29245-1-ar@cs.msu.ru>

The sysfs field was introduced 4 years ago along with fixes to various
drivers that erroneously used `dev_id' for that purpose, but it was not
properly documented anywhere.
See commit v3.14-rc3-739-g3f85944fe207.

Signed-off-by: Arseny Maslennikov <ar@cs.msu.ru>
---
 Documentation/ABI/testing/sysfs-class-net | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net
index 2f1788111cd9..ec2232f6a949 100644
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -91,6 +91,24 @@ Description:
 		stacked (e.g: VLAN interfaces) but still have the same MAC
 		address as their parent device.
 
+What:		/sys/class/net/<iface>/dev_port
+Date:		February 2014
+KernelVersion:	3.15
+Contact:	netdev@vger.kernel.org
+Description:
+		Indicates the port number of this network device, formatted
+		as a decimal value. Some NICs have multiple independent ports
+		on the same PCI bus, device and function. This attribute allows
+		userspace to distinguish the respective interfaces.
+
+		Note: some device drivers have been using 'dev_id' for this
+		purpose since long before 3.15 and have not adopted the new
+		attribute since. To query the port number, some tools look
+		exclusively at 'dev_port', while others only consult 'dev_id'.
+		If a network device has multiple client adapter ports as
+		described in the previous paragraph and does not set this
+		attribute to its port number, it's a kernel bug.
+
 What:		/sys/class/net/<iface>/dormant
 Date:		March 2006
 KernelVersion:	2.6.17
-- 
2.19.0.rc2

^ permalink raw reply related

* [PATCH v4 2/3] IB/ipoib: Use dev_port to expose network interface port numbers
From: Arseny Maslennikov @ 2018-09-06 14:51 UTC (permalink / raw)
  To: linux-rdma; +Cc: Arseny Maslennikov, Doug Ledford, Jason Gunthorpe, netdev
In-Reply-To: <20180906145112.29245-1-ar@cs.msu.ru>

Some InfiniBand network devices have multiple ports on the same PCI
function. This initializes the `dev_port' sysfs field of those
network interfaces with their port number.

Prior to this the kernel erroneously used the `dev_id' sysfs
field of those network interfaces to convey the port number to userspace.

The use of `dev_id' was considered correct until Linux 3.15,
when another field, `dev_port', was defined for this particular
purpose and `dev_id' was reserved for distinguishing stacked ifaces
(e.g: VLANs) with the same hardware address as their parent device.

Similar fixes to net/mlx4_en and many other drivers, which started
exporting this information through `dev_id' before 3.15, were accepted
into the kernel 4 years ago.
See 76a066f2a2a0 (`net/mlx4_en: Expose port number through sysfs').

Signed-off-by: Arseny Maslennikov <ar@cs.msu.ru>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index e3d28f9ad9c0..30f840f874b3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1880,6 +1880,8 @@ static int ipoib_parent_init(struct net_device *ndev)
 	       sizeof(union ib_gid));

 	SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent);
+	priv->dev->dev_port = priv->port - 1;
+	/* Let's set this one too for backwards compatibility. */
 	priv->dev->dev_id = priv->port - 1;

 	return 0;
-- 
2.19.0.rc2

^ permalink raw reply related

* Re: [PATCH net-next v3 0/5] net: dsa: b53: SerDes support
From: David Miller @ 2018-09-06 14:51 UTC (permalink / raw)
  To: f.fainelli; +Cc: netdev, andrew, vivien.didelot
In-Reply-To: <20180905194215.29301-1-f.fainelli@gmail.com>

From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed,  5 Sep 2018 12:42:10 -0700

> This patch series adds support for the SerDes found on NorthStar Plus
> (NSP) which allows us to use the SFP port on the BCM958625HR board (and
> other similar designs).
> 
> Changes in v3:
> 
> - properly hunk the request_threaded_irq() bits into patch #2
> 
> Changes in v2:
> 
> - migrate to threaded interrupt (Andrew)
> - fixed a case where MLO_AN_FIXED's mac_config would still call into
>   the serdes_config callback
> - added an additional check on the phylink interface in mac_config
> - default to ARCH_BCM_NSP instead of ARCH_BCM_IPROC which is really
>   the NSP Kconfig bit we want

Series applied, thanks Florian.

^ permalink raw reply

* Re: [pull request][net 00/10] Mellanox, mlx5 fixes 2018-09-05
From: David Miller @ 2018-09-06 14:57 UTC (permalink / raw)
  To: saeedm; +Cc: netdev
In-Reply-To: <20180906040952.29684-1-saeedm@mellanox.com>

From: Saeed Mahameed <saeedm@mellanox.com>
Date: Wed,  5 Sep 2018 21:09:42 -0700

> This pull request contains some fixes for mlx5 ethernet netdevice and
> core driver.

Pulled.

> 
> For -stable v4.9:
> ('net/mlx5: Fix debugfs cleanup in the device init/remove flow')
> 
> For -stable v4.12:
> ("net/mlx5: E-Switch, Fix memory leak when creating switchdev mode FDB tables")
> 
> For -stable v4.13:
> ("net/mlx5: Fix use-after-free in self-healing flow")
> 
> For -stable v4.14:
> ("net/mlx5: Check for error in mlx5_attach_interface")
> 
> For -stable v4.15:
> ("net/mlx5: Fix not releasing read lock when adding flow rules")
> 
> For -stable v4.17:
> ("net/mlx5: Fix possible deadlock from lockdep when adding fte to fg")
> 
> For -stable v4.18:
> ("net/mlx5: Use u16 for Work Queue buffer fragment size")

And will queue these up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH] net/sock: move memory_allocated over to percpu_counter variables
From: Eric Dumazet @ 2018-09-06 19:33 UTC (permalink / raw)
  To: Olof Johansson
  Cc: David Miller, Neil Horman, Marcelo Ricardo Leitner,
	Vladislav Yasevich, Herbert Xu, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, linux-crypto, LKML, linux-sctp, netdev,
	linux-decnet-user, kernel-team
In-Reply-To: <20180906192034.8467-1-olof@lixom.net>

On Thu, Sep 6, 2018 at 12:21 PM Olof Johansson <olof@lixom.net> wrote:
>
> Today these are all global shared variables per protocol, and in
> particular tcp_memory_allocated can get hot on a system with
> a large number of CPUs and a substantial number of connections.
>
> Moving it over to a per-cpu variable makes it significantly cheaper,
> and the added overhead when summing up the percpu copies is still smaller
> than the cost of having a hot cacheline bouncing around.

I am curious. We never noticed contention on this variable, at least for TCP.

Please share some numbers with us.

^ permalink raw reply

* Adding path to the photos
From: Leo Young @ 2018-09-06 10:41 UTC (permalink / raw)
  To: netdev

We provide image editing such as: image cutting out, retouching, masking
etc.

Here are the details what we can provide to your photos.
Jewelry retouching for your photos
Fashion retouching for your photos
Cutting out for your photos
Clipping path for your photos
Deep etch process for your photos
Image masking for your photos
Portrait retouching for your photos

We provide test editing for your photos.
let us know if interested.

Thanks,
Leo Young

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox