Netdev List
 help / color / mirror / Atom feed
* [PATCHv4 iproute2 2/2] lib/libnetlink: update rtnl_talk to support malloc buff at run time
From: Hangbin Liu @ 2017-09-28 13:33 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, Michal Kubecek, Phil Sutter, Hangbin Liu
In-Reply-To: <1506605626-1744-1-git-send-email-haliu@redhat.com>

From: Hangbin Liu <liuhangbin@gmail.com>

This is an update for 460c03f3f3cc ("iplink: double the buffer size also in
iplink_get()"). After update, we will not need to double the buffer size
every time when VFs number increased.

With call like rtnl_talk(&rth, &req.n, NULL, 0), we can simply remove the
length parameter.

With call like rtnl_talk(&rth, nlh, nlh, sizeof(req), I add a new variable
answer to avoid overwrite data in nlh, because it may has more info after
nlh. also this will avoid nlh buffer not enough issue.

We need to free answer after using.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Phil Sutter <phil@nwl.cc>
---
 bridge/fdb.c         |  2 +-
 bridge/link.c        |  2 +-
 bridge/mdb.c         |  2 +-
 bridge/vlan.c        |  2 +-
 genl/ctrl.c          | 19 ++++++++++++-------
 include/libnetlink.h |  6 +++---
 ip/ipaddress.c       |  4 ++--
 ip/ipaddrlabel.c     |  4 ++--
 ip/ipfou.c           |  4 ++--
 ip/ipila.c           |  4 ++--
 ip/ipl2tp.c          |  8 ++++----
 ip/iplink.c          | 38 +++++++++++++++++++-------------------
 ip/iplink_vrf.c      | 44 ++++++++++++++++++++------------------------
 ip/ipmacsec.c        |  2 +-
 ip/ipneigh.c         |  2 +-
 ip/ipnetns.c         | 23 ++++++++++++++---------
 ip/ipntable.c        |  2 +-
 ip/iproute.c         | 26 +++++++++++++++++---------
 ip/iprule.c          |  6 +++---
 ip/ipseg6.c          |  8 +++++---
 ip/iptoken.c         |  2 +-
 ip/link_gre.c        | 11 +++++++----
 ip/link_gre6.c       | 11 +++++++----
 ip/link_ip6tnl.c     | 11 +++++++----
 ip/link_iptnl.c      | 10 ++++++----
 ip/link_vti.c        | 11 +++++++----
 ip/link_vti6.c       | 11 +++++++----
 ip/tcp_metrics.c     |  8 +++++---
 ip/xfrm_policy.c     | 25 +++++++++++++------------
 ip/xfrm_state.c      | 30 ++++++++++++++++--------------
 lib/libgenl.c        |  9 +++++++--
 lib/libnetlink.c     | 24 +++++++++++-------------
 misc/ss.c            |  2 +-
 tc/m_action.c        | 12 ++++++------
 tc/tc_class.c        |  2 +-
 tc/tc_filter.c       |  8 +++++---
 tc/tc_qdisc.c        |  2 +-
 37 files changed, 220 insertions(+), 177 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index e5cebf9..807914f 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -535,7 +535,7 @@ static int fdb_modify(int cmd, int flags, int argc, char **argv)
 		return -1;
 	}
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		return -1;
 
 	return 0;
diff --git a/bridge/link.c b/bridge/link.c
index 93472ad..cc29a2a 100644
--- a/bridge/link.c
+++ b/bridge/link.c
@@ -426,7 +426,7 @@ static int brlink_modify(int argc, char **argv)
 		addattr_nest_end(&req.n, nest);
 	}
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		return -1;
 
 	return 0;
diff --git a/bridge/mdb.c b/bridge/mdb.c
index 748091b..f38e326 100644
--- a/bridge/mdb.c
+++ b/bridge/mdb.c
@@ -440,7 +440,7 @@ static int mdb_modify(int cmd, int flags, int argc, char **argv)
 	entry.vid = vid;
 	addattr_l(&req.n, sizeof(req), MDBA_SET_ENTRY, &entry, sizeof(entry));
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		return -1;
 
 	return 0;
diff --git a/bridge/vlan.c b/bridge/vlan.c
index ebcdace..5d68359 100644
--- a/bridge/vlan.c
+++ b/bridge/vlan.c
@@ -133,7 +133,7 @@ static int vlan_modify(int cmd, int argc, char **argv)
 
 	addattr_nest_end(&req.n, afspec);
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		return -1;
 
 	return 0;
diff --git a/genl/ctrl.c b/genl/ctrl.c
index 448988e..a6d31b0 100644
--- a/genl/ctrl.c
+++ b/genl/ctrl.c
@@ -55,6 +55,7 @@ int genl_ctrl_resolve_family(const char *family)
 	};
 	struct nlmsghdr *nlh = &req.n;
 	struct genlmsghdr *ghdr = &req.g;
+	struct nlmsghdr *answer = NULL;
 
 	if (rtnl_open_byproto(&rth, 0, NETLINK_GENERIC) < 0) {
 		fprintf(stderr, "Cannot open generic netlink socket\n");
@@ -63,19 +64,19 @@ int genl_ctrl_resolve_family(const char *family)
 
 	addattr_l(nlh, 128, CTRL_ATTR_FAMILY_NAME, family, strlen(family) + 1);
 
-	if (rtnl_talk(&rth, nlh, nlh, sizeof(req)) < 0) {
+	if (rtnl_talk(&rth, nlh, &answer) < 0) {
 		fprintf(stderr, "Error talking to the kernel\n");
 		goto errout;
 	}
 
 	{
 		struct rtattr *tb[CTRL_ATTR_MAX + 1];
-		int len = nlh->nlmsg_len;
+		int len = answer->nlmsg_len;
 		struct rtattr *attrs;
 
-		if (nlh->nlmsg_type !=  GENL_ID_CTRL) {
+		if (answer->nlmsg_type !=  GENL_ID_CTRL) {
 			fprintf(stderr, "Not a controller message, nlmsg_len=%d "
-				"nlmsg_type=0x%x\n", nlh->nlmsg_len, nlh->nlmsg_type);
+				"nlmsg_type=0x%x\n", answer->nlmsg_len, answer->nlmsg_type);
 			goto errout;
 		}
 
@@ -88,10 +89,11 @@ int genl_ctrl_resolve_family(const char *family)
 
 		if (len < 0) {
 			fprintf(stderr, "wrong controller message len %d\n", len);
+			free(answer);
 			return -1;
 		}
 
-		attrs = (struct rtattr *) ((char *) ghdr + GENL_HDRLEN);
+		attrs = (struct rtattr *) ((char *) answer + NLMSG_LENGTH(GENL_HDRLEN));
 		parse_rtattr(tb, CTRL_ATTR_MAX, attrs, len);
 
 		if (tb[CTRL_ATTR_FAMILY_ID] == NULL) {
@@ -103,6 +105,7 @@ int genl_ctrl_resolve_family(const char *family)
 	}
 
 errout:
+	free(answer);
 	rtnl_close(&rth);
 	return ret;
 }
@@ -299,6 +302,7 @@ static int ctrl_list(int cmd, int argc, char **argv)
 		.g.cmd = CTRL_CMD_GETFAMILY,
 	};
 	struct nlmsghdr *nlh = &req.n;
+	struct nlmsghdr *answer = NULL;
 
 	if (rtnl_open_byproto(&rth, 0, NETLINK_GENERIC) < 0) {
 		fprintf(stderr, "Cannot open generic netlink socket\n");
@@ -331,12 +335,12 @@ static int ctrl_list(int cmd, int argc, char **argv)
 			goto ctrl_done;
 		}
 
-		if (rtnl_talk(&rth, nlh, nlh, sizeof(req)) < 0) {
+		if (rtnl_talk(&rth, nlh, &answer) < 0) {
 			fprintf(stderr, "Error talking to the kernel\n");
 			goto ctrl_done;
 		}
 
-		if (print_ctrl2(NULL, nlh, (void *) stdout) < 0) {
+		if (print_ctrl2(NULL, answer, (void *) stdout) < 0) {
 			fprintf(stderr, "Dump terminated\n");
 			goto ctrl_done;
 		}
@@ -358,6 +362,7 @@ static int ctrl_list(int cmd, int argc, char **argv)
 
 	ret = 0;
 ctrl_done:
+	free(answer);
 	rtnl_close(&rth);
 	return ret;
 }
diff --git a/include/libnetlink.h b/include/libnetlink.h
index 69257f0..77b6260 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -93,13 +93,13 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 #define rtnl_dump_filter(rth, filter, arg) \
 	rtnl_dump_filter_nc(rth, filter, arg, 0)
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-	      struct nlmsghdr *answer, size_t len)
+	      struct nlmsghdr **answer)
 	__attribute__((warn_unused_result));
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-	      struct nlmsghdr *answer, size_t len, nl_ext_ack_fn_t errfn)
+	      struct nlmsghdr **answer, nl_ext_ack_fn_t errfn)
 	__attribute__((warn_unused_result));
 int rtnl_talk_suppress_rtnl_errmsg(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-				   struct nlmsghdr *answer, size_t len)
+				   struct nlmsghdr **answer)
 	__attribute__((warn_unused_result));
 int rtnl_send(struct rtnl_handle *rth, const void *buf, int)
 	__attribute__((warn_unused_result));
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 9797145..019dd1e 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -1824,7 +1824,7 @@ static int restore_handler(const struct sockaddr_nl *nl,
 
 	ll_init_map(&rth);
 
-	ret = rtnl_talk(&rth, n, n, sizeof(*n));
+	ret = rtnl_talk(&rth, n, NULL);
 	if ((ret < 0) && (errno == EEXIST))
 		ret = 0;
 
@@ -2520,7 +2520,7 @@ static int ipaddr_modify(int cmd, int flags, int argc, char **argv)
 		return -1;
 	}
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
diff --git a/ip/ipaddrlabel.c b/ip/ipaddrlabel.c
index 1d324da..6ea9bff 100644
--- a/ip/ipaddrlabel.c
+++ b/ip/ipaddrlabel.c
@@ -176,7 +176,7 @@ static int ipaddrlabel_modify(int cmd, int argc, char **argv)
 	if (req.ifal.ifal_family == AF_UNSPEC)
 		req.ifal.ifal_family = AF_INET6;
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
@@ -203,7 +203,7 @@ static int flush_addrlabel(const struct sockaddr_nl *who, struct nlmsghdr *n, vo
 		if (rtnl_open(&rth2, 0) < 0)
 			return -1;
 
-		if (rtnl_talk(&rth2, n, NULL, 0) < 0)
+		if (rtnl_talk(&rth2, n, NULL) < 0)
 			return -2;
 
 		rtnl_close(&rth2);
diff --git a/ip/ipfou.c b/ip/ipfou.c
index 00dbe15..23000dc 100644
--- a/ip/ipfou.c
+++ b/ip/ipfou.c
@@ -116,7 +116,7 @@ static int do_add(int argc, char **argv)
 
 	fou_parse_opt(argc, argv, &req.n, true);
 
-	if (rtnl_talk(&genl_rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&genl_rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
@@ -128,7 +128,7 @@ static int do_del(int argc, char **argv)
 
 	fou_parse_opt(argc, argv, &req.n, false);
 
-	if (rtnl_talk(&genl_rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&genl_rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
diff --git a/ip/ipila.c b/ip/ipila.c
index 843cc16..0403fc4 100644
--- a/ip/ipila.c
+++ b/ip/ipila.c
@@ -220,7 +220,7 @@ static int do_add(int argc, char **argv)
 
 	ila_parse_opt(argc, argv, &req.n, true);
 
-	if (rtnl_talk(&genl_rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&genl_rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
@@ -232,7 +232,7 @@ static int do_del(int argc, char **argv)
 
 	ila_parse_opt(argc, argv, &req.n, false);
 
-	if (rtnl_talk(&genl_rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&genl_rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
diff --git a/ip/ipl2tp.c b/ip/ipl2tp.c
index 88664c9..742adbe 100644
--- a/ip/ipl2tp.c
+++ b/ip/ipl2tp.c
@@ -129,7 +129,7 @@ static int create_tunnel(struct l2tp_parm *p)
 			addattr(&req.n, 1024, L2TP_ATTR_UDP_ZERO_CSUM6_RX);
 	}
 
-	if (rtnl_talk(&genl_rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&genl_rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
@@ -142,7 +142,7 @@ static int delete_tunnel(struct l2tp_parm *p)
 
 	addattr32(&req.n, 128, L2TP_ATTR_CONN_ID, p->tunnel_id);
 
-	if (rtnl_talk(&genl_rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&genl_rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
@@ -185,7 +185,7 @@ static int create_session(struct l2tp_parm *p)
 	if (p->ifname && p->ifname[0])
 		addattrstrz(&req.n, 1024, L2TP_ATTR_IFNAME, p->ifname);
 
-	if (rtnl_talk(&genl_rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&genl_rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
@@ -198,7 +198,7 @@ static int delete_session(struct l2tp_parm *p)
 
 	addattr32(&req.n, 1024, L2TP_ATTR_CONN_ID, p->tunnel_id);
 	addattr32(&req.n, 1024, L2TP_ATTR_SESSION_ID, p->session_id);
-	if (rtnl_talk(&genl_rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&genl_rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
diff --git a/ip/iplink.c b/ip/iplink.c
index ff5b56c..21a22cd 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -249,19 +249,26 @@ static int nl_get_ll_addr_len(unsigned int dev_index)
 			.ifi_index = dev_index,
 		}
 	};
+	struct nlmsghdr *answer;
 	struct rtattr *tb[IFLA_MAX+1];
 
-	if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0)
+	if (rtnl_talk(&rth, &req.n, &answer) < 0)
 		return -1;
 
-	len = req.n.nlmsg_len - NLMSG_LENGTH(sizeof(req.i));
-	if (len < 0)
+	len = answer->nlmsg_len - NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	if (len < 0) {
+		free(answer);
 		return -1;
+	}
 
-	parse_rtattr_flags(tb, IFLA_MAX, IFLA_RTA(&req.i), len, NLA_F_NESTED);
-	if (!tb[IFLA_ADDRESS])
+	parse_rtattr_flags(tb, IFLA_MAX, IFLA_RTA(NLMSG_DATA(answer)),
+			   len, NLA_F_NESTED);
+	if (!tb[IFLA_ADDRESS]) {
+		free(answer);
 		return -1;
+	}
 
+	free(answer);
 	return RTA_PAYLOAD(tb[IFLA_ADDRESS]);
 }
 
@@ -913,7 +920,7 @@ static int iplink_modify(int cmd, unsigned int flags, int argc, char **argv)
 
 			req.i.ifi_index = 0;
 			addattr32(&req.n, sizeof(req), IFLA_GROUP, group);
-			if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+			if (rtnl_talk(&rth, &req.n, NULL) < 0)
 				return -2;
 			return 0;
 		}
@@ -1008,7 +1015,7 @@ static int iplink_modify(int cmd, unsigned int flags, int argc, char **argv)
 		return -1;
 	}
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
@@ -1023,10 +1030,7 @@ int iplink_get(unsigned int flags, char *name, __u32 filt_mask)
 		.n.nlmsg_type = RTM_GETLINK,
 		.i.ifi_family = preferred_family,
 	};
-	struct {
-		struct nlmsghdr n;
-		char buf[32768];
-	} answer;
+	struct nlmsghdr *answer;
 
 	if (name) {
 		len = strlen(name) + 1;
@@ -1039,21 +1043,17 @@ int iplink_get(unsigned int flags, char *name, __u32 filt_mask)
 	}
 	addattr32(&req.n, sizeof(req), IFLA_EXT_MASK, filt_mask);
 
-	if (rtnl_talk(&rth, &req.n, &answer.n, sizeof(answer)) < 0)
-		return -2;
-	if (answer.n.nlmsg_len > sizeof(answer.buf)) {
-		fprintf(stderr, "Message truncated from %u to %lu\n",
-			answer.n.nlmsg_len, sizeof(answer.buf));
+	if (rtnl_talk(&rth, &req.n, &answer) < 0)
 		return -2;
-	}
 
 	open_json_object(NULL);
 	if (brief)
-		print_linkinfo_brief(NULL, &answer.n, stdout, NULL);
+		print_linkinfo_brief(NULL, answer, stdout, NULL);
 	else
-		print_linkinfo(NULL, &answer.n, stdout);
+		print_linkinfo(NULL, answer, stdout);
 	close_json_object();
 
+	free(answer);
 	return 0;
 }
 
diff --git a/ip/iplink_vrf.c b/ip/iplink_vrf.c
index 7a1bb5e..e9dd0df 100644
--- a/ip/iplink_vrf.c
+++ b/ip/iplink_vrf.c
@@ -119,10 +119,7 @@ __u32 ipvrf_get_table(const char *name)
 			.ifi_family  = preferred_family,
 		},
 	};
-	struct {
-		struct nlmsghdr n;
-		char buf[8192];
-	} answer;
+	struct nlmsghdr *answer;
 	struct rtattr *tb[IFLA_MAX+1];
 	struct rtattr *li[IFLA_INFO_MAX+1];
 	struct rtattr *vrf_attr[IFLA_VRF_MAX + 1];
@@ -132,8 +129,7 @@ __u32 ipvrf_get_table(const char *name)
 
 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, strlen(name) + 1);
 
-	if (rtnl_talk_suppress_rtnl_errmsg(&rth, &req.n,
-					   &answer.n, sizeof(answer)) < 0) {
+	if (rtnl_talk_suppress_rtnl_errmsg(&rth, &req.n, &answer) < 0) {
 		/* special case "default" vrf to be the main table */
 		if (errno == ENODEV && !strcmp(name, "default"))
 			if (rtnl_rttable_a2n(&tb_id, "main"))
@@ -143,25 +139,25 @@ __u32 ipvrf_get_table(const char *name)
 		return tb_id;
 	}
 
-	ifi = NLMSG_DATA(&answer.n);
-	len = answer.n.nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
+	ifi = NLMSG_DATA(answer);
+	len = answer->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
 	if (len < 0) {
 		fprintf(stderr, "BUG: Invalid response to link query.\n");
-		return 0;
+		goto out;
 	}
 
 	parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
 
 	if (!tb[IFLA_LINKINFO])
-		return 0;
+		goto out;
 
 	parse_rtattr_nested(li, IFLA_INFO_MAX, tb[IFLA_LINKINFO]);
 
 	if (!li[IFLA_INFO_KIND] || !li[IFLA_INFO_DATA])
-		return 0;
+		goto out;
 
 	if (strcmp(RTA_DATA(li[IFLA_INFO_KIND]), "vrf"))
-		return 0;
+		goto out;
 
 	parse_rtattr_nested(vrf_attr, IFLA_VRF_MAX, li[IFLA_INFO_DATA]);
 	if (vrf_attr[IFLA_VRF_TABLE])
@@ -170,6 +166,8 @@ __u32 ipvrf_get_table(const char *name)
 	if (!tb_id)
 		fprintf(stderr, "BUG: VRF %s is missing table id\n", name);
 
+out:
+	free(answer);
 	return tb_id;
 }
 
@@ -189,10 +187,7 @@ int name_is_vrf(const char *name)
 			.ifi_family  = preferred_family,
 		},
 	};
-	struct {
-		struct nlmsghdr n;
-		char buf[8192];
-	} answer;
+	struct nlmsghdr *answer;
 	struct rtattr *tb[IFLA_MAX+1];
 	struct rtattr *li[IFLA_INFO_MAX+1];
 	struct ifinfomsg *ifi;
@@ -200,29 +195,30 @@ int name_is_vrf(const char *name)
 
 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, name, strlen(name) + 1);
 
-	if (rtnl_talk_suppress_rtnl_errmsg(&rth, &req.n,
-					   &answer.n, sizeof(answer)) < 0)
+	if (rtnl_talk_suppress_rtnl_errmsg(&rth, &req.n, &answer) < 0)
 		return 0;
 
-	ifi = NLMSG_DATA(&answer.n);
-	len = answer.n.nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
+	ifi = NLMSG_DATA(answer);
+	len = answer->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
 	if (len < 0) {
 		fprintf(stderr, "BUG: Invalid response to link query.\n");
-		return 0;
+		goto out;
 	}
 
 	parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
 
 	if (!tb[IFLA_LINKINFO])
-		return 0;
+		goto out;
 
 	parse_rtattr_nested(li, IFLA_INFO_MAX, tb[IFLA_LINKINFO]);
 
 	if (!li[IFLA_INFO_KIND])
-		return 0;
+		goto out;
 
 	if (strcmp(RTA_DATA(li[IFLA_INFO_KIND]), "vrf"))
-		return 0;
+		goto out;
 
+out:
+	free(answer);
 	return ifi->ifi_index;
 }
diff --git a/ip/ipmacsec.c b/ip/ipmacsec.c
index ecc371a..345a707 100644
--- a/ip/ipmacsec.c
+++ b/ip/ipmacsec.c
@@ -421,7 +421,7 @@ static int do_modify_nl(enum cmd c, enum macsec_nl_commands cmd, int ifindex,
 	addattr_nest_end(&req.n, attr_sa);
 
 talk:
-	if (rtnl_talk(&genl_rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&genl_rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
diff --git a/ip/ipneigh.c b/ip/ipneigh.c
index 9c38a60..32f2d55 100644
--- a/ip/ipneigh.c
+++ b/ip/ipneigh.c
@@ -184,7 +184,7 @@ static int ipneigh_modify(int cmd, int flags, int argc, char **argv)
 		return -1;
 	}
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		exit(2);
 
 	return 0;
diff --git a/ip/ipnetns.c b/ip/ipnetns.c
index afb4978..bc70997 100644
--- a/ip/ipnetns.c
+++ b/ip/ipnetns.c
@@ -95,12 +95,13 @@ static int get_netnsid_from_name(const char *name)
 		struct nlmsghdr n;
 		struct rtgenmsg g;
 		char            buf[1024];
-	} answer, req = {
+	} req = {
 		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
 		.n.nlmsg_flags = NLM_F_REQUEST,
 		.n.nlmsg_type = RTM_GETNSID,
 		.g.rtgen_family = AF_UNSPEC,
 	};
+	struct nlmsghdr *answer;
 	struct rtattr *tb[NETNSA_MAX + 1];
 	struct rtgenmsg *rthdr;
 	int len, fd;
@@ -110,26 +111,30 @@ static int get_netnsid_from_name(const char *name)
 		return fd;
 
 	addattr32(&req.n, 1024, NETNSA_FD, fd);
-	if (rtnl_talk(&rtnsh, &req.n, &answer.n, sizeof(answer)) < 0) {
+	if (rtnl_talk(&rtnsh, &req.n, &answer) < 0) {
 		close(fd);
 		return -2;
 	}
 	close(fd);
 
 	/* Validate message and parse attributes */
-	if (answer.n.nlmsg_type == NLMSG_ERROR)
-		return -1;
+	if (answer->nlmsg_type == NLMSG_ERROR)
+		goto err_out;
 
-	rthdr = NLMSG_DATA(&answer.n);
-	len = answer.n.nlmsg_len - NLMSG_SPACE(sizeof(*rthdr));
+	rthdr = NLMSG_DATA(answer);
+	len = answer->nlmsg_len - NLMSG_SPACE(sizeof(*rthdr));
 	if (len < 0)
-		return -1;
+		goto err_out;
 
 	parse_rtattr(tb, NETNSA_MAX, NETNS_RTA(rthdr), len);
 
-	if (tb[NETNSA_NSID])
+	if (tb[NETNSA_NSID]) {
+		free(answer);
 		return rta_getattr_u32(tb[NETNSA_NSID]);
+	}
 
+err_out:
+	free(answer);
 	return -1;
 }
 
@@ -689,7 +694,7 @@ static int set_netnsid_from_name(const char *name, int nsid)
 
 	addattr32(&req.n, 1024, NETNSA_FD, fd);
 	addattr32(&req.n, 1024, NETNSA_NSID, nsid);
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		err = -2;
 
 	close(fd);
diff --git a/ip/ipntable.c b/ip/ipntable.c
index 88236ce..2f72c98 100644
--- a/ip/ipntable.c
+++ b/ip/ipntable.c
@@ -304,7 +304,7 @@ static int ipntable_modify(int cmd, int flags, int argc, char **argv)
 			  RTA_PAYLOAD(parms_rta));
 	}
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		exit(2);
 
 	return 0;
diff --git a/ip/iproute.c b/ip/iproute.c
index a8733f4..e45a5f2 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -1300,7 +1300,7 @@ static int iproute_modify(int cmd, unsigned int flags, int argc, char **argv)
 	if (!type_ok && req.r.rtm_family == AF_MPLS)
 		req.r.rtm_type = RTN_UNICAST;
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
@@ -1680,6 +1680,7 @@ static int iproute_get(int argc, char **argv)
 	};
 	char  *idev = NULL;
 	char  *odev = NULL;
+	struct nlmsghdr *answer;
 	int connected = 0;
 	int fib_match = 0;
 	int from_ok = 0;
@@ -1800,26 +1801,29 @@ static int iproute_get(int argc, char **argv)
 	if (fib_match)
 		req.r.rtm_flags |= RTM_F_FIB_MATCH;
 
-	if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0)
+	if (rtnl_talk(&rth, &req.n, &answer) < 0)
 		return -2;
 
 	if (connected && !from_ok) {
-		struct rtmsg *r = NLMSG_DATA(&req.n);
-		int len = req.n.nlmsg_len;
+		struct rtmsg *r = NLMSG_DATA(answer);
+		int len = answer->nlmsg_len;
 		struct rtattr *tb[RTA_MAX+1];
 
-		if (print_route(NULL, &req.n, (void *)stdout) < 0) {
+		if (print_route(NULL, answer, (void *)stdout) < 0) {
 			fprintf(stderr, "An error :-)\n");
+			free(answer);
 			return -1;
 		}
 
-		if (req.n.nlmsg_type != RTM_NEWROUTE) {
+		if (answer->nlmsg_type != RTM_NEWROUTE) {
 			fprintf(stderr, "Not a route?\n");
+			free(answer);
 			return -1;
 		}
 		len -= NLMSG_LENGTH(sizeof(*r));
 		if (len < 0) {
 			fprintf(stderr, "Wrong len %d\n", len);
+			free(answer);
 			return -1;
 		}
 
@@ -1830,6 +1834,7 @@ static int iproute_get(int argc, char **argv)
 			r->rtm_src_len = 8*RTA_PAYLOAD(tb[RTA_PREFSRC]);
 		} else if (!tb[RTA_SRC]) {
 			fprintf(stderr, "Failed to connect the route\n");
+			free(answer);
 			return -1;
 		}
 		if (!odev && tb[RTA_OIF])
@@ -1843,15 +1848,18 @@ static int iproute_get(int argc, char **argv)
 		req.n.nlmsg_flags = NLM_F_REQUEST;
 		req.n.nlmsg_type = RTM_GETROUTE;
 
-		if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0)
+		free(answer);
+		if (rtnl_talk(&rth, &req.n, &answer) < 0)
 			return -2;
 	}
 
-	if (print_route(NULL, &req.n, (void *)stdout) < 0) {
+	if (print_route(NULL, answer, (void *)stdout) < 0) {
 		fprintf(stderr, "An error :-)\n");
+		free(answer);
 		return -1;
 	}
 
+	free(answer);
 	return 0;
 }
 
@@ -1895,7 +1903,7 @@ restore:
 
 	ll_init_map(&rth);
 
-	ret = rtnl_talk(&rth, n, n, sizeof(*n));
+	ret = rtnl_talk(&rth, n, NULL);
 	if ((ret < 0) && (errno == EEXIST))
 		ret = 0;
 
diff --git a/ip/iprule.c b/ip/iprule.c
index 8313138..e64b4d7 100644
--- a/ip/iprule.c
+++ b/ip/iprule.c
@@ -393,7 +393,7 @@ static int flush_rule(const struct sockaddr_nl *who, struct nlmsghdr *n,
 		if (rtnl_open(&rth2, 0) < 0)
 			return -1;
 
-		if (rtnl_talk(&rth2, n, NULL, 0) < 0)
+		if (rtnl_talk(&rth2, n, NULL) < 0)
 			return -2;
 
 		rtnl_close(&rth2);
@@ -553,7 +553,7 @@ static int restore_handler(const struct sockaddr_nl *nl,
 
 	ll_init_map(&rth);
 
-	ret = rtnl_talk(&rth, n, n, sizeof(*n));
+	ret = rtnl_talk(&rth, n, NULL);
 	if ((ret < 0) && (errno == EEXIST))
 		ret = 0;
 
@@ -760,7 +760,7 @@ static int iprule_modify(int cmd, int argc, char **argv)
 	if (!table_ok && cmd == RTM_NEWRULE)
 		req.r.rtm_table = RT_TABLE_MAIN;
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
diff --git a/ip/ipseg6.c b/ip/ipseg6.c
index a8f5c69..461a3c1 100644
--- a/ip/ipseg6.c
+++ b/ip/ipseg6.c
@@ -125,6 +125,7 @@ static int process_msg(const struct sockaddr_nl *who, struct nlmsghdr *n,
 static int seg6_do_cmd(void)
 {
 	SEG6_REQUEST(req, 1024, opts.cmd, NLM_F_REQUEST);
+	struct nlmsghdr *answer;
 	int repl = 0, dump = 0;
 
 	if (genl_family < 0) {
@@ -163,15 +164,16 @@ static int seg6_do_cmd(void)
 	}
 
 	if (!repl && !dump) {
-		if (rtnl_talk(&grth, &req.n, NULL, 0) < 0)
+		if (rtnl_talk(&grth, &req.n, NULL) < 0)
 			return -1;
 	} else if (repl) {
-		if (rtnl_talk(&grth, &req.n, &req.n, sizeof(req)) < 0)
+		if (rtnl_talk(&grth, &req.n, &answer) < 0)
 			return -2;
-		if (process_msg(NULL, &req.n, stdout) < 0) {
+		if (process_msg(NULL, answer, stdout) < 0) {
 			fprintf(stderr, "Error parsing reply\n");
 			exit(1);
 		}
+		free(answer);
 	} else {
 		req.n.nlmsg_flags |= NLM_F_DUMP;
 		req.n.nlmsg_seq = grth.dump = ++grth.seq;
diff --git a/ip/iptoken.c b/ip/iptoken.c
index 1869f76..0528bad 100644
--- a/ip/iptoken.c
+++ b/ip/iptoken.c
@@ -166,7 +166,7 @@ static int iptoken_set(int argc, char **argv, bool delete)
 	addattr_nest_end(&req.n, afs6);
 	addattr_nest_end(&req.n, afs);
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		return -2;
 
 	return 0;
diff --git a/ip/link_gre.c b/ip/link_gre.c
index 9ea2970..35782ca 100644
--- a/ip/link_gre.c
+++ b/ip/link_gre.c
@@ -68,7 +68,6 @@ static int gre_parse_opt(struct link_util *lu, int argc, char **argv,
 	struct {
 		struct nlmsghdr n;
 		struct ifinfomsg i;
-		char buf[16384];
 	} req = {
 		.n.nlmsg_len = NLMSG_LENGTH(sizeof(*ifi)),
 		.n.nlmsg_flags = NLM_F_REQUEST,
@@ -76,6 +75,7 @@ static int gre_parse_opt(struct link_util *lu, int argc, char **argv,
 		.i.ifi_family = preferred_family,
 		.i.ifi_index = ifi->ifi_index,
 	};
+	struct nlmsghdr *answer = NULL;
 	struct rtattr *tb[IFLA_MAX + 1];
 	struct rtattr *linkinfo[IFLA_INFO_MAX+1];
 	struct rtattr *greinfo[IFLA_GRE_MAX + 1];
@@ -100,19 +100,20 @@ static int gre_parse_opt(struct link_util *lu, int argc, char **argv,
 	__u32 erspan_idx = 0;
 
 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
-		if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0) {
+		if (rtnl_talk(&rth, &req.n, &answer) < 0) {
 get_failed:
 			fprintf(stderr,
 				"Failed to get existing tunnel info.\n");
+			free(answer);
 			return -1;
 		}
 
-		len = req.n.nlmsg_len;
+		len = answer->nlmsg_len;
 		len -= NLMSG_LENGTH(sizeof(*ifi));
 		if (len < 0)
 			goto get_failed;
 
-		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(&req.i), len);
+		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(NLMSG_DATA(answer)), len);
 
 		if (!tb[IFLA_LINKINFO])
 			goto get_failed;
@@ -177,6 +178,8 @@ get_failed:
 
 		if (greinfo[IFLA_GRE_ERSPAN_INDEX])
 			erspan_idx = rta_getattr_u32(greinfo[IFLA_GRE_ERSPAN_INDEX]);
+
+		free(answer);
 	}
 
 	while (argc > 0) {
diff --git a/ip/link_gre6.c b/ip/link_gre6.c
index 7d07932..2eedec8 100644
--- a/ip/link_gre6.c
+++ b/ip/link_gre6.c
@@ -78,7 +78,6 @@ static int gre_parse_opt(struct link_util *lu, int argc, char **argv,
 	struct {
 		struct nlmsghdr n;
 		struct ifinfomsg i;
-		char buf[1024];
 	} req = {
 		.n.nlmsg_len = NLMSG_LENGTH(sizeof(*ifi)),
 		.n.nlmsg_flags = NLM_F_REQUEST,
@@ -86,6 +85,7 @@ static int gre_parse_opt(struct link_util *lu, int argc, char **argv,
 		.i.ifi_family = preferred_family,
 		.i.ifi_index = ifi->ifi_index,
 	};
+	struct nlmsghdr *answer = NULL;
 	struct rtattr *tb[IFLA_MAX + 1];
 	struct rtattr *linkinfo[IFLA_INFO_MAX+1];
 	struct rtattr *greinfo[IFLA_GRE_MAX + 1];
@@ -108,19 +108,20 @@ static int gre_parse_opt(struct link_util *lu, int argc, char **argv,
 	__u32 fwmark = 0;
 
 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
-		if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0) {
+		if (rtnl_talk(&rth, &req.n, &answer) < 0) {
 get_failed:
 			fprintf(stderr,
 				"Failed to get existing tunnel info.\n");
+			free(answer);
 			return -1;
 		}
 
-		len = req.n.nlmsg_len;
+		len = answer->nlmsg_len;
 		len -= NLMSG_LENGTH(sizeof(*ifi));
 		if (len < 0)
 			goto get_failed;
 
-		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(&req.i), len);
+		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(NLMSG_DATA(answer)), len);
 
 		if (!tb[IFLA_LINKINFO])
 			goto get_failed;
@@ -180,6 +181,8 @@ get_failed:
 
 		if (greinfo[IFLA_GRE_FWMARK])
 			fwmark = rta_getattr_u32(greinfo[IFLA_GRE_FWMARK]);
+
+		free(answer);
 	}
 
 	while (argc > 0) {
diff --git a/ip/link_ip6tnl.c b/ip/link_ip6tnl.c
index a419900..2f8c3f3 100644
--- a/ip/link_ip6tnl.c
+++ b/ip/link_ip6tnl.c
@@ -75,7 +75,6 @@ static int ip6tunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 	struct {
 		struct nlmsghdr n;
 		struct ifinfomsg i;
-		char buf[2048];
 	} req = {
 		.n.nlmsg_len = NLMSG_LENGTH(sizeof(*ifi)),
 		.n.nlmsg_flags = NLM_F_REQUEST,
@@ -83,6 +82,7 @@ static int ip6tunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 		.i.ifi_family = preferred_family,
 		.i.ifi_index = ifi->ifi_index,
 	};
+	struct nlmsghdr *answer = NULL;
 	struct rtattr *tb[IFLA_MAX + 1];
 	struct rtattr *linkinfo[IFLA_INFO_MAX+1];
 	struct rtattr *iptuninfo[IFLA_IPTUN_MAX + 1];
@@ -103,19 +103,20 @@ static int ip6tunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 	__u32 fwmark = 0;
 
 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
-		if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0) {
+		if (rtnl_talk(&rth, &req.n, &answer) < 0) {
 get_failed:
 			fprintf(stderr,
 				"Failed to get existing tunnel info.\n");
+			free(answer);
 			return -1;
 		}
 
-		len = req.n.nlmsg_len;
+		len = answer->nlmsg_len;
 		len -= NLMSG_LENGTH(sizeof(*ifi));
 		if (len < 0)
 			goto get_failed;
 
-		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(&req.i), len);
+		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(NLMSG_DATA(answer)), len);
 
 		if (!tb[IFLA_LINKINFO])
 			goto get_failed;
@@ -158,6 +159,8 @@ get_failed:
 
 		if (iptuninfo[IFLA_IPTUN_FWMARK])
 			fwmark = rta_getattr_u32(iptuninfo[IFLA_IPTUN_FWMARK]);
+
+		free(answer);
 	}
 
 	while (argc > 0) {
diff --git a/ip/link_iptnl.c b/ip/link_iptnl.c
index 6a725e9..4940b8b 100644
--- a/ip/link_iptnl.c
+++ b/ip/link_iptnl.c
@@ -76,7 +76,6 @@ static int iptunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 	struct {
 		struct nlmsghdr n;
 		struct ifinfomsg i;
-		char buf[2048];
 	} req = {
 		.n.nlmsg_len = NLMSG_LENGTH(sizeof(*ifi)),
 		.n.nlmsg_flags = NLM_F_REQUEST,
@@ -84,6 +83,7 @@ static int iptunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 		.i.ifi_family = preferred_family,
 		.i.ifi_index = ifi->ifi_index,
 	};
+	struct nlmsghdr *answer = NULL;
 	struct rtattr *tb[IFLA_MAX + 1];
 	struct rtattr *linkinfo[IFLA_INFO_MAX+1];
 	struct rtattr *iptuninfo[IFLA_IPTUN_MAX + 1];
@@ -108,19 +108,20 @@ static int iptunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 	__u32 fwmark = 0;
 
 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
-		if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0) {
+		if (rtnl_talk(&rth, &req.n, &answer) < 0) {
 get_failed:
 			fprintf(stderr,
 				"Failed to get existing tunnel info.\n");
+			free(answer);
 			return -1;
 		}
 
-		len = req.n.nlmsg_len;
+		len = answer->nlmsg_len;
 		len -= NLMSG_LENGTH(sizeof(*ifi));
 		if (len < 0)
 			goto get_failed;
 
-		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(&req.i), len);
+		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(NLMSG_DATA(answer)), len);
 
 		if (!tb[IFLA_LINKINFO])
 			goto get_failed;
@@ -188,6 +189,7 @@ get_failed:
 		if (iptuninfo[IFLA_IPTUN_FWMARK])
 			fwmark = rta_getattr_u32(iptuninfo[IFLA_IPTUN_FWMARK]);
 
+		free(answer);
 	}
 
 	while (argc > 0) {
diff --git a/ip/link_vti.c b/ip/link_vti.c
index 8bd4d90..07ac94e 100644
--- a/ip/link_vti.c
+++ b/ip/link_vti.c
@@ -53,7 +53,6 @@ static int vti_parse_opt(struct link_util *lu, int argc, char **argv,
 	struct {
 		struct nlmsghdr n;
 		struct ifinfomsg i;
-		char buf[1024];
 	} req = {
 		.n.nlmsg_len = NLMSG_LENGTH(sizeof(*ifi)),
 		.n.nlmsg_flags = NLM_F_REQUEST,
@@ -61,6 +60,7 @@ static int vti_parse_opt(struct link_util *lu, int argc, char **argv,
 		.i.ifi_family = preferred_family,
 		.i.ifi_index = ifi->ifi_index,
 	};
+	struct nlmsghdr *answer = NULL;
 	struct rtattr *tb[IFLA_MAX + 1];
 	struct rtattr *linkinfo[IFLA_INFO_MAX+1];
 	struct rtattr *vtiinfo[IFLA_VTI_MAX + 1];
@@ -73,19 +73,20 @@ static int vti_parse_opt(struct link_util *lu, int argc, char **argv,
 	int len;
 
 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
-		if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0) {
+		if (rtnl_talk(&rth, &req.n, &answer) < 0) {
 get_failed:
 			fprintf(stderr,
 				"Failed to get existing tunnel info.\n");
+			free(answer);
 			return -1;
 		}
 
-		len = req.n.nlmsg_len;
+		len = answer->nlmsg_len;
 		len -= NLMSG_LENGTH(sizeof(*ifi));
 		if (len < 0)
 			goto get_failed;
 
-		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(&req.i), len);
+		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(NLMSG_DATA(answer)), len);
 
 		if (!tb[IFLA_LINKINFO])
 			goto get_failed;
@@ -115,6 +116,8 @@ get_failed:
 
 		if (vtiinfo[IFLA_VTI_FWMARK])
 			fwmark = rta_getattr_u32(vtiinfo[IFLA_VTI_FWMARK]);
+
+		free(answer);
 	}
 
 	while (argc > 0) {
diff --git a/ip/link_vti6.c b/ip/link_vti6.c
index 8198d46..6d08bfe 100644
--- a/ip/link_vti6.c
+++ b/ip/link_vti6.c
@@ -48,7 +48,6 @@ static int vti6_parse_opt(struct link_util *lu, int argc, char **argv,
 	struct {
 		struct nlmsghdr n;
 		struct ifinfomsg i;
-		char buf[1024];
 	} req = {
 		.n.nlmsg_len = NLMSG_LENGTH(sizeof(*ifi)),
 		.n.nlmsg_flags = NLM_F_REQUEST,
@@ -56,6 +55,7 @@ static int vti6_parse_opt(struct link_util *lu, int argc, char **argv,
 		.i.ifi_family = preferred_family,
 		.i.ifi_index = ifi->ifi_index,
 	};
+	struct nlmsghdr *answer = NULL;
 	struct rtattr *tb[IFLA_MAX + 1];
 	struct rtattr *linkinfo[IFLA_INFO_MAX+1];
 	struct rtattr *vtiinfo[IFLA_VTI_MAX + 1];
@@ -68,19 +68,20 @@ static int vti6_parse_opt(struct link_util *lu, int argc, char **argv,
 	int len;
 
 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
-		if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0) {
+		if (rtnl_talk(&rth, &req.n, &answer) < 0) {
 get_failed:
 			fprintf(stderr,
 				"Failed to get existing tunnel info.\n");
+			free(answer);
 			return -1;
 		}
 
-		len = req.n.nlmsg_len;
+		len = answer->nlmsg_len;
 		len -= NLMSG_LENGTH(sizeof(*ifi));
 		if (len < 0)
 			goto get_failed;
 
-		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(&req.i), len);
+		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(NLMSG_DATA(answer)), len);
 
 		if (!tb[IFLA_LINKINFO])
 			goto get_failed;
@@ -110,6 +111,8 @@ get_failed:
 
 		if (vtiinfo[IFLA_VTI_FWMARK])
 			fwmark = rta_getattr_u32(vtiinfo[IFLA_VTI_FWMARK]);
+
+		free(answer);
 	}
 
 	while (argc > 0) {
diff --git a/ip/tcp_metrics.c b/ip/tcp_metrics.c
index 8972acd..3f9790e 100644
--- a/ip/tcp_metrics.c
+++ b/ip/tcp_metrics.c
@@ -306,6 +306,7 @@ static int process_msg(const struct sockaddr_nl *who, struct nlmsghdr *n,
 static int tcpm_do_cmd(int cmd, int argc, char **argv)
 {
 	TCPM_REQUEST(req, 1024, TCP_METRICS_CMD_GET, NLM_F_REQUEST);
+	struct nlmsghdr *answer;
 	int atype = -1, stype = -1;
 	int ack;
 
@@ -457,15 +458,16 @@ static int tcpm_do_cmd(int cmd, int argc, char **argv)
 	}
 
 	if (ack) {
-		if (rtnl_talk(&grth, &req.n, NULL, 0) < 0)
+		if (rtnl_talk(&grth, &req.n, NULL) < 0)
 			return -2;
 	} else if (atype >= 0) {
-		if (rtnl_talk(&grth, &req.n, &req.n, sizeof(req)) < 0)
+		if (rtnl_talk(&grth, &req.n, &answer) < 0)
 			return -2;
-		if (process_msg(NULL, &req.n, stdout) < 0) {
+		if (process_msg(NULL, answer, stdout) < 0) {
 			fprintf(stderr, "Dump terminated\n");
 			exit(1);
 		}
+		free(answer);
 	} else {
 		req.n.nlmsg_seq = grth.dump = ++grth.seq;
 		if (rtnl_send(&grth, &req, req.n.nlmsg_len) < 0) {
diff --git a/ip/xfrm_policy.c b/ip/xfrm_policy.c
index de689c4..98460a0 100644
--- a/ip/xfrm_policy.c
+++ b/ip/xfrm_policy.c
@@ -386,7 +386,7 @@ static int xfrm_policy_modify(int cmd, unsigned int flags, int argc, char **argv
 	if (req.xpinfo.sel.family == AF_UNSPEC)
 		req.xpinfo.sel.family = AF_INET;
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		exit(2);
 
 	rtnl_close(&rth);
@@ -548,7 +548,7 @@ int xfrm_policy_print(const struct sockaddr_nl *who, struct nlmsghdr *n,
 }
 
 static int xfrm_policy_get_or_delete(int argc, char **argv, int delete,
-				     void *res_nlbuf, size_t res_size)
+				     struct nlmsghdr **answer)
 {
 	struct rtnl_handle rth;
 	struct {
@@ -659,7 +659,7 @@ static int xfrm_policy_get_or_delete(int argc, char **argv, int delete,
 			  (void *)&ctx, ctx.sctx.len);
 	}
 
-	if (rtnl_talk(&rth, &req.n, res_nlbuf, res_size) < 0)
+	if (rtnl_talk(&rth, &req.n, answer) < 0)
 		exit(2);
 
 	rtnl_close(&rth);
@@ -669,21 +669,21 @@ static int xfrm_policy_get_or_delete(int argc, char **argv, int delete,
 
 static int xfrm_policy_delete(int argc, char **argv)
 {
-	return xfrm_policy_get_or_delete(argc, argv, 1, NULL, 0);
+	return xfrm_policy_get_or_delete(argc, argv, 1, NULL);
 }
 
 static int xfrm_policy_get(int argc, char **argv)
 {
-	char buf[NLMSG_BUF_SIZE] = {};
-	struct nlmsghdr *n = (struct nlmsghdr *)buf;
+	struct nlmsghdr *n = NULL;
 
-	xfrm_policy_get_or_delete(argc, argv, 0, n, sizeof(buf));
+	xfrm_policy_get_or_delete(argc, argv, 0, &n);
 
 	if (xfrm_policy_print(NULL, n, (void *)stdout) < 0) {
 		fprintf(stderr, "An error :-)\n");
 		exit(1);
 	}
 
+	free(n);
 	return 0;
 }
 
@@ -1049,7 +1049,7 @@ static int xfrm_spd_setinfo(int argc, char **argv)
 	if (rtnl_open_byproto(&rth, 0, NETLINK_XFRM) < 0)
 		exit(1);
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		exit(2);
 
 	rtnl_close(&rth);
@@ -1063,22 +1063,23 @@ static int xfrm_spd_getinfo(int argc, char **argv)
 	struct {
 		struct nlmsghdr			n;
 		__u32				flags;
-		char				ans[128];
 	} req = {
 		.n.nlmsg_len = NLMSG_LENGTH(sizeof(__u32)),
 		.n.nlmsg_flags = NLM_F_REQUEST,
 		.n.nlmsg_type = XFRM_MSG_GETSPDINFO,
 		.flags = 0XFFFFFFFF,
 	};
+	struct nlmsghdr *answer;
 
 	if (rtnl_open_byproto(&rth, 0, NETLINK_XFRM) < 0)
 		exit(1);
 
-	if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0)
+	if (rtnl_talk(&rth, &req.n, &answer) < 0)
 		exit(2);
 
-	print_spdinfo(&req.n, (void *)stdout);
+	print_spdinfo(answer, (void *)stdout);
 
+	free(answer);
 	rtnl_close(&rth);
 
 	return 0;
@@ -1123,7 +1124,7 @@ static int xfrm_policy_flush(int argc, char **argv)
 	if (show_stats > 1)
 		fprintf(stderr, "Flush policy\n");
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		exit(2);
 
 	rtnl_close(&rth);
diff --git a/ip/xfrm_state.c b/ip/xfrm_state.c
index 4483fb8..a3d4fd7 100644
--- a/ip/xfrm_state.c
+++ b/ip/xfrm_state.c
@@ -726,7 +726,7 @@ static int xfrm_state_modify(int cmd, unsigned int flags, int argc, char **argv)
 	if (req.xsinfo.family == AF_UNSPEC)
 		req.xsinfo.family = AF_INET;
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		exit(2);
 
 	rtnl_close(&rth);
@@ -757,8 +757,7 @@ static int xfrm_state_allocspi(int argc, char **argv)
 	char *minp = NULL;
 	char *maxp = NULL;
 	struct xfrm_mark mark = {0, 0};
-	char res_buf[NLMSG_BUF_SIZE] = {};
-	struct nlmsghdr *res_n = (struct nlmsghdr *)res_buf;
+	struct nlmsghdr *answer;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "mode") == 0) {
@@ -858,14 +857,15 @@ static int xfrm_state_allocspi(int argc, char **argv)
 		req.xspi.info.family = AF_INET;
 
 
-	if (rtnl_talk(&rth, &req.n, res_n, sizeof(res_buf)) < 0)
+	if (rtnl_talk(&rth, &req.n, &answer) < 0)
 		exit(2);
 
-	if (xfrm_state_print(NULL, res_n, (void *)stdout) < 0) {
+	if (xfrm_state_print(NULL, answer, (void *)stdout) < 0) {
 		fprintf(stderr, "An error :-)\n");
 		exit(1);
 	}
 
+	free(answer);
 	rtnl_close(&rth);
 
 	return 0;
@@ -1046,19 +1046,20 @@ static int xfrm_state_get_or_delete(int argc, char **argv, int delete)
 		req.xsid.family = AF_INET;
 
 	if (delete) {
-		if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+		if (rtnl_talk(&rth, &req.n, NULL) < 0)
 			exit(2);
 	} else {
-		char buf[NLMSG_BUF_SIZE] = {};
-		struct nlmsghdr *res_n = (struct nlmsghdr *)buf;
+		struct nlmsghdr *answer;
 
-		if (rtnl_talk(&rth, &req.n, res_n, sizeof(req)) < 0)
+		if (rtnl_talk(&rth, &req.n, &answer) < 0)
 			exit(2);
 
-		if (xfrm_state_print(NULL, res_n, (void *)stdout) < 0) {
+		if (xfrm_state_print(NULL, answer, (void *)stdout) < 0) {
 			fprintf(stderr, "An error :-)\n");
 			exit(1);
 		}
+
+		free(answer);
 	}
 
 	rtnl_close(&rth);
@@ -1314,22 +1315,23 @@ static int xfrm_sad_getinfo(int argc, char **argv)
 	struct {
 		struct nlmsghdr			n;
 		__u32				flags;
-		char				ans[64];
 	} req = {
 		.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.flags)),
 		.n.nlmsg_flags = NLM_F_REQUEST,
 		.n.nlmsg_type = XFRM_MSG_GETSADINFO,
 		.flags = 0XFFFFFFFF,
 	};
+	struct nlmsghdr *answer;
 
 	if (rtnl_open_byproto(&rth, 0, NETLINK_XFRM) < 0)
 		exit(1);
 
-	if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0)
+	if (rtnl_talk(&rth, &req.n, &answer) < 0)
 		exit(2);
 
-	print_sadinfo(&req.n, (void *)stdout);
+	print_sadinfo(answer, (void *)stdout);
 
+	free(answer);
 	rtnl_close(&rth);
 
 	return 0;
@@ -1376,7 +1378,7 @@ static int xfrm_state_flush(int argc, char **argv)
 		fprintf(stderr, "Flush state with XFRM-PROTO value \"%s\"\n",
 			strxf_xfrmproto(req.xsf.proto));
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		exit(2);
 
 	rtnl_close(&rth);
diff --git a/lib/libgenl.c b/lib/libgenl.c
index 50d2d92..bb5fbb5 100644
--- a/lib/libgenl.c
+++ b/lib/libgenl.c
@@ -49,16 +49,21 @@ int genl_resolve_family(struct rtnl_handle *grth, const char *family)
 {
 	GENL_REQUEST(req, 1024, GENL_ID_CTRL, 0, 0, CTRL_CMD_GETFAMILY,
 		     NLM_F_REQUEST);
+	struct nlmsghdr *answer;
+	int fnum;
 
 	addattr_l(&req.n, sizeof(req), CTRL_ATTR_FAMILY_NAME,
 		  family, strlen(family) + 1);
 
-	if (rtnl_talk(grth, &req.n, &req.n, sizeof(req)) < 0) {
+	if (rtnl_talk(grth, &req.n, &answer) < 0) {
 		fprintf(stderr, "Error talking to the kernel\n");
 		return -2;
 	}
 
-	return genl_parse_getfamily(&req.n);
+	fnum = genl_parse_getfamily(answer);
+	free(answer);
+
+	return fnum;
 }
 
 int genl_init_handle(struct rtnl_handle *grth, const char *family,
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 1847c0b..4618dc0 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -577,7 +577,7 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct nlmsgerr *err,
 }
 
 static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-		       struct nlmsghdr *answer, size_t maxlen,
+		       struct nlmsghdr **answer,
 		       bool show_rtnl_err, nl_ext_ack_fn_t errfn)
 {
 	int status;
@@ -651,9 +651,9 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 					fprintf(stderr, "ERROR truncated\n");
 				} else if (!err->error) {
 					if (answer)
-						memcpy(answer, h,
-						       MIN(maxlen, h->nlmsg_len));
-					free(buf);
+						*answer = (struct nlmsghdr *)buf;
+					else
+						free(buf);
 					return 0;
 				}
 
@@ -667,9 +667,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 			}
 
 			if (answer) {
-				memcpy(answer, h,
-				       MIN(maxlen, h->nlmsg_len));
-				free(buf);
+				*answer = (struct nlmsghdr *)buf;
 				return 0;
 			}
 
@@ -693,22 +691,22 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 }
 
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-	      struct nlmsghdr *answer, size_t maxlen)
+	      struct nlmsghdr **answer)
 {
-	return __rtnl_talk(rtnl, n, answer, maxlen, true, NULL);
+	return __rtnl_talk(rtnl, n, answer, true, NULL);
 }
 
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-		     struct nlmsghdr *answer, size_t maxlen,
+		     struct nlmsghdr **answer,
 		     nl_ext_ack_fn_t errfn)
 {
-	return __rtnl_talk(rtnl, n, answer, maxlen, true, errfn);
+	return __rtnl_talk(rtnl, n, answer, true, errfn);
 }
 
 int rtnl_talk_suppress_rtnl_errmsg(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-				   struct nlmsghdr *answer, size_t maxlen)
+				   struct nlmsghdr **answer)
 {
-	return __rtnl_talk(rtnl, n, answer, maxlen, false, NULL);
+	return __rtnl_talk(rtnl, n, answer, false, NULL);
 }
 
 int rtnl_listen_all_nsid(struct rtnl_handle *rth)
diff --git a/misc/ss.c b/misc/ss.c
index dd8dfaa..bf21d80 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -2597,7 +2597,7 @@ static int kill_inet_sock(struct nlmsghdr *h, void *arg, struct sockstat *s)
 		raw->sdiag_raw_protocol = s->raw_prot;
 	}
 
-	return rtnl_talk(rth, &req.nlh, NULL, 0);
+	return rtnl_talk(rth, &req.nlh, NULL);
 }
 
 static int show_one_inet_sock(const struct sockaddr_nl *addr,
diff --git a/tc/m_action.c b/tc/m_action.c
index 402228b..8cbf764 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -518,18 +518,18 @@ static int tc_action_gd(int cmd, unsigned int flags, int *argc_p, char ***argv_p
 	tail->rta_len = (void *) NLMSG_TAIL(&req.n) - (void *) tail;
 
 	req.n.nlmsg_seq = rth.dump = ++rth.seq;
-	if (cmd == RTM_GETACTION)
-		ans = &req.n;
 
-	if (rtnl_talk(&rth, &req.n, ans, MAX_MSG) < 0) {
+	if (rtnl_talk(&rth, &req.n, &ans) < 0) {
 		fprintf(stderr, "We have an error talking to the kernel\n");
 		return 1;
 	}
 
-	if (ans && print_action(NULL, &req.n, (void *)stdout) < 0) {
+	if (cmd == RTM_GETACTION && print_action(NULL, ans, stdout) < 0) {
 		fprintf(stderr, "Dump terminated\n");
+		free(ans);
 		return 1;
 	}
+	free(ans);
 
 	*argc_p = argc;
 	*argv_p = argv;
@@ -562,7 +562,7 @@ static int tc_action_modify(int cmd, unsigned int flags, int *argc_p, char ***ar
 	}
 	tail->rta_len = (void *) NLMSG_TAIL(&req.n) - (void *) tail;
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0) {
+	if (rtnl_talk(&rth, &req.n, NULL) < 0) {
 		fprintf(stderr, "We have an error talking to the kernel\n");
 		ret = -1;
 	}
@@ -653,7 +653,7 @@ static int tc_act_list_or_flush(int *argc_p, char ***argv_p, int event)
 		req.n.nlmsg_type = RTM_DELACTION;
 		req.n.nlmsg_flags |= NLM_F_ROOT;
 		req.n.nlmsg_flags |= NLM_F_REQUEST;
-		if (rtnl_talk(&rth, &req.n, NULL, 0) < 0) {
+		if (rtnl_talk(&rth, &req.n, NULL) < 0) {
 			fprintf(stderr, "We have an error flushing\n");
 			return 1;
 		}
diff --git a/tc/tc_class.c b/tc/tc_class.c
index 1a1f1fa..0214775 100644
--- a/tc/tc_class.c
+++ b/tc/tc_class.c
@@ -149,7 +149,7 @@ static int tc_class_modify(int cmd, unsigned int flags, int argc, char **argv)
 		}
 	}
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		return 2;
 
 	return 0;
diff --git a/tc/tc_filter.c b/tc/tc_filter.c
index cf290ae..8dbebf1 100644
--- a/tc/tc_filter.c
+++ b/tc/tc_filter.c
@@ -194,7 +194,7 @@ static int tc_filter_modify(int cmd, unsigned int flags, int argc, char **argv)
 		}
 	}
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0) {
+	if (rtnl_talk(&rth, &req.n, NULL) < 0) {
 		fprintf(stderr, "We have an error talking to the kernel\n");
 		return 2;
 	}
@@ -331,6 +331,7 @@ static int tc_filter_get(int cmd, unsigned int flags, int argc, char **argv)
 		.t.tcm_parent = TC_H_UNSPEC,
 		.t.tcm_family = AF_UNSPEC,
 	};
+	struct nlmsghdr *answer;
 	struct filter_util *q = NULL;
 	__u32 prio = 0;
 	__u32 protocol = 0;
@@ -484,13 +485,14 @@ static int tc_filter_get(int cmd, unsigned int flags, int argc, char **argv)
 		return -1;
 	}
 
-	if (rtnl_talk(&rth, &req.n, &req.n, MAX_MSG) < 0) {
+	if (rtnl_talk(&rth, &req.n, &answer) < 0) {
 		fprintf(stderr, "We have an error talking to the kernel\n");
 		return 2;
 	}
 
-	print_filter(NULL, &req.n, (void *)stdout);
+	print_filter(NULL, answer, (void *)stdout);
 
+	free(answer);
 	return 0;
 }
 
diff --git a/tc/tc_qdisc.c b/tc/tc_qdisc.c
index 1e9d909..c52114a 100644
--- a/tc/tc_qdisc.c
+++ b/tc/tc_qdisc.c
@@ -190,7 +190,7 @@ static int tc_qdisc_modify(int cmd, unsigned int flags, int argc, char **argv)
 		req.t.tcm_ifindex = idx;
 	}
 
-	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
 		return 2;
 
 	return 0;
-- 
2.5.5

^ permalink raw reply related

* [PATCH net] l2tp: fix l2tp_eth module loading
From: Guillaume Nault @ 2017-09-28 13:44 UTC (permalink / raw)
  To: netdev; +Cc: James Chapman, Tom Parkin

The l2tp_eth module crashes if its netlink callbacks are run when the
pernet data aren't initialised.

We should normally register_pernet_device() before the genl callbacks.
However, the pernet data only maintain a list of l2tpeth interfaces,
and this list is never used. So let's just drop pernet handling
instead.

Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
---
 net/l2tp/l2tp_eth.c | 51 ++-------------------------------------------------
 1 file changed, 2 insertions(+), 49 deletions(-)

diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 87da9ef61860..014a7bc2a872 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -44,7 +44,6 @@ struct l2tp_eth {
 	struct net_device	*dev;
 	struct sock		*tunnel_sock;
 	struct l2tp_session	*session;
-	struct list_head	list;
 	atomic_long_t		tx_bytes;
 	atomic_long_t		tx_packets;
 	atomic_long_t		tx_dropped;
@@ -58,17 +57,6 @@ struct l2tp_eth_sess {
 	struct net_device	*dev;
 };
 
-/* per-net private data for this module */
-static unsigned int l2tp_eth_net_id;
-struct l2tp_eth_net {
-	struct list_head l2tp_eth_dev_list;
-	spinlock_t l2tp_eth_lock;
-};
-
-static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net)
-{
-	return net_generic(net, l2tp_eth_net_id);
-}
 
 static int l2tp_eth_dev_init(struct net_device *dev)
 {
@@ -84,12 +72,6 @@ static int l2tp_eth_dev_init(struct net_device *dev)
 
 static void l2tp_eth_dev_uninit(struct net_device *dev)
 {
-	struct l2tp_eth *priv = netdev_priv(dev);
-	struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev));
-
-	spin_lock(&pn->l2tp_eth_lock);
-	list_del_init(&priv->list);
-	spin_unlock(&pn->l2tp_eth_lock);
 	dev_put(dev);
 }
 
@@ -273,7 +255,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
 	struct l2tp_eth *priv;
 	struct l2tp_eth_sess *spriv;
 	int rc;
-	struct l2tp_eth_net *pn;
 
 	if (cfg->ifname) {
 		strlcpy(name, cfg->ifname, IFNAMSIZ);
@@ -305,7 +286,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
 	priv = netdev_priv(dev);
 	priv->dev = dev;
 	priv->session = session;
-	INIT_LIST_HEAD(&priv->list);
 
 	priv->tunnel_sock = tunnel->sock;
 	session->recv_skb = l2tp_eth_dev_recv;
@@ -326,10 +306,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
 	strlcpy(session->ifname, dev->name, IFNAMSIZ);
 
 	dev_hold(dev);
-	pn = l2tp_eth_pernet(dev_net(dev));
-	spin_lock(&pn->l2tp_eth_lock);
-	list_add(&priv->list, &pn->l2tp_eth_dev_list);
-	spin_unlock(&pn->l2tp_eth_lock);
 
 	return 0;
 
@@ -342,22 +318,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
 	return rc;
 }
 
-static __net_init int l2tp_eth_init_net(struct net *net)
-{
-	struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id);
-
-	INIT_LIST_HEAD(&pn->l2tp_eth_dev_list);
-	spin_lock_init(&pn->l2tp_eth_lock);
-
-	return 0;
-}
-
-static struct pernet_operations l2tp_eth_net_ops = {
-	.init = l2tp_eth_init_net,
-	.id   = &l2tp_eth_net_id,
-	.size = sizeof(struct l2tp_eth_net),
-};
-
 
 static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = {
 	.session_create	= l2tp_eth_create,
@@ -371,25 +331,18 @@ static int __init l2tp_eth_init(void)
 
 	err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops);
 	if (err)
-		goto out;
-
-	err = register_pernet_device(&l2tp_eth_net_ops);
-	if (err)
-		goto out_unreg;
+		goto err;
 
 	pr_info("L2TP ethernet pseudowire support (L2TPv3)\n");
 
 	return 0;
 
-out_unreg:
-	l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH);
-out:
+err:
 	return err;
 }
 
 static void __exit l2tp_eth_exit(void)
 {
-	unregister_pernet_device(&l2tp_eth_net_ops);
 	l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH);
 }
 
-- 
2.14.2

^ permalink raw reply related

* [PATCH net 0/2] udp: fix early demux for mcast packets
From: Paolo Abeni @ 2017-09-28 13:51 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller

Currently the early demux callbacks do not perform source address validation.
This is not an issue for TCP or UDP unicast, where the early demux
is only allowed for connected sockets and the source address is validated
for the first packet and never change.

The UDP protocol currently allows early demux also for unconnected multicast
sockets, and we are not currently doing any validation for them, after that
the first packet lands on the socket: beyond ignoring the rp_filter - if 
enabled - any kind of martian sources are also allowed.

This series addresses the issue allowing the early demux callback to return an
error code, and performing the proper checks for unconnected UDP multicast
sockets before leveraging the rx dst cache.

Alternatively we could disable the early demux for unconnected mcast sockets,
but that would cause relevant performance regression - around 50% - while with
this series, with full rp_filter in place, we keep the regression to a more 
moderate level.

Paolo Abeni (2):
  IPv4: early demux can return an error code
  udp: perform source validation for mcast early demux

 include/net/protocol.h |  4 ++--
 include/net/route.h    |  4 +++-
 include/net/tcp.h      |  2 +-
 include/net/udp.h      |  2 +-
 net/ipv4/ip_input.c    | 25 +++++++++++++++----------
 net/ipv4/route.c       | 46 ++++++++++++++++++++++++++--------------------
 net/ipv4/tcp_ipv4.c    |  9 +++++----
 net/ipv4/udp.c         | 24 ++++++++++++++++++------
 8 files changed, 71 insertions(+), 45 deletions(-)

-- 
2.13.5

^ permalink raw reply

* [PATCH net 1/2] IPv4: early demux can return an error code
From: Paolo Abeni @ 2017-09-28 13:51 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller
In-Reply-To: <cover.1506606381.git.pabeni@redhat.com>

Currently no error is emitted, but this infrastructure will
used by the next patch to allow source address validation
for mcast sockets.
Since early demux can do a route lookup and an ipv4 route
lookup can return an error code this is consistent with the
current ipv4 route infrastructure.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
This same patch is also present in the RFC series 
"udp: full early demux for unconnected sockets", I'll drop
it from such series on an eventual next re-submission.
---
 include/net/protocol.h |  4 ++--
 include/net/tcp.h      |  2 +-
 include/net/udp.h      |  2 +-
 net/ipv4/ip_input.c    | 25 +++++++++++++++----------
 net/ipv4/tcp_ipv4.c    |  9 +++++----
 net/ipv4/udp.c         | 11 ++++++-----
 6 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 65ba335b0e7e..4fc75f7ae23b 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -39,8 +39,8 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-	void			(*early_demux)(struct sk_buff *skb);
-	void                    (*early_demux_handler)(struct sk_buff *skb);
+	int			(*early_demux)(struct sk_buff *skb);
+	int			(*early_demux_handler)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	unsigned int		no_policy:1,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3bc910a9bfc6..89974c5286d8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -345,7 +345,7 @@ void tcp_v4_err(struct sk_buff *skb, u32);
 
 void tcp_shutdown(struct sock *sk, int how);
 
-void tcp_v4_early_demux(struct sk_buff *skb);
+int tcp_v4_early_demux(struct sk_buff *skb);
 int tcp_v4_rcv(struct sk_buff *skb);
 
 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
diff --git a/include/net/udp.h b/include/net/udp.h
index 12dfbfe2e2d7..6c759c8594e2 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -259,7 +259,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
 	return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err);
 }
 
-void udp_v4_early_demux(struct sk_buff *skb);
+int udp_v4_early_demux(struct sk_buff *skb);
 bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
 int udp_get_port(struct sock *sk, unsigned short snum,
 		 int (*saddr_cmp)(const struct sock *,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index fa2dc8f692c6..57fc13c6ab2b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -311,9 +311,10 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
 static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	struct rtable *rt;
+	int (*edemux)(struct sk_buff *skb);
 	struct net_device *dev = skb->dev;
-	void (*edemux)(struct sk_buff *skb);
+	struct rtable *rt;
+	int err;
 
 	/* if ingress device is enslaved to an L3 master device pass the
 	 * skb to its handler for processing
@@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 
 		ipprot = rcu_dereference(inet_protos[protocol]);
 		if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
-			edemux(skb);
+			err = edemux(skb);
+			if (unlikely(err))
+				goto drop_error;
 			/* must reload iph, skb->head might have changed */
 			iph = ip_hdr(skb);
 		}
@@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (!skb_valid_dst(skb)) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, dev);
-		if (unlikely(err)) {
-			if (err == -EXDEV)
-				__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
-			goto drop;
-		}
+		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					   iph->tos, dev);
+		if (unlikely(err))
+			goto drop_error;
 	}
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
@@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 drop:
 	kfree_skb(skb);
 	return NET_RX_DROP;
+
+drop_error:
+	if (err == -EXDEV)
+		__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
+	goto drop;
 }
 
 /*
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d9416b5162bc..85164d4d3e53 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1503,23 +1503,23 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
-void tcp_v4_early_demux(struct sk_buff *skb)
+int tcp_v4_early_demux(struct sk_buff *skb)
 {
 	const struct iphdr *iph;
 	const struct tcphdr *th;
 	struct sock *sk;
 
 	if (skb->pkt_type != PACKET_HOST)
-		return;
+		return 0;
 
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
-		return;
+		return 0;
 
 	iph = ip_hdr(skb);
 	th = tcp_hdr(skb);
 
 	if (th->doff < sizeof(struct tcphdr) / 4)
-		return;
+		return 0;
 
 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
 				       iph->saddr, th->source,
@@ -1538,6 +1538,7 @@ void tcp_v4_early_demux(struct sk_buff *skb)
 				skb_dst_set_noref(skb, dst);
 		}
 	}
+	return 0;
 }
 
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ef29df8648e4..9b30f821fe96 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2221,7 +2221,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
 	return NULL;
 }
 
-void udp_v4_early_demux(struct sk_buff *skb)
+int udp_v4_early_demux(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph;
@@ -2234,7 +2234,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
 
 	/* validate the packet */
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
-		return;
+		return 0;
 
 	iph = ip_hdr(skb);
 	uh = udp_hdr(skb);
@@ -2244,14 +2244,14 @@ void udp_v4_early_demux(struct sk_buff *skb)
 		struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 
 		if (!in_dev)
-			return;
+			return 0;
 
 		/* we are supposed to accept bcast packets */
 		if (skb->pkt_type == PACKET_MULTICAST) {
 			ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
 					       iph->protocol);
 			if (!ours)
-				return;
+				return 0;
 		}
 
 		sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
@@ -2263,7 +2263,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
 	}
 
 	if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
-		return;
+		return 0;
 
 	skb->sk = sk;
 	skb->destructor = sock_efree;
@@ -2278,6 +2278,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
 		 */
 		skb_dst_set_noref(skb, dst);
 	}
+	return 0;
 }
 
 int udp_rcv(struct sk_buff *skb)
-- 
2.13.5

^ permalink raw reply related

* [PATCH net 2/2] udp: perform source validation for mcast early demux
From: Paolo Abeni @ 2017-09-28 13:51 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller
In-Reply-To: <cover.1506606381.git.pabeni@redhat.com>

The UDP early demux can leverate the rx dst cache even for
multicast unconnected sockets.

In such scenario the ipv4 source address is validated only on
the first packet in the given flow. After that, when we fetch
the dst entry  from the socket rx cache, we stop enforcing
the rp_filter and we even start accepting any kind of martian
addresses.

Disabling the dst cache for unconnected multicast socket will
cause large performace regression, nearly reducing by half the
max ingress tput.

Instead we factor out a route helper to completely validate an
skb source address for multicast packets and we call it from
the UDP early demux for mcast packets landing on unconnected
sockets, after successful fetching the related cached dst entry.

This still gives a measurable, but limited performance
regression:

		rp_filter = 0		rp_filter = 1
edmux disabled:	1182 Kpps		1127 Kpps
edmux before:	2238 Kpps		2238 Kpps
edmux after:	2037 Kpps		2019 Kpps

The above figures are on top of current net tree.
Applying the net-next commit 6e617de84e87 ("net: avoid a full
fib lookup when rp_filter is disabled.") the delta with
rp_filter == 0 will decrease even more.

Fixes: 421b3885bf6d ("udp: ipv4: Add udp early demux")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/route.h |  4 +++-
 net/ipv4/route.c    | 46 ++++++++++++++++++++++++++--------------------
 net/ipv4/udp.c      | 13 ++++++++++++-
 3 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 57dfc6850d37..d538e6db1afe 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -175,7 +175,9 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
 	fl4->fl4_gre_key = gre_key;
 	return ip_route_output_key(net, fl4);
 }
-
+int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			  u8 tos, struct net_device *dev,
+			  struct in_device *in_dev, u32 *itag);
 int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
 			 u8 tos, struct net_device *devin);
 int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 94d4cd2d5ea4..ac6fde5d45f1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1520,43 +1520,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 EXPORT_SYMBOL(rt_dst_alloc);
 
 /* called in rcu_read_lock() section */
-static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-				u8 tos, struct net_device *dev, int our)
+int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			  u8 tos, struct net_device *dev,
+			  struct in_device *in_dev, u32 *itag)
 {
-	struct rtable *rth;
-	struct in_device *in_dev = __in_dev_get_rcu(dev);
-	unsigned int flags = RTCF_MULTICAST;
-	u32 itag = 0;
 	int err;
 
 	/* Primary sanity checks. */
-
 	if (!in_dev)
 		return -EINVAL;
 
 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
 	    skb->protocol != htons(ETH_P_IP))
-		goto e_inval;
+		return -EINVAL;
 
 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
-		goto e_inval;
+		return -EINVAL;
 
 	if (ipv4_is_zeronet(saddr)) {
 		if (!ipv4_is_local_multicast(daddr))
-			goto e_inval;
+			return -EINVAL;
 	} else {
 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
-					  in_dev, &itag);
+					  in_dev, itag);
 		if (err < 0)
-			goto e_err;
+			return err;
 	}
+	return 0;
+}
+
+/* called in rcu_read_lock() section */
+static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			     u8 tos, struct net_device *dev, int our)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	unsigned int flags = RTCF_MULTICAST;
+	struct rtable *rth;
+	u32 itag = 0;
+	int err;
+
+	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
+	if (err)
+		return err;
+
 	if (our)
 		flags |= RTCF_LOCAL;
 
 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
 	if (!rth)
-		goto e_nobufs;
+		return -ENOBUFS;
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	rth->dst.tclassid = itag;
@@ -1572,13 +1585,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
 	skb_dst_set(skb, &rth->dst);
 	return 0;
-
-e_nobufs:
-	return -ENOBUFS;
-e_inval:
-	return -EINVAL;
-e_err:
-	return err;
 }
 
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9b30f821fe96..5676237d2b0f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2224,6 +2224,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
 int udp_v4_early_demux(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
+	struct in_device *in_dev = NULL;
 	const struct iphdr *iph;
 	const struct udphdr *uh;
 	struct sock *sk = NULL;
@@ -2241,7 +2242,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
 
 	if (skb->pkt_type == PACKET_BROADCAST ||
 	    skb->pkt_type == PACKET_MULTICAST) {
-		struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+		in_dev = __in_dev_get_rcu(skb->dev);
 
 		if (!in_dev)
 			return 0;
@@ -2272,11 +2273,21 @@ int udp_v4_early_demux(struct sk_buff *skb)
 	if (dst)
 		dst = dst_check(dst, 0);
 	if (dst) {
+		u32 itag = 0;
+
 		/* set noref for now.
 		 * any place which wants to hold dst has to call
 		 * dst_hold_safe()
 		 */
 		skb_dst_set_noref(skb, dst);
+
+		/* for unconnected multicast sockets we need to validate
+		 * the source on each packet
+		 */
+		if (!inet_sk(sk)->inet_daddr && in_dev)
+			return ip_mc_validate_source(skb, iph->daddr,
+						     iph->saddr, iph->tos,
+						     skb->dev, in_dev, &itag);
 	}
 	return 0;
 }
-- 
2.13.5

^ permalink raw reply related

* Re: [PATCH net-next 2/2] tools: bpf: add bpftool
From: Jakub Kicinski @ 2017-09-28 13:59 UTC (permalink / raw)
  To: davem; +Cc: netdev, daniel, alexei.starovoitov, hannes, dsahern, oss-drivers
In-Reply-To: <20170926153522.31500-3-jakub.kicinski@netronome.com>

On Tue, 26 Sep 2017 08:35:22 -0700, Jakub Kicinski wrote:
> Add a simple tool for querying and updating BPF objects on the system.
> 
> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> Reviewed-by: Simon Horman <simon.horman@netronome.com>

Dave, I got some late review nitpicks internally which I think are
worth addressing.  Would you mind dropping this series?  I will post v2
with the cleanups and README included.

^ permalink raw reply

* Re: [PATCH v4 1/2] expose stack entry function
From: Nicolas Dichtel @ 2017-09-28 14:11 UTC (permalink / raw)
  To: Amine Kherbouche, netdev, xeb, roopa; +Cc: equinox
In-Reply-To: <d34f8440be3e1d735c694c6899c5a8b14e9a70ea.1506590878.git.amine.kherbouche@6wind.com>

Le 28/09/2017 à 11:34, Amine Kherbouche a écrit :
> Exporting mpls_forward() function to be able to be called from elsewhere
> such as MPLSoverGRE in the next commit.

I'm nitpicking, but the commit title is too generic. What about something like
'mpls: export mpls_forward()'?

When parsing history, the user knows precisely what is done in the commit
without openning it.


Regards,
Nicolas

^ permalink raw reply

* Re: [PATCH net] l2tp: fix l2tp_eth module loading
From: Tom Parkin @ 2017-09-28 14:17 UTC (permalink / raw)
  To: Guillaume Nault; +Cc: netdev, James Chapman
In-Reply-To: <a7d351fd5948fc69ef17da3c170cd7c152f899c4.1506606179.git.g.nault@alphalink.fr>

[-- Attachment #1: Type: text/plain, Size: 4566 bytes --]

On Thu, Sep 28, 2017 at 03:44:38PM +0200, Guillaume Nault wrote:
> The l2tp_eth module crashes if its netlink callbacks are run when the
> pernet data aren't initialised.
> 
> We should normally register_pernet_device() before the genl callbacks.
> However, the pernet data only maintain a list of l2tpeth interfaces,
> and this list is never used. So let's just drop pernet handling
> instead.
> 
> Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support")
> Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>

Whoops.  I think this was intended to clear up the devices in the net
namespace, but since l2tp_core.c already deletes tunnels on namespace
exit I don't think it's necessary for l2tp_eth.c to do anything more.

> ---
>  net/l2tp/l2tp_eth.c | 51 ++-------------------------------------------------
>  1 file changed, 2 insertions(+), 49 deletions(-)
> 
> diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
> index 87da9ef61860..014a7bc2a872 100644
> --- a/net/l2tp/l2tp_eth.c
> +++ b/net/l2tp/l2tp_eth.c
> @@ -44,7 +44,6 @@ struct l2tp_eth {
>  	struct net_device	*dev;
>  	struct sock		*tunnel_sock;
>  	struct l2tp_session	*session;
> -	struct list_head	list;
>  	atomic_long_t		tx_bytes;
>  	atomic_long_t		tx_packets;
>  	atomic_long_t		tx_dropped;
> @@ -58,17 +57,6 @@ struct l2tp_eth_sess {
>  	struct net_device	*dev;
>  };
>  
> -/* per-net private data for this module */
> -static unsigned int l2tp_eth_net_id;
> -struct l2tp_eth_net {
> -	struct list_head l2tp_eth_dev_list;
> -	spinlock_t l2tp_eth_lock;
> -};
> -
> -static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net)
> -{
> -	return net_generic(net, l2tp_eth_net_id);
> -}
>  
>  static int l2tp_eth_dev_init(struct net_device *dev)
>  {
> @@ -84,12 +72,6 @@ static int l2tp_eth_dev_init(struct net_device *dev)
>  
>  static void l2tp_eth_dev_uninit(struct net_device *dev)
>  {
> -	struct l2tp_eth *priv = netdev_priv(dev);
> -	struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev));
> -
> -	spin_lock(&pn->l2tp_eth_lock);
> -	list_del_init(&priv->list);
> -	spin_unlock(&pn->l2tp_eth_lock);
>  	dev_put(dev);
>  }
>  
> @@ -273,7 +255,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
>  	struct l2tp_eth *priv;
>  	struct l2tp_eth_sess *spriv;
>  	int rc;
> -	struct l2tp_eth_net *pn;
>  
>  	if (cfg->ifname) {
>  		strlcpy(name, cfg->ifname, IFNAMSIZ);
> @@ -305,7 +286,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
>  	priv = netdev_priv(dev);
>  	priv->dev = dev;
>  	priv->session = session;
> -	INIT_LIST_HEAD(&priv->list);
>  
>  	priv->tunnel_sock = tunnel->sock;
>  	session->recv_skb = l2tp_eth_dev_recv;
> @@ -326,10 +306,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
>  	strlcpy(session->ifname, dev->name, IFNAMSIZ);
>  
>  	dev_hold(dev);
> -	pn = l2tp_eth_pernet(dev_net(dev));
> -	spin_lock(&pn->l2tp_eth_lock);
> -	list_add(&priv->list, &pn->l2tp_eth_dev_list);
> -	spin_unlock(&pn->l2tp_eth_lock);
>  
>  	return 0;
>  
> @@ -342,22 +318,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
>  	return rc;
>  }
>  
> -static __net_init int l2tp_eth_init_net(struct net *net)
> -{
> -	struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id);
> -
> -	INIT_LIST_HEAD(&pn->l2tp_eth_dev_list);
> -	spin_lock_init(&pn->l2tp_eth_lock);
> -
> -	return 0;
> -}
> -
> -static struct pernet_operations l2tp_eth_net_ops = {
> -	.init = l2tp_eth_init_net,
> -	.id   = &l2tp_eth_net_id,
> -	.size = sizeof(struct l2tp_eth_net),
> -};
> -
>  
>  static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = {
>  	.session_create	= l2tp_eth_create,
> @@ -371,25 +331,18 @@ static int __init l2tp_eth_init(void)
>  
>  	err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops);
>  	if (err)
> -		goto out;
> -
> -	err = register_pernet_device(&l2tp_eth_net_ops);
> -	if (err)
> -		goto out_unreg;
> +		goto err;
>  
>  	pr_info("L2TP ethernet pseudowire support (L2TPv3)\n");
>  
>  	return 0;
>  
> -out_unreg:
> -	l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH);
> -out:
> +err:
>  	return err;
>  }
>  
>  static void __exit l2tp_eth_exit(void)
>  {
> -	unregister_pernet_device(&l2tp_eth_net_ops);
>  	l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH);
>  }
>  
> -- 
> 2.14.2
> 

-- 
Tom Parkin
Katalix Systems Ltd
http://www.katalix.com
Catalysts for your Embedded Linux software development

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply

* Re: [PATCH v4 4/9] em28xx: fix em28xx_dvb_init for KASAN
From: Arnd Bergmann @ 2017-09-28 14:30 UTC (permalink / raw)
  To: Andrey Ryabinin
  Cc: David Laight, Mauro Carvalho Chehab, Jiri Pirko, Arend van Spriel,
	Kalle Valo, David S. Miller, Alexander Potapenko, Dmitry Vyukov,
	Masahiro Yamada, Michal Marek, Andrew Morton, Kees Cook,
	Geert Uytterhoeven, Greg Kroah-Hartman,
	linux-media@vger.kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, "linux-wireless@v
In-Reply-To: <2631e8a6-03f2-69ea-d889-afd9a345e7ef@virtuozzo.com>

On Thu, Sep 28, 2017 at 6:09 AM, Andrey Ryabinin
<aryabinin@virtuozzo.com> wrote:
> On 09/27/2017 04:26 PM, Arnd Bergmann wrote:
>> On Tue, Sep 26, 2017 at 9:49 AM, Andrey Ryabinin
>> <aryabinin@virtuozzo.com> wrote:

>> --- a/include/linux/string.h
>> +++ b/include/linux/string.h
>> @@ -227,7 +227,7 @@ static inline const char *kbasename(const char *path)
>>  #define __FORTIFY_INLINE extern __always_inline __attribute__((gnu_inline))
>>  #define __RENAME(x) __asm__(#x)
>>
>> -void fortify_panic(const char *name) __noreturn __cold;
>> +void fortify_panic(const char *name) __cold;
>>  void __read_overflow(void) __compiletime_error("detected read beyond
>> size of object passed as 1st parameter");
>>  void __read_overflow2(void) __compiletime_error("detected read beyond
>> size of object passed as 2nd parameter");
>>  void __read_overflow3(void) __compiletime_error("detected read beyond
>> size of object passed as 3rd parameter");
>>
>> I don't immediately see why the __noreturn changes the behavior here, any idea?
>>
>
>
> At first I thought that this somehow might be related to __asan_handle_no_return(). GCC calls it
> before noreturn function. So I made patch to remove generation of these calls (we don't need them in the kernel anyway)
> but it didn't help. It must be something else than.

I made a reduced test case yesterday (see http://paste.ubuntu.com/25628030/),
and it shows the same behavior with and without the sanitizer, it uses 128
bytes without the noreturn attribute and 480 bytes when its added, the sanitizer
adds a factor of 1.5x on top. It's possible that I did something wrong while
reducing, since the original driver file uses very little stack (a few hundred
bytes) without -fsanitize=kernel-address, but finding out what happens in
the reduced case may still help understand the other one.

        Arnd

^ permalink raw reply

* Re: EBPF-triggered WARNING at mm/percpu.c:1361 in v4-14-rc2
From: Daniel Borkmann @ 2017-09-28 14:37 UTC (permalink / raw)
  To: Mark Rutland, linux-kernel, linux-mm, netdev, syzkaller
  Cc: David S. Miller, Alexei Starovoitov, Tejun Heo, Christoph Lameter
In-Reply-To: <20170928112727.GA11310@leverpostej>

On 09/28/2017 01:27 PM, Mark Rutland wrote:
> Hi,
>
> While fuzzing v4.14-rc2 with Syzkaller, I found it was possible to trigger the
> warning at mm/percpu.c:1361, on both arm64 and x86_64. This appears to require
> increasing RLIMIT_MEMLOCK, so to the best of my knowledge this cannot be
> triggered by an unprivileged user.
>
> I've included example splats for both x86_64 and arm64, along with a C
> reproducer, inline below.
>
> It looks like dev_map_alloc() requests a percpu alloction of 32776 bytes, which
> is larger than the maximum supported allocation size of 32768 bytes.
>
> I wonder if it would make more sense to pr_warn() for sizes that are too
> large, so that callers don't have to roll their own checks against
> PCPU_MIN_UNIT_SIZE?

Perhaps the pr_warn() should be ratelimited; or could there be an
option where we only return NULL, not triggering a warn at all (which
would likely be what callers might do anyway when checking against
PCPU_MIN_UNIT_SIZE and then bailing out)?

> e.g. something like:
>
> ----
> diff --git a/mm/percpu.c b/mm/percpu.c
> index 59d44d6..f731c45 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -1355,8 +1355,13 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
>          bits = size >> PCPU_MIN_ALLOC_SHIFT;
>          bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
>
> -       if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
> -                    !is_power_of_2(align))) {
> +       if (unlikely(size > PCPU_MIN_UNIT_SIZE)) {
> +               pr_warn("cannot allocate pcpu chunk of size %zu (max %zu)\n",
> +                       size, PCPU_MIN_UNIT_SIZE);
> +               return NULL;
> +       }
> +
> +       if (unlikely(!size || align > PAGE_SIZE || !is_power_of_2(align))) {
>                  WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
>                       size, align);
>                  return NULL;
> ----
>
> Thanks,
> Mark.
>
>
>
> Example splat(x86_64)
> ----
> [  138.144185] illegal size (32776) or align (8) for percpu allocation
> [  138.150452] ------------[ cut here ]------------
> [  138.155074] WARNING: CPU: 1 PID: 2223 at mm/percpu.c:1361 pcpu_alloc+0x7c/0x5f0
> [  138.162369] Modules linked in:
> [  138.165423] CPU: 1 PID: 2223 Comm: repro Not tainted 4.14.0-rc2 #3
> [  138.171593] Hardware name: LENOVO 7484A3G/LENOVO, BIOS 5CKT54AUS 09/07/2009
> [  138.178543] task: ffff881b73069980 task.stack: ffffa36f40f90000
> [  138.184455] RIP: 0010:pcpu_alloc+0x7c/0x5f0
> [  138.188633] RSP: 0018:ffffa36f40f93e00 EFLAGS: 00010286
> [  138.193853] RAX: 0000000000000037 RBX: 0000000000000000 RCX: 0000000000000000
> [  138.200974] RDX: ffff881b7ec94a40 RSI: ffff881b7ec8cbb8 RDI: ffff881b7ec8cbb8
> [  138.208097] RBP: ffffa36f40f93e68 R08: 0000000000000001 R09: 00000000000002c4
> [  138.215219] R10: 0000562a577047f0 R11: ffffffffa10ad7cd R12: ffff881b73216cc0
> [  138.222343] R13: 0000000000000014 R14: 00007ffebeed0900 R15: ffffffffffffffea
> [  138.229463] FS:  00007fef84a15700(0000) GS:ffff881b7ec80000(0000) knlGS:0000000000000000
> [  138.237538] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  138.243274] CR2: 00007fef84497ba0 CR3: 00000001b3235000 CR4: 00000000000406e0
> [  138.250397] Call Trace:
> [  138.252844]  __alloc_percpu+0x10/0x20
> [  138.256508]  dev_map_alloc+0x122/0x1b0
> [  138.260255]  SyS_bpf+0x8f9/0x10b0
> [  138.263570]  ? security_task_setrlimit+0x3e/0x60
> [  138.268184]  ? do_prlimit+0xa6/0x1f0
> [  138.271760]  entry_SYSCALL_64_fastpath+0x13/0x94
> [  138.276372] RIP: 0033:0x7fef84546259
> [  138.279946] RSP: 002b:00007ffebeed09b8 EFLAGS: 00000206 ORIG_RAX: 0000000000000141
> [  138.287503] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fef84546259
> [  138.294627] RDX: 0000000000000014 RSI: 00007ffebeed09d0 RDI: 0000000000000000
> [  138.301749] RBP: 0000562a57704780 R08: 00007fef84810cb0 R09: 00007ffebeed0ae8
> [  138.308874] R10: 0000562a577047f0 R11: 0000000000000206 R12: 0000562a577045d0
> [  138.315997] R13: 00007ffebeed0ae0 R14: 0000000000000000 R15: 0000000000000000
> [  138.323122] Code: fe 00 10 00 00 77 10 48 8b 4d b8 48 89 c8 48 83 e8 01 48 85 c1 74 1e 48 8b 55 b8 48 8b 75 c0 48 c7 c7 90 5e be a0 e8 40 88 f3 ff <0f> ff 45 31 ed e9 5e 02 00 00 4c 8b 6d c0 49 89 cc 49 c1 ec 02
> [  138.341953] ---[ end trace b6e380365bfb8a36 ]---
> ----
>
>
>
> Example splat (arm64)
> ----
> [   17.287365] illegal size (32776) or align (8) for percpu allocation
> [   17.295347] ------------[ cut here ]------------
> [   17.297191] WARNING: CPU: 1 PID: 1440 at mm/percpu.c:1361 pcpu_alloc+0x120/0x9f0
> [   17.307723] Kernel panic - not syncing: panic_on_warn set ...
> [   17.307723]
> [   17.311755] CPU: 1 PID: 1440 Comm: repro Not tainted 4.14.0-rc2-00001-gd7ad33d #115
> [   17.320675] Hardware name: linux,dummy-virt (DT)
> [   17.323858] Call trace:
> [   17.325246] [<ffff200008094e98>] dump_backtrace+0x0/0x558
> [   17.332538] [<ffff200008095410>] show_stack+0x20/0x30
> [   17.340391] [<ffff20000a312628>] dump_stack+0x128/0x1a0
> [   17.342081] [<ffff20000815e330>] panic+0x250/0x518
> [   17.344096] [<ffff20000815e074>] __warn+0x2a4/0x310
> [   17.345654] [<ffff20000a310984>] report_bug+0x1d4/0x290
> [   17.348652] [<ffff2000080957c8>] bug_handler.part.1+0x40/0xf8
> [   17.356873] [<ffff2000080958cc>] bug_handler+0x4c/0x88
> [   17.360543] [<ffff20000808640c>] brk_handler+0x1c4/0x360
> [   17.365076] [<ffff200008081b68>] do_debug_exception+0x118/0x398
> [   17.368297] Exception stack(0xffff80001c82b930 to 0xffff80001c82ba70)
> [   17.372981] b920:                                   0000000000000037 0000000000000000
> [   17.380137] b940: bec1e481d6136f00 dfff200000000000 1fffe40001cbd30c dfff200000000000
> [   17.384902] b960: dfff200000000000 0000000000000000 ffff80001ce6c050 1ffff000039cd809
> [   17.392527] b980: ffff80001ce6c048 ffff80001ce6c068 1ffff000039cd80c 1ffff000039cd80e
> [   17.396935] b9a0: 1ffff000039cd80d ffff20000e1485a0 0000000000000000 0000000000000000
> [   17.404665] b9c0: ffff20000da58140 0000000000000000 00000000014000c0 0000000000000008
> [   17.407064] b9e0: 0000000000000004 000000000000800b 1ffff000039057b9 ffff80001c82bdcc
> [   17.415067] ba00: 0000000000000000 1ffff000039c0b14 1ffff000039c0b12 ffff80001c82ba70
> [   17.419137] ba20: ffff20000850c880 ffff80001c82ba70 ffff20000850c880 0000000080000145
> [   17.426052] ba40: 0000000000000008 ffff20000ae60b88 0001000000000000 00000000f4f4f404
> [   17.437346] ba60: ffff80001c82ba70 ffff20000850c880
> [   17.445759] [<ffff200008083ef0>] el1_dbg+0x18/0x74
> [   17.448272] [<ffff20000850c880>] pcpu_alloc+0x120/0x9f0
> [   17.456523] [<ffff20000850d1c8>] __alloc_percpu+0x30/0x40
> [   17.458412] [<ffff200008425d2c>] dev_map_alloc+0x58c/0x8d8
> [   17.462917] [<ffff2000083f0634>] SyS_bpf+0x86c/0x2d58
> [   17.468126] Exception stack(0xffff80001c82bec0 to 0xffff80001c82c000)
> [   17.470108] bec0: 0000000000000000 0000ffffe78f4898 0000000000000014 0000000000000000
> [   17.474655] bee0: 0000000000000000 0000ffffe78f49f0 0000001000000000 0000001000000000
> [   17.482196] bf00: 0000000000000118 0003ffffffffffff 0101010101010101 0000001000000000
> [   17.486928] bf20: 0000ffffb6468030 0000000000000000 0000ffffb6468028 000000000000071c
> [   17.498389] bf40: 0000ffffb63b8a00 0000aaaaba991028 0000000000000000 0000aaaaba9808f0
> [   17.502871] bf60: 0000000000000000 0000aaaaba980730 0000000000000000 0000000000000000
> [   17.505879] bf80: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> [   17.514090] bfa0: 0000000000000000 0000ffffe78f4870 0000aaaaba9808e0 0000ffffe78f4870
> [   17.520813] bfc0: 0000ffffb63b8a24 0000000080000000 0000000000000000 0000000000000118
> [   17.523888] bfe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> [   17.532537] [<ffff2000080846f0>] el0_svc_naked+0x24/0x28
> [   17.540427] SMP: stopping secondary CPUs
> [   17.566662] Kernel Offset: disabled
> [   17.567498] CPU features: 0x002082
> [   17.568238] Memory Limit: none
> [   17.568958] Rebooting in 86400 seconds..
> ----
>
>
>
> C reproducer
> ----
> #include <stdint.h>
> #include <sys/resource.h>
> #include <sys/syscall.h>
> #include <unistd.h>
>
> #include <linux/bpf.h>
>
> /*
>   * Debian Stretch's headers are too old to contain a number of interesting
>   * values, so manually define them to keep things legible...
>   */
> struct LOCALDEF_bpf_attr {
> 	uint32_t	map_type;
> 	uint32_t	key_size;
> 	uint32_t	value_size;
> 	uint32_t	max_entries;
> 	uint32_t	map_flags;
> };
>
> #define LOCALDEF_BPF_MAP_TYPE_DEVMAP 0xe
>
> int main(int argc, char *argv[])
> {
> 	struct rlimit rlimit = {
> 		.rlim_cur = 8 << 20,
> 		.rlim_max = 8 << 20,
> 	};
> 	
> 	setrlimit(RLIMIT_MEMLOCK, &rlimit);
>
> 	struct LOCALDEF_bpf_attr attr = {
> 		.map_type = LOCALDEF_BPF_MAP_TYPE_DEVMAP,
> 		.key_size = 4,
> 		.value_size = 4,
> 		.max_entries = 0x40001,
> 		.map_flags = 0,
> 	};
>
> 	syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
>
> 	return 0;
> }
> ----
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: EBPF-triggered WARNING at mm/percpu.c:1361 in v4-14-rc2
From: Mark Rutland @ 2017-09-28 14:45 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: linux-kernel, linux-mm, netdev, syzkaller, David S. Miller,
	Alexei Starovoitov, Tejun Heo, Christoph Lameter
In-Reply-To: <59CD093A.6030201@iogearbox.net>

On Thu, Sep 28, 2017 at 04:37:46PM +0200, Daniel Borkmann wrote:
> On 09/28/2017 01:27 PM, Mark Rutland wrote:
> >Hi,
> >
> >While fuzzing v4.14-rc2 with Syzkaller, I found it was possible to trigger the
> >warning at mm/percpu.c:1361, on both arm64 and x86_64. This appears to require
> >increasing RLIMIT_MEMLOCK, so to the best of my knowledge this cannot be
> >triggered by an unprivileged user.
> >
> >I've included example splats for both x86_64 and arm64, along with a C
> >reproducer, inline below.
> >
> >It looks like dev_map_alloc() requests a percpu alloction of 32776 bytes, which
> >is larger than the maximum supported allocation size of 32768 bytes.
> >
> >I wonder if it would make more sense to pr_warn() for sizes that are too
> >large, so that callers don't have to roll their own checks against
> >PCPU_MIN_UNIT_SIZE?
> 
> Perhaps the pr_warn() should be ratelimited; or could there be an
> option where we only return NULL, not triggering a warn at all (which
> would likely be what callers might do anyway when checking against
> PCPU_MIN_UNIT_SIZE and then bailing out)?

Those both make sense to me; checking __GFP_NOWARN should be easy
enough.

Just to check, do you think that dev_map_alloc() should explicitly test
the size against PCPU_MIN_UNIT_SIZE, prior to calling pcpu_alloc()?

I can spin both patches if so.

Thanks,
Mark.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: EBPF-triggered WARNING at mm/percpu.c:1361 in v4-14-rc2
From: Tejun Heo @ 2017-09-28 14:52 UTC (permalink / raw)
  To: Mark Rutland
  Cc: linux-kernel, linux-mm, netdev, syzkaller, Daniel Borkmann,
	David S. Miller, Alexei Starovoitov, Christoph Lameter
In-Reply-To: <20170928112727.GA11310@leverpostej>

Hello,

On Thu, Sep 28, 2017 at 12:27:28PM +0100, Mark Rutland wrote:
> diff --git a/mm/percpu.c b/mm/percpu.c
> index 59d44d6..f731c45 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -1355,8 +1355,13 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
>         bits = size >> PCPU_MIN_ALLOC_SHIFT;
>         bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
>  
> -       if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
> -                    !is_power_of_2(align))) {
> +       if (unlikely(size > PCPU_MIN_UNIT_SIZE)) {
> +               pr_warn("cannot allocate pcpu chunk of size %zu (max %zu)\n",
> +                       size, PCPU_MIN_UNIT_SIZE);

WARN_ONCE() probably is the better choice here.  We wanna know who
tries to allocate larger than the supported size and increase the size
limit if warranted.

Thanks.

-- 
tejun

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: EBPF-triggered WARNING at mm/percpu.c:1361 in v4-14-rc2
From: Tejun Heo @ 2017-09-28 14:53 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Daniel Borkmann, linux-kernel, linux-mm, netdev, syzkaller,
	David S. Miller, Alexei Starovoitov, Christoph Lameter
In-Reply-To: <20170928144538.GA32487@leverpostej>

Hello,

On Thu, Sep 28, 2017 at 03:45:38PM +0100, Mark Rutland wrote:
> > Perhaps the pr_warn() should be ratelimited; or could there be an
> > option where we only return NULL, not triggering a warn at all (which
> > would likely be what callers might do anyway when checking against
> > PCPU_MIN_UNIT_SIZE and then bailing out)?
> 
> Those both make sense to me; checking __GFP_NOWARN should be easy
> enough.

That also makes sense.

> Just to check, do you think that dev_map_alloc() should explicitly test
> the size against PCPU_MIN_UNIT_SIZE, prior to calling pcpu_alloc()?

But let's please not do this.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: EBPF-triggered WARNING at mm/percpu.c:1361 in v4-14-rc2
From: Daniel Borkmann @ 2017-09-28 15:00 UTC (permalink / raw)
  To: Mark Rutland
  Cc: linux-kernel, linux-mm, netdev, syzkaller, David S. Miller,
	Alexei Starovoitov, Tejun Heo, Christoph Lameter
In-Reply-To: <20170928144538.GA32487@leverpostej>

On 09/28/2017 04:45 PM, Mark Rutland wrote:
> On Thu, Sep 28, 2017 at 04:37:46PM +0200, Daniel Borkmann wrote:
>> On 09/28/2017 01:27 PM, Mark Rutland wrote:
>>> Hi,
>>>
>>> While fuzzing v4.14-rc2 with Syzkaller, I found it was possible to trigger the
>>> warning at mm/percpu.c:1361, on both arm64 and x86_64. This appears to require
>>> increasing RLIMIT_MEMLOCK, so to the best of my knowledge this cannot be
>>> triggered by an unprivileged user.
>>>
>>> I've included example splats for both x86_64 and arm64, along with a C
>>> reproducer, inline below.
>>>
>>> It looks like dev_map_alloc() requests a percpu alloction of 32776 bytes, which
>>> is larger than the maximum supported allocation size of 32768 bytes.
>>>
>>> I wonder if it would make more sense to pr_warn() for sizes that are too
>>> large, so that callers don't have to roll their own checks against
>>> PCPU_MIN_UNIT_SIZE?
>>
>> Perhaps the pr_warn() should be ratelimited; or could there be an
>> option where we only return NULL, not triggering a warn at all (which
>> would likely be what callers might do anyway when checking against
>> PCPU_MIN_UNIT_SIZE and then bailing out)?
>
> Those both make sense to me; checking __GFP_NOWARN should be easy
> enough.
>
> Just to check, do you think that dev_map_alloc() should explicitly test
> the size against PCPU_MIN_UNIT_SIZE, prior to calling pcpu_alloc()?

Looks like there are users of __alloc_percpu_gfp() with __GFP_NOWARN
in couple of places already, but __GFP_NOWARN is ignored. Would make
sense to support that indeed to avoid throwing the warn and just let
the caller bail out when it sees the NULL as usual. In some cases (like
the current ones) this makes sense, others probably not too much and
a WARN would be preferred way, but __alloc_percpu_gfp() could provide
such option to simplify some of the code that pre checks against the
limit on PCPU_MIN_UNIT_SIZE before calling the allocator and doesn't
throw a WARN either; and most likely such check is just to prevent
the user from seeing exactly this splat.

Thanks,
Daniel

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH V3] r8152: add Linksys USB3GIGV1 id
From: Doug Anderson @ 2017-09-28 15:11 UTC (permalink / raw)
  To: Grant Grundler
  Cc: Hayes Wang, Oliver Neukum, linux-usb, David S . Miller, LKML,
	netdev
In-Reply-To: <CANEJEGvK94v_SV9xWmDaiA4CZOiroKZVhfCH4vydF6_Su2_PyA@mail.gmail.com>

Hi,

On Wed, Sep 27, 2017 at 5:07 PM, Grant Grundler <grundler@chromium.org> wrote:
>>>  #define DELL_VENDOR_ID         0x413C
>>>  #define REALTEK_VENDOR_ID      0x0bda
>>>  #define SAMSUNG_VENDOR_ID      0x04e8
>>> +#define LINKSYS_VENDOR_ID      0x13b1
>>>  #define LENOVO_VENDOR_ID       0x17ef
>>
>> Slight nit that "LI" sorts after "LE".  You got it right in the other case...
>
> The list isn't sorted by any rational thing I can see.  I managed to
> check my OCD reaction to sort the list numerically. :)

Whoops, you're right.  It seems to be in a random order.  I just saw
LE, LI, and N sorted properly and jumped to a conclusion.  In any
case, if it's all the same to you it'd be nice if you were consistent
between LENOVO/LINKSYS in the two files.  :-P

-Doug

^ permalink raw reply

* Re: [PATCH net] l2tp: fix l2tp_eth module loading
From: Guillaume Nault @ 2017-09-28 15:14 UTC (permalink / raw)
  To: Tom Parkin; +Cc: netdev, James Chapman
In-Reply-To: <20170928141728.GA3107@jackdaw>

On Thu, Sep 28, 2017 at 03:17:28PM +0100, Tom Parkin wrote:
> On Thu, Sep 28, 2017 at 03:44:38PM +0200, Guillaume Nault wrote:
> > The l2tp_eth module crashes if its netlink callbacks are run when the
> > pernet data aren't initialised.
> > 
> > We should normally register_pernet_device() before the genl callbacks.
> > However, the pernet data only maintain a list of l2tpeth interfaces,
> > and this list is never used. So let's just drop pernet handling
> > instead.
> > 
> > Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support")
> > Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
> 
> Whoops.  I think this was intended to clear up the devices in the net
> namespace,
Yes, that's what I thought too. That's what virtual devices are
supposed to do and think I'll eventually implement this at a later
time.

> but since l2tp_core.c already deletes tunnels on namespace
> exit I don't think it's necessary for l2tp_eth.c to do anything more.
> 
Well, removing l2tpeth devices is just a side effect of closing the
tunnel. But the tunnel may be in a different namespace than the device,
in case the later was moved after creation. In this case, the l2tpeth
interface isn't deleted when its namespace is destroyed. It's moved
to the initial namespace instead (because, for now, l2tpeth devices
don't implement ->rtnl_link_ops).

These are shortcomings I'd like to fix, but there are more important
issues to tackle first.

^ permalink raw reply

* Re: [PATCH RFC 3/5] Add KSZ8795 switch driver
From: Pavel Machek @ 2017-09-28 15:24 UTC (permalink / raw)
  To: Tristram.Ha
  Cc: andrew, muvarov, nathan.leigh.conrad, vivien.didelot, f.fainelli,
	netdev, linux-kernel, Woojung.Huh
In-Reply-To: <93AF473E2DA327428DE3D46B72B1E9FD41123010@CHN-SV-EXMX02.mchp-main.com>

[-- Attachment #1: Type: text/plain, Size: 683 bytes --]

> > > > Similar code will be needed by other drivers, right?
> > >
> > > Although KSZ8795 and KSZ8895 may use the same code, the other
> > > chips will have different code.
> > 
> > Ok, please make sure code is shared between these two.
> 
> The exact function probably cannot be shared between KSZ8795
> and KSZ8895 drivers because although the register constants look
> the same but their exact locations are different in the 2 chips.

Put the code into header as static inline and include it from both
places?

									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* Re: [PATCH RFC 0/6] Modify KSZ9477 DSA driver in preparation to add other KSZ switch drivers.
From: Pavel Machek @ 2017-09-28 15:24 UTC (permalink / raw)
  To: Tristram.Ha
  Cc: andrew, muvarov, nathan.leigh.conrad, vivien.didelot, f.fainelli,
	netdev, linux-kernel, Woojung.Huh
In-Reply-To: <93AF473E2DA327428DE3D46B72B1E9FD41122FFB@CHN-SV-EXMX02.mchp-main.com>

[-- Attachment #1: Type: text/plain, Size: 796 bytes --]

Hi!

> > >  drivers/net/dsa/microchip/Makefile     |    2 +-
> > >  drivers/net/dsa/microchip/ksz9477.c    | 1317
> > ++++++++++++++++++++++++++++++++
> > 
> > We already have ksz_9477_reg.h. So should this be ksz_9477.c for
> > consistency?
> 
> The product name is KSZ9477 and other switches are also like KSZ####,
> so I would prefer to have no separation between KSZ and the product
> number.  I think the file ksz_9477_reg.h was named that way because
> the other files were named ksz_common.c and ksz_spi.c.  If need to
> we can change the file name.

I don't care either way, but please make it consistent.

Thanks,
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* [PATCH net-next RFC 0/9] net: dsa: PTP timestamping for mv88e6xxx
From: Brandon Streiff @ 2017-09-28 15:25 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, David S. Miller, Florian Fainelli, Andrew Lunn,
	Vivien Didelot, Richard Cochran, Erik Hons, Brandon Streiff

This patch series adds support for PTP timestamping through the DSA
framework, as well as an implementation for mv8e6xxx switches.

This implementation was targeted at a National Instruments platform
that uses the Marvell 88E6341 (Topaz). I've tried to enable support on
other Marvell switches where the register interfaces seemed compatible,
but I don't have the hardware to verify their operation myself.

This series probably ties in well with Richard's comment last week
("Re: [RFC net-next 0/5] TSN: Add qdisc-based config interfaces for
traffic shapers") about figuring out proper interfaces for managing
switch-level PTP timestamps.

A couple patches that I expect may need further polishing:

- Patch #2: We expose the switch time as a PTP clock but don't support
  adjustment (max_adj=0). Our platform adjusted a systemwide oscillator
  from userspace, so we didn't need adjustment at this layer, but other
  PTP clock drivers support this and we probably should too.

- Patch #3: The GPIO config support is handled in a very simple manner.
  I suspect a longer term goal would be to use pinctrl here.

- Patch #6: the dsa_switch pointer and port index is plumbed from
  dsa_device_ops::rcv so that we can call the correct port_rxtstamp
  method. This involved instrumenting all of the *_tag_rcv functions in
  a way that's kind of a kludge and that I'm not terribly happy with.

This applies to net-next as of 14a0d032f4ec.

Feedback is appreciated.

-- brandon


Brandon Streiff (9):
  net: dsa: mv88e6xxx: add accessors for PTP/TAI registers
  net: dsa: mv88e6xxx: expose switch time as a PTP hardware clock
  net: dsa: mv88e6xxx: add support for GPIO configuration
  net: dsa: mv88e6xxx: add support for event capture
  net: dsa: forward hardware timestamping ioctls to switch driver
  net: dsa: forward timestamping callbacks to switch drivers
  ptp: add offset for reserved field to header
  net: dsa: mv88e6xxx: add rx/tx timestamping support
  net: dsa: mv88e6xxx: add workaround for 6341 timestamping

 drivers/net/dsa/mv88e6xxx/Kconfig    |  10 +
 drivers/net/dsa/mv88e6xxx/Makefile   |   2 +
 drivers/net/dsa/mv88e6xxx/chip.c     |  65 +++++
 drivers/net/dsa/mv88e6xxx/chip.h     |  71 +++++
 drivers/net/dsa/mv88e6xxx/global2.c  | 244 ++++++++++++++++
 drivers/net/dsa/mv88e6xxx/global2.h  |  59 +++-
 drivers/net/dsa/mv88e6xxx/hwtstamp.c | 548 +++++++++++++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/hwtstamp.h | 171 +++++++++++
 drivers/net/dsa/mv88e6xxx/ptp.c      | 493 +++++++++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/ptp.h      |  99 +++++++
 include/linux/ptp_classify.h         |   1 +
 include/net/dsa.h                    |  28 +-
 net/dsa/dsa.c                        |  39 ++-
 net/dsa/slave.c                      |  67 ++++-
 net/dsa/tag_brcm.c                   |   6 +-
 net/dsa/tag_dsa.c                    |   6 +-
 net/dsa/tag_edsa.c                   |   6 +-
 net/dsa/tag_ksz.c                    |   6 +-
 net/dsa/tag_lan9303.c                |   6 +-
 net/dsa/tag_mtk.c                    |   6 +-
 net/dsa/tag_qca.c                    |   6 +-
 net/dsa/tag_trailer.c                |   6 +-
 22 files changed, 1929 insertions(+), 16 deletions(-)
 create mode 100644 drivers/net/dsa/mv88e6xxx/hwtstamp.c
 create mode 100644 drivers/net/dsa/mv88e6xxx/hwtstamp.h
 create mode 100644 drivers/net/dsa/mv88e6xxx/ptp.c
 create mode 100644 drivers/net/dsa/mv88e6xxx/ptp.h

-- 
2.1.4

^ permalink raw reply

* [PATCH net-next RFC 1/9] net: dsa: mv88e6xxx: add accessors for PTP/TAI registers
From: Brandon Streiff @ 2017-09-28 15:25 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, David S. Miller, Florian Fainelli, Andrew Lunn,
	Vivien Didelot, Richard Cochran, Erik Hons, Brandon Streiff
In-Reply-To: <1506612341-18061-1-git-send-email-brandon.streiff@ni.com>

This patch implements support for accessing PTP/TAI registers through
the AVB register interface in the Global 2 register.

The register interface differs slightly between different models; older
models use a 3-bit operations field, while newer models use a 2-bit
field. The operations values and the special "global port" values are
different between the two. This is a similar split to the differences
in the "Ingress Rate" register between models, so, like in that case,
we call the two variants "6352" and "6390" and create an ops structure
to abstract between the two.

Signed-off-by: Brandon Streiff <brandon.streiff@ni.com>
---
 drivers/net/dsa/mv88e6xxx/chip.c    |   9 ++
 drivers/net/dsa/mv88e6xxx/chip.h    |  17 ++++
 drivers/net/dsa/mv88e6xxx/global2.c | 173 ++++++++++++++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/global2.h |  27 +++++-
 4 files changed, 225 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index c6678aa..67efa7b 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2783,6 +2783,7 @@ static const struct mv88e6xxx_ops mv88e6240_ops = {
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
 	.serdes_power = mv88e6352_serdes_power,
+	.avb_ops = &mv88e6352_avb_ops,
 };
 
 static const struct mv88e6xxx_ops mv88e6290_ops = {
@@ -2819,6 +2820,7 @@ static const struct mv88e6xxx_ops mv88e6290_ops = {
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
+	.avb_ops = &mv88e6390_avb_ops,
 };
 
 static const struct mv88e6xxx_ops mv88e6320_ops = {
@@ -2852,6 +2854,7 @@ static const struct mv88e6xxx_ops mv88e6320_ops = {
 	.reset = mv88e6352_g1_reset,
 	.vtu_getnext = mv88e6185_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6185_g1_vtu_loadpurge,
+	.avb_ops = &mv88e6352_avb_ops,
 };
 
 static const struct mv88e6xxx_ops mv88e6321_ops = {
@@ -2883,6 +2886,7 @@ static const struct mv88e6xxx_ops mv88e6321_ops = {
 	.reset = mv88e6352_g1_reset,
 	.vtu_getnext = mv88e6185_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6185_g1_vtu_loadpurge,
+	.avb_ops = &mv88e6352_avb_ops,
 };
 
 static const struct mv88e6xxx_ops mv88e6341_ops = {
@@ -2918,6 +2922,7 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
 	.reset = mv88e6352_g1_reset,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
+	.avb_ops = &mv88e6390_avb_ops,
 };
 
 static const struct mv88e6xxx_ops mv88e6350_ops = {
@@ -2984,6 +2989,7 @@ static const struct mv88e6xxx_ops mv88e6351_ops = {
 	.reset = mv88e6352_g1_reset,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
+	.avb_ops = &mv88e6352_avb_ops,
 };
 
 static const struct mv88e6xxx_ops mv88e6352_ops = {
@@ -3020,6 +3026,7 @@ static const struct mv88e6xxx_ops mv88e6352_ops = {
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
 	.serdes_power = mv88e6352_serdes_power,
+	.avb_ops = &mv88e6352_avb_ops,
 };
 
 static const struct mv88e6xxx_ops mv88e6390_ops = {
@@ -3058,6 +3065,7 @@ static const struct mv88e6xxx_ops mv88e6390_ops = {
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
+	.avb_ops = &mv88e6390_avb_ops,
 };
 
 static const struct mv88e6xxx_ops mv88e6390x_ops = {
@@ -3096,6 +3104,7 @@ static const struct mv88e6xxx_ops mv88e6390x_ops = {
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
+	.avb_ops = &mv88e6390_avb_ops,
 };
 
 static const struct mv88e6xxx_info mv88e6xxx_table[] = {
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 334f6f7..d99d592 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -146,6 +146,7 @@ struct mv88e6xxx_vtu_entry {
 
 struct mv88e6xxx_bus_ops;
 struct mv88e6xxx_irq_ops;
+struct mv88e6xxx_avb_ops;
 
 struct mv88e6xxx_irq {
 	u16 masked;
@@ -342,6 +343,9 @@ struct mv88e6xxx_ops {
 			   struct mv88e6xxx_vtu_entry *entry);
 	int (*vtu_loadpurge)(struct mv88e6xxx_chip *chip,
 			     struct mv88e6xxx_vtu_entry *entry);
+
+	/* Interface to the AVB/PTP registers */
+	const struct mv88e6xxx_avb_ops *avb_ops;
 };
 
 struct mv88e6xxx_irq_ops {
@@ -353,6 +357,19 @@ struct mv88e6xxx_irq_ops {
 	void (*irq_free)(struct mv88e6xxx_chip *chip);
 };
 
+struct mv88e6xxx_avb_ops {
+	int (*port_ptp_read)(struct mv88e6xxx_chip *chip, int port, int addr,
+			     u16 *data, int len);
+	int (*port_ptp_write)(struct mv88e6xxx_chip *chip, int port, int addr,
+			      u16 data);
+	int (*ptp_read)(struct mv88e6xxx_chip *chip, int addr, u16 *data,
+			int len);
+	int (*ptp_write)(struct mv88e6xxx_chip *chip, int addr, u16 data);
+	int (*tai_read)(struct mv88e6xxx_chip *chip, int addr, u16 *data,
+			int len);
+	int (*tai_write)(struct mv88e6xxx_chip *chip, int addr, u16 data);
+};
+
 #define STATS_TYPE_PORT		BIT(0)
 #define STATS_TYPE_BANK0	BIT(1)
 #define STATS_TYPE_BANK1	BIT(2)
diff --git a/drivers/net/dsa/mv88e6xxx/global2.c b/drivers/net/dsa/mv88e6xxx/global2.c
index af07278..b6d0c71 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.c
+++ b/drivers/net/dsa/mv88e6xxx/global2.c
@@ -590,6 +590,179 @@ int mv88e6xxx_g2_set_eeprom16(struct mv88e6xxx_chip *chip,
 	return 0;
 }
 
+/* Offset 0x16: AVB Command Register
+ * Offset 0x17: AVB Data Register
+ *
+ * There are two different versions of this register interface:
+ *    "6352": 3-bit "op" field, 4-bit "port" field.
+ *    "6390": 2-bit "op" field, 5-bit "port" field.
+ *
+ * The "op" codes are different between the two, as well as the special
+ * port fields for global PTP and TAI configuration.
+ */
+
+/* mv88e6xxx_g2_avb_read -- Read one or multiple 16-bit words.
+ * The hardware supports snapshotting up to four contiguous registers.
+ */
+static int mv88e6xxx_g2_avb_read(struct mv88e6xxx_chip *chip, u16 readop,
+				 u16 *data, int len)
+{
+	int err;
+	int i;
+
+	/* Hardware can only snapshot four words. */
+	if (unlikely(len > 4))
+		return -E2BIG;
+
+	err = mv88e6xxx_g2_update(chip, MV88E6352_G2_AVB_CMD, readop);
+	if (err)
+		return err;
+
+	for (i = 0; i < len; ++i) {
+		err = mv88e6xxx_g2_read(chip, MV88E6352_G2_AVB_DATA,
+					&data[i]);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/* mv88e6xxx_g2_avb_write -- Write one 16-bit word. */
+static int mv88e6xxx_g2_avb_write(struct mv88e6xxx_chip *chip, u16 writeop,
+				  u16 data)
+{
+	int err;
+
+	err = mv88e6xxx_g2_write(chip, MV88E6352_G2_AVB_DATA, data);
+	if (err)
+		return err;
+
+	return mv88e6xxx_g2_update(chip, MV88E6352_G2_AVB_CMD, writeop);
+}
+
+static int mv88e6352_port_ptp_read(struct mv88e6xxx_chip *chip, int port,
+				   int addr, u16 *data, int len)
+{
+	u16 readop = (len == 1 ? MV88E6352_G2_AVB_CMD_OP_READ :
+				 MV88E6352_G2_AVB_CMD_OP_READ_INCR) |
+		     (port << 8) | (MV88E6352_G2_AVB_CMD_BLOCK_PTP << 5) |
+		     addr;
+
+	return mv88e6xxx_g2_avb_read(chip, readop, data, len);
+}
+
+static int mv88e6352_port_ptp_write(struct mv88e6xxx_chip *chip, int port,
+				    int addr, u16 data)
+{
+	u16 writeop = MV88E6352_G2_AVB_CMD_OP_WRITE | (port << 8) |
+		      (MV88E6352_G2_AVB_CMD_BLOCK_PTP << 5) | addr;
+
+	return mv88e6xxx_g2_avb_write(chip, writeop, data);
+}
+
+static int mv88e6352_ptp_read(struct mv88e6xxx_chip *chip, int addr,
+			      u16 *data, int len)
+{
+	return mv88e6352_port_ptp_read(chip,
+				       MV88E6352_G2_AVB_CMD_PORT_PTPGLOBAL,
+				       addr, data, len);
+}
+
+static int mv88e6352_ptp_write(struct mv88e6xxx_chip *chip, int addr,
+			       u16 data)
+{
+	return mv88e6352_port_ptp_write(chip,
+					MV88E6352_G2_AVB_CMD_PORT_PTPGLOBAL,
+					addr, data);
+}
+
+static int mv88e6352_tai_read(struct mv88e6xxx_chip *chip, int addr,
+			      u16 *data, int len)
+{
+	return mv88e6352_port_ptp_read(chip,
+				       MV88E6352_G2_AVB_CMD_PORT_TAIGLOBAL,
+				       addr, data, len);
+}
+
+static int mv88e6352_tai_write(struct mv88e6xxx_chip *chip, int addr,
+			       u16 data)
+{
+	return mv88e6352_port_ptp_write(chip,
+					MV88E6352_G2_AVB_CMD_PORT_TAIGLOBAL,
+					addr, data);
+}
+
+const struct mv88e6xxx_avb_ops mv88e6352_avb_ops = {
+	.port_ptp_read		= mv88e6352_port_ptp_read,
+	.port_ptp_write		= mv88e6352_port_ptp_write,
+	.ptp_read		= mv88e6352_ptp_read,
+	.ptp_write		= mv88e6352_ptp_write,
+	.tai_read		= mv88e6352_tai_read,
+	.tai_write		= mv88e6352_tai_write,
+};
+
+static int mv88e6390_port_ptp_read(struct mv88e6xxx_chip *chip, int port,
+				   int addr, u16 *data, int len)
+{
+	u16 readop = (len == 1 ? MV88E6390_G2_AVB_CMD_OP_READ :
+				 MV88E6390_G2_AVB_CMD_OP_READ_INCR) |
+		     (port << 8) | (MV88E6352_G2_AVB_CMD_BLOCK_PTP << 5) |
+		     addr;
+
+	return mv88e6xxx_g2_avb_read(chip, readop, data, len);
+}
+
+static int mv88e6390_port_ptp_write(struct mv88e6xxx_chip *chip, int port,
+				    int addr, u16 data)
+{
+	u16 writeop = MV88E6390_G2_AVB_CMD_OP_WRITE | (port << 8) |
+		      (MV88E6352_G2_AVB_CMD_BLOCK_PTP << 5) | addr;
+
+	return mv88e6xxx_g2_avb_write(chip, writeop, data);
+}
+
+static int mv88e6390_ptp_read(struct mv88e6xxx_chip *chip, int addr,
+			      u16 *data, int len)
+{
+	return mv88e6390_port_ptp_read(chip,
+				       MV88E6390_G2_AVB_CMD_PORT_PTPGLOBAL,
+				       addr, data, len);
+}
+
+static int mv88e6390_ptp_write(struct mv88e6xxx_chip *chip, int addr,
+			       u16 data)
+{
+	return mv88e6390_port_ptp_write(chip,
+					MV88E6390_G2_AVB_CMD_PORT_PTPGLOBAL,
+					addr, data);
+}
+
+static int mv88e6390_tai_read(struct mv88e6xxx_chip *chip, int addr,
+			      u16 *data, int len)
+{
+	return mv88e6390_port_ptp_read(chip,
+				       MV88E6390_G2_AVB_CMD_PORT_TAIGLOBAL,
+				       addr, data, len);
+}
+
+static int mv88e6390_tai_write(struct mv88e6xxx_chip *chip, int addr,
+			       u16 data)
+{
+	return mv88e6390_port_ptp_write(chip,
+					MV88E6390_G2_AVB_CMD_PORT_TAIGLOBAL,
+					addr, data);
+}
+
+const struct mv88e6xxx_avb_ops mv88e6390_avb_ops = {
+	.port_ptp_read		= mv88e6390_port_ptp_read,
+	.port_ptp_write		= mv88e6390_port_ptp_write,
+	.ptp_read		= mv88e6390_ptp_read,
+	.ptp_write		= mv88e6390_ptp_write,
+	.tai_read		= mv88e6390_tai_read,
+	.tai_write		= mv88e6390_tai_write,
+};
+
 /* Offset 0x18: SMI PHY Command Register
  * Offset 0x19: SMI PHY Data Register
  */
diff --git a/drivers/net/dsa/mv88e6xxx/global2.h b/drivers/net/dsa/mv88e6xxx/global2.h
index 669f590..642d2b0 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.h
+++ b/drivers/net/dsa/mv88e6xxx/global2.h
@@ -149,7 +149,26 @@
 #define MV88E6390_G2_EEPROM_ADDR_MASK	0xffff
 
 /* Offset 0x16: AVB Command Register */
-#define MV88E6352_G2_AVB_CMD		0x16
+#define MV88E6352_G2_AVB_CMD			0x16
+#define MV88E6352_G2_AVB_CMD_BUSY		0x8000
+#define MV88E6352_G2_AVB_CMD_OP_READ		0x4000
+#define MV88E6352_G2_AVB_CMD_OP_READ_INCR	0x6000
+#define MV88E6352_G2_AVB_CMD_OP_WRITE		0x3000
+#define MV88E6390_G2_AVB_CMD_OP_READ		0x0000
+#define MV88E6390_G2_AVB_CMD_OP_READ_INCR	0x4000
+#define MV88E6390_G2_AVB_CMD_OP_WRITE		0x6000
+#define MV88E6352_G2_AVB_CMD_PORT_MASK		0x0f00
+#define MV88E6352_G2_AVB_CMD_PORT_TAIGLOBAL	0xe
+#define MV88E6352_G2_AVB_CMD_PORT_PTPGLOBAL	0xf
+#define MV88E6390_G2_AVB_CMD_PORT_MASK		0x1f00
+#define MV88E6390_G2_AVB_CMD_PORT_TAIGLOBAL	0x1e
+#define MV88E6390_G2_AVB_CMD_PORT_PTPGLOBAL	0x1f
+#define MV88E6352_G2_AVB_CMD_BLOCK_PTP		0
+#define MV88E6352_G2_AVB_CMD_BLOCK_AVB		1
+#define MV88E6352_G2_AVB_CMD_BLOCK_QAV		2
+#define MV88E6352_G2_AVB_CMD_BLOCK_QVB		3
+#define MV88E6352_G2_AVB_CMD_BLOCK_MASK		0x00e0
+#define MV88E6352_G2_AVB_CMD_ADDR_MASK		0x001f
 
 /* Offset 0x17: AVB Data Register */
 #define MV88E6352_G2_AVB_DATA		0x17
@@ -267,6 +286,9 @@ int mv88e6xxx_g2_pot_clear(struct mv88e6xxx_chip *chip);
 extern const struct mv88e6xxx_irq_ops mv88e6097_watchdog_ops;
 extern const struct mv88e6xxx_irq_ops mv88e6390_watchdog_ops;
 
+extern const struct mv88e6xxx_avb_ops mv88e6352_avb_ops;
+extern const struct mv88e6xxx_avb_ops mv88e6390_avb_ops;
+
 #else /* !CONFIG_NET_DSA_MV88E6XXX_GLOBAL2 */
 
 static inline int mv88e6xxx_g2_require(struct mv88e6xxx_chip *chip)
@@ -382,6 +404,9 @@ static inline int mv88e6xxx_g2_pot_clear(struct mv88e6xxx_chip *chip)
 static const struct mv88e6xxx_irq_ops mv88e6097_watchdog_ops = {};
 static const struct mv88e6xxx_irq_ops mv88e6390_watchdog_ops = {};
 
+static const struct mv88e6xxx_avb_ops mv88e6352_avb_ops = {};
+static const struct mv88e6xxx_avb_ops mv88e6390_avb_ops = {};
+
 #endif /* CONFIG_NET_DSA_MV88E6XXX_GLOBAL2 */
 
 #endif /* _MV88E6XXX_GLOBAL2_H */
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next RFC 2/9] net: dsa: mv88e6xxx: expose switch time as a PTP hardware clock
From: Brandon Streiff @ 2017-09-28 15:25 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, David S. Miller, Florian Fainelli, Andrew Lunn,
	Vivien Didelot, Richard Cochran, Erik Hons, Brandon Streiff
In-Reply-To: <1506612341-18061-1-git-send-email-brandon.streiff@ni.com>

This patch adds basic support for exposing the 32-bit timestamp counter
inside the mv88e6xxx switch code as a ptp_clock.

Signed-off-by: Brandon Streiff <brandon.streiff@ni.com>
---
 drivers/net/dsa/mv88e6xxx/Kconfig  |  10 +++
 drivers/net/dsa/mv88e6xxx/Makefile |   1 +
 drivers/net/dsa/mv88e6xxx/chip.c   |  20 +++++
 drivers/net/dsa/mv88e6xxx/chip.h   |  16 ++++
 drivers/net/dsa/mv88e6xxx/ptp.c    | 180 +++++++++++++++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/ptp.h    |  83 +++++++++++++++++
 6 files changed, 310 insertions(+)
 create mode 100644 drivers/net/dsa/mv88e6xxx/ptp.c
 create mode 100644 drivers/net/dsa/mv88e6xxx/ptp.h

diff --git a/drivers/net/dsa/mv88e6xxx/Kconfig b/drivers/net/dsa/mv88e6xxx/Kconfig
index 1aaa7a9..ae9e7f7 100644
--- a/drivers/net/dsa/mv88e6xxx/Kconfig
+++ b/drivers/net/dsa/mv88e6xxx/Kconfig
@@ -18,3 +18,13 @@ config NET_DSA_MV88E6XXX_GLOBAL2
 
 	  It is required on most chips. If the chip you compile the support for
 	  doesn't have such registers set, say N here. In doubt, say Y.
+
+config NET_DSA_MV88E6XXX_PTP
+	bool "PTP support for Marvell 88E6xxx"
+	default n
+	depends on NET_DSA_MV88E6XXX_GLOBAL2
+	imply NETWORK_PHY_TIMESTAMPING
+	imply PTP_1588_CLOCK
+	help
+	  Say Y to enable PTP hardware timestamping on Marvell 88E6xxx switch
+	  chips that support it.
diff --git a/drivers/net/dsa/mv88e6xxx/Makefile b/drivers/net/dsa/mv88e6xxx/Makefile
index 5cd5551..47eda30 100644
--- a/drivers/net/dsa/mv88e6xxx/Makefile
+++ b/drivers/net/dsa/mv88e6xxx/Makefile
@@ -6,4 +6,5 @@ mv88e6xxx-objs += global1_vtu.o
 mv88e6xxx-$(CONFIG_NET_DSA_MV88E6XXX_GLOBAL2) += global2.o
 mv88e6xxx-objs += phy.o
 mv88e6xxx-objs += port.o
+mv88e6xxx-$(CONFIG_NET_DSA_MV88E6XXX_PTP) += ptp.o
 mv88e6xxx-objs += serdes.o
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 67efa7b..2ed37d8 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -38,6 +38,7 @@
 #include "global2.h"
 #include "phy.h"
 #include "port.h"
+#include "ptp.h"
 #include "serdes.h"
 
 static void assert_reg_lock(struct mv88e6xxx_chip *chip)
@@ -2033,6 +2034,13 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 	if (err)
 		goto unlock;
 
+	/* Setup PTP Hardware Clock */
+	if (chip->info->ptp_support) {
+		err = mv88e6xxx_ptp_setup(chip);
+		if (err)
+			goto unlock;
+	}
+
 unlock:
 	mutex_unlock(&chip->reg_lock);
 
@@ -3418,6 +3426,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.pvt = true,
 		.multi_chip = true,
 		.tag_protocol = DSA_TAG_PROTO_DSA,
+		.ptp_support = true,
 		.ops = &mv88e6191_ops,
 	},
 
@@ -3438,6 +3447,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.pvt = true,
 		.multi_chip = true,
 		.tag_protocol = DSA_TAG_PROTO_EDSA,
+		.ptp_support = true,
 		.ops = &mv88e6240_ops,
 	},
 
@@ -3458,6 +3468,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.pvt = true,
 		.multi_chip = true,
 		.tag_protocol = DSA_TAG_PROTO_DSA,
+		.ptp_support = true,
 		.ops = &mv88e6290_ops,
 	},
 
@@ -3477,6 +3488,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.pvt = true,
 		.multi_chip = true,
 		.tag_protocol = DSA_TAG_PROTO_EDSA,
+		.ptp_support = true,
 		.ops = &mv88e6320_ops,
 	},
 
@@ -3495,6 +3507,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.atu_move_port_mask = 0xf,
 		.multi_chip = true,
 		.tag_protocol = DSA_TAG_PROTO_EDSA,
+		.ptp_support = true,
 		.ops = &mv88e6321_ops,
 	},
 
@@ -3514,6 +3527,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.pvt = true,
 		.multi_chip = true,
 		.tag_protocol = DSA_TAG_PROTO_EDSA,
+		.ptp_support = true,
 		.ops = &mv88e6341_ops,
 	},
 
@@ -3574,6 +3588,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.pvt = true,
 		.multi_chip = true,
 		.tag_protocol = DSA_TAG_PROTO_EDSA,
+		.ptp_support = true,
 		.ops = &mv88e6352_ops,
 	},
 	[MV88E6390] = {
@@ -3593,6 +3608,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.pvt = true,
 		.multi_chip = true,
 		.tag_protocol = DSA_TAG_PROTO_DSA,
+		.ptp_support = true,
 		.ops = &mv88e6390_ops,
 	},
 	[MV88E6390X] = {
@@ -3612,6 +3628,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.pvt = true,
 		.multi_chip = true,
 		.tag_protocol = DSA_TAG_PROTO_DSA,
+		.ptp_support = true,
 		.ops = &mv88e6390x_ops,
 	},
 };
@@ -3949,6 +3966,9 @@ static void mv88e6xxx_remove(struct mdio_device *mdiodev)
 	struct dsa_switch *ds = dev_get_drvdata(&mdiodev->dev);
 	struct mv88e6xxx_chip *chip = ds->priv;
 
+	if (chip->info->ptp_support)
+		mv88e6xxx_ptp_free(chip);
+
 	mv88e6xxx_phy_destroy(chip);
 	mv88e6xxx_unregister_switch(chip);
 	mv88e6xxx_mdios_unregister(chip);
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index d99d592..648bd50 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -16,6 +16,8 @@
 #include <linux/irq.h>
 #include <linux/gpio/consumer.h>
 #include <linux/phy.h>
+#include <linux/ptp_clock_kernel.h>
+#include <linux/timecounter.h>
 #include <net/dsa.h>
 
 #ifndef UINT64_MAX
@@ -126,6 +128,9 @@ struct mv88e6xxx_info {
 	 */
 	u8 atu_move_port_mask;
 	const struct mv88e6xxx_ops *ops;
+
+	/* Supports PTP */
+	bool ptp_support;
 };
 
 struct mv88e6xxx_atu_entry {
@@ -208,6 +213,17 @@ struct mv88e6xxx_chip {
 	int irq;
 	int device_irq;
 	int watchdog_irq;
+
+	/* This cyclecounter abstracts the switch PTP time.
+	 * reg_lock must be held for any operation that read()s.
+	 */
+	struct cyclecounter	tstamp_cc;
+	struct timecounter	tstamp_tc;
+	struct delayed_work	overflow_work;
+	unsigned long		last_overflow_check;
+
+	struct ptp_clock	*ptp_clock;
+	struct ptp_clock_info	ptp_clock_info;
 };
 
 struct mv88e6xxx_bus_ops {
diff --git a/drivers/net/dsa/mv88e6xxx/ptp.c b/drivers/net/dsa/mv88e6xxx/ptp.c
new file mode 100644
index 0000000..155f8e9
--- /dev/null
+++ b/drivers/net/dsa/mv88e6xxx/ptp.c
@@ -0,0 +1,180 @@
+/*
+ * Marvell 88E6xxx Switch PTP support
+ *
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * Copyright (c) 2017 National Instruments
+ *      Erik Hons <erik.hons@ni.com>
+ *      Brandon Streiff <brandon.streiff@ni.com>
+ *      Dane Wagner <dane.wagner@ni.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include "chip.h"
+#include "global2.h"
+#include "ptp.h"
+
+static int mv88e6xxx_tai_read(struct mv88e6xxx_chip *chip, int addr,
+			      u16 *data, int len)
+{
+	if (!chip->info->ops->avb_ops->tai_read)
+		return -EOPNOTSUPP;
+
+	return chip->info->ops->avb_ops->tai_read(chip, addr, data, len);
+}
+
+static u64 mv88e6xxx_ptp_clock_read(const struct cyclecounter *cc)
+{
+	struct mv88e6xxx_chip *chip =
+		container_of(cc, struct mv88e6xxx_chip, tstamp_cc);
+	int err;
+	u16 phc_time[2];
+
+	err = mv88e6xxx_tai_read(chip, MV88E6XXX_TAI_TIME_LO, phc_time,
+				 ARRAY_SIZE(phc_time));
+	if (err)
+		return 0;
+	else
+		return ((u32)phc_time[1] << 16) | phc_time[0];
+}
+
+static int mv88e6xxx_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
+{
+	if (scaled_ppm == 0)
+		return 0;
+
+	return -EOPNOTSUPP;
+}
+
+static int mv88e6xxx_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct mv88e6xxx_chip *chip =
+		container_of(ptp, struct mv88e6xxx_chip, ptp_clock_info);
+
+	mutex_lock(&chip->reg_lock);
+	timecounter_adjtime(&chip->tstamp_tc, delta);
+	mutex_unlock(&chip->reg_lock);
+
+	return 0;
+}
+
+static int mv88e6xxx_ptp_gettime(struct ptp_clock_info *ptp,
+				 struct timespec64 *ts)
+{
+	struct mv88e6xxx_chip *chip =
+		container_of(ptp, struct mv88e6xxx_chip, ptp_clock_info);
+	u64 ns;
+
+	mutex_lock(&chip->reg_lock);
+	ns = timecounter_read(&chip->tstamp_tc);
+	chip->last_overflow_check = jiffies;
+	mutex_unlock(&chip->reg_lock);
+
+	*ts = ns_to_timespec64(ns);
+
+	return 0;
+}
+
+static int mv88e6xxx_ptp_settime(struct ptp_clock_info *ptp,
+				 const struct timespec64 *ts)
+{
+	struct mv88e6xxx_chip *chip =
+		container_of(ptp, struct mv88e6xxx_chip, ptp_clock_info);
+	u64 ns;
+
+	ns = timespec64_to_ns(ts);
+
+	mutex_lock(&chip->reg_lock);
+	timecounter_init(&chip->tstamp_tc, &chip->tstamp_cc, ns);
+	mutex_unlock(&chip->reg_lock);
+
+	return 0;
+}
+
+static int mv88e6xxx_ptp_enable(struct ptp_clock_info *ptp,
+				struct ptp_clock_request *rq, int on)
+{
+	return -EOPNOTSUPP;
+}
+
+static int mv88e6xxx_ptp_verify(struct ptp_clock_info *ptp, unsigned int pin,
+				enum ptp_pin_function func, unsigned int chan)
+{
+	return -EOPNOTSUPP;
+}
+
+/* The 32-bit timestamp counter overflows every ~34.3 seconds; this task
+ * forces periodic reads so that we don't miss any wraparounds.
+ */
+#define MV88E6XXX_TAI_OVERFLOW_PERIOD (34 * HZ / 2)
+static void mv88e6xxx_ptp_overflow_check(struct work_struct *work)
+{
+	struct delayed_work *dw = to_delayed_work(work);
+	struct mv88e6xxx_chip *chip =
+		container_of(dw, struct mv88e6xxx_chip, overflow_work);
+	bool timeout = time_is_before_jiffies(chip->last_overflow_check +
+					      MV88E6XXX_TAI_OVERFLOW_PERIOD);
+
+	if (timeout) {
+		mutex_lock(&chip->reg_lock);
+		timecounter_read(&chip->tstamp_tc);
+		chip->last_overflow_check = jiffies;
+		mutex_unlock(&chip->reg_lock);
+	}
+
+	schedule_delayed_work(&chip->overflow_work,
+			      MV88E6XXX_TAI_OVERFLOW_PERIOD);
+}
+
+int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip)
+{
+	/* Set up the cycle counter */
+	memset(&chip->tstamp_cc, 0, sizeof(chip->tstamp_cc));
+	chip->tstamp_cc.read	= mv88e6xxx_ptp_clock_read;
+	chip->tstamp_cc.mask	= CYCLECOUNTER_MASK(32);
+	/* Raw timestamps are in units of 8-ns clock periods. */
+	chip->tstamp_cc.mult	= 8;
+	chip->tstamp_cc.shift	= 0;
+
+	timecounter_init(&chip->tstamp_tc, &chip->tstamp_cc,
+			 ktime_to_ns(ktime_get_real()));
+
+	chip->last_overflow_check = jiffies;
+
+	INIT_DELAYED_WORK(&chip->overflow_work, mv88e6xxx_ptp_overflow_check);
+
+	chip->ptp_clock_info.owner = THIS_MODULE;
+	snprintf(chip->ptp_clock_info.name, sizeof(chip->ptp_clock_info.name),
+		 dev_name(chip->dev));
+	chip->ptp_clock_info.max_adj	= 0;
+
+	chip->ptp_clock_info.adjfine	= mv88e6xxx_ptp_adjfine;
+	chip->ptp_clock_info.adjtime	= mv88e6xxx_ptp_adjtime;
+	chip->ptp_clock_info.gettime64	= mv88e6xxx_ptp_gettime;
+	chip->ptp_clock_info.settime64	= mv88e6xxx_ptp_settime;
+	chip->ptp_clock_info.enable	= mv88e6xxx_ptp_enable;
+	chip->ptp_clock_info.verify	= mv88e6xxx_ptp_verify;
+
+	chip->ptp_clock = ptp_clock_register(&chip->ptp_clock_info, chip->dev);
+	if (IS_ERR(chip->ptp_clock))
+		return PTR_ERR(chip->ptp_clock);
+
+	schedule_delayed_work(&chip->overflow_work,
+			      MV88E6XXX_TAI_OVERFLOW_PERIOD);
+
+	return 0;
+}
+
+void mv88e6xxx_ptp_free(struct mv88e6xxx_chip *chip)
+{
+	if (chip->ptp_clock) {
+		cancel_delayed_work_sync(&chip->overflow_work);
+
+		ptp_clock_unregister(chip->ptp_clock);
+		chip->ptp_clock = NULL;
+	}
+}
diff --git a/drivers/net/dsa/mv88e6xxx/ptp.h b/drivers/net/dsa/mv88e6xxx/ptp.h
new file mode 100644
index 0000000..c662953
--- /dev/null
+++ b/drivers/net/dsa/mv88e6xxx/ptp.h
@@ -0,0 +1,83 @@
+/*
+ * Marvell 88E6xxx Switch PTP support
+ *
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * Copyright (c) 2017 National Instruments
+ *      Erik Hons <erik.hons@ni.com>
+ *      Brandon Streiff <brandon.streiff@ni.com>
+ *      Dane Wagner <dane.wagner@ni.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _MV88E6XXX_PTP_H
+#define _MV88E6XXX_PTP_H
+
+#include "chip.h"
+
+/* Offset 0x00: TAI Global Config */
+#define MV88E6XXX_TAI_CFG			0x00
+
+/* Offset 0x01: Timestamp Clock Period (ps) */
+#define MV88E6XXX_TAI_CLOCK_PERIOD		0x01
+
+/* Offset 0x02/0x03: Trigger Generation Amount */
+#define MV88E6XXX_TAI_TRIG_GEN_AMOUNT_LO	0x02
+#define MV88E6XXX_TAI_TRIG_GEN_AMOUNT_HI	0x03
+
+/* Offset 0x04: Clock Compensation */
+#define MV88E6XXX_TAI_TRIG_CLOCK_COMP		0x04
+
+/* Offset 0x05: Trigger Configuration */
+#define MV88E6XXX_TAI_TRIG_CFG			0x05
+
+/* Offset 0x06: Ingress Rate Limiter Clock Generation Amount */
+#define MV88E6XXX_TAI_IRL_AMOUNT		0x06
+
+/* Offset 0x07: Ingress Rate Limiter Compensation */
+#define MV88E6XXX_TAI_IRL_COMP			0x07
+
+/* Offset 0x08: Ingress Rate Limiter Compensation */
+#define MV88E6XXX_TAI_IRL_COMP_PS		0x08
+
+/* Offset 0x09: Event Status */
+#define MV88E6XXX_TAI_EVENT_STATUS		0x09
+
+/* Offset 0x0A/0x0B: Event Time */
+#define MV88E6XXX_TAI_EVENT_TIME_LO		0x0a
+#define MV88E6XXX_TAI_EVENT_TYPE_HI		0x0b
+
+/* Offset 0x0E/0x0F: PTP Global Time */
+#define MV88E6XXX_TAI_TIME_LO			0x0e
+#define MV88E6XXX_TAI_TIME_HI			0x0f
+
+/* Offset 0x10/0x11: Trig Generation Time */
+#define MV88E6XXX_TAI_TRIG_TIME_LO		0x10
+#define MV88E6XXX_TAI_TRIG_TIME_HI		0x11
+
+/* Offset 0x12: Lock Status */
+#define MV88E6XXX_TAI_LOCK_STATUS		0x12
+
+#ifdef CONFIG_NET_DSA_MV88E6XXX_PTP
+
+int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip);
+void mv88e6xxx_ptp_free(struct mv88e6xxx_chip *chip);
+
+#else /* !CONFIG_NET_DSA_MV88E6XXX_PTP */
+
+static inline int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip)
+{
+	return -EOPNOTSUPP;
+}
+
+static void mv88e6xxx_ptp_free(struct mv88e6xxx_chip *chip)
+{
+}
+
+#endif /* CONFIG_NET_DSA_MV88E6XXX_PTP */
+
+#endif /* _MV88E6XXX_PTP_H */
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next RFC 3/9] net: dsa: mv88e6xxx: add support for GPIO configuration
From: Brandon Streiff @ 2017-09-28 15:25 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, David S. Miller, Florian Fainelli, Andrew Lunn,
	Vivien Didelot, Richard Cochran, Erik Hons, Brandon Streiff
In-Reply-To: <1506612341-18061-1-git-send-email-brandon.streiff@ni.com>

The Scratch/Misc register is a windowed interface that provides access
to the GPIO configuration. Provide a new method for configuration of
GPIO functions.

Signed-off-by: Brandon Streiff <brandon.streiff@ni.com>
---
 drivers/net/dsa/mv88e6xxx/chip.c    | 13 +++++++
 drivers/net/dsa/mv88e6xxx/chip.h    |  8 +++++
 drivers/net/dsa/mv88e6xxx/global2.c | 71 +++++++++++++++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/global2.h | 32 +++++++++++++++++
 4 files changed, 124 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 2ed37d8..4a37b26 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3218,6 +3218,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6341",
 		.num_databases = 4096,
 		.num_ports = 6,
+		.num_gpio = 11,
 		.max_vid = 4095,
 		.port_base_addr = 0x10,
 		.global1_addr = 0x1b,
@@ -3297,6 +3298,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6172",
 		.num_databases = 4096,
 		.num_ports = 7,
+		.num_gpio = 15,
 		.max_vid = 4095,
 		.port_base_addr = 0x10,
 		.global1_addr = 0x1b,
@@ -3337,6 +3339,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6176",
 		.num_databases = 4096,
 		.num_ports = 7,
+		.num_gpio = 15,
 		.max_vid = 4095,
 		.port_base_addr = 0x10,
 		.global1_addr = 0x1b,
@@ -3375,6 +3378,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6190",
 		.num_databases = 4096,
 		.num_ports = 11,	/* 10 + Z80 */
+		.num_gpio = 16,
 		.max_vid = 8191,
 		.port_base_addr = 0x0,
 		.global1_addr = 0x1b,
@@ -3395,6 +3399,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6190X",
 		.num_databases = 4096,
 		.num_ports = 11,	/* 10 + Z80 */
+		.num_gpio = 16,
 		.max_vid = 8191,
 		.port_base_addr = 0x0,
 		.global1_addr = 0x1b,
@@ -3436,6 +3441,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6240",
 		.num_databases = 4096,
 		.num_ports = 7,
+		.num_gpio = 15,
 		.max_vid = 4095,
 		.port_base_addr = 0x10,
 		.global1_addr = 0x1b,
@@ -3457,6 +3463,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6290",
 		.num_databases = 4096,
 		.num_ports = 11,	/* 10 + Z80 */
+		.num_gpio = 16,
 		.max_vid = 8191,
 		.port_base_addr = 0x0,
 		.global1_addr = 0x1b,
@@ -3478,6 +3485,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6320",
 		.num_databases = 4096,
 		.num_ports = 7,
+		.num_gpio = 15,
 		.max_vid = 4095,
 		.port_base_addr = 0x10,
 		.global1_addr = 0x1b,
@@ -3498,6 +3506,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6321",
 		.num_databases = 4096,
 		.num_ports = 7,
+		.num_gpio = 15,
 		.max_vid = 4095,
 		.port_base_addr = 0x10,
 		.global1_addr = 0x1b,
@@ -3517,6 +3526,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6341",
 		.num_databases = 4096,
 		.num_ports = 6,
+		.num_gpio = 11,
 		.max_vid = 4095,
 		.port_base_addr = 0x10,
 		.global1_addr = 0x1b,
@@ -3577,6 +3587,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6352",
 		.num_databases = 4096,
 		.num_ports = 7,
+		.num_gpio = 15,
 		.max_vid = 4095,
 		.port_base_addr = 0x10,
 		.global1_addr = 0x1b,
@@ -3597,6 +3608,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6390",
 		.num_databases = 4096,
 		.num_ports = 11,	/* 10 + Z80 */
+		.num_gpio = 16,
 		.max_vid = 8191,
 		.port_base_addr = 0x0,
 		.global1_addr = 0x1b,
@@ -3617,6 +3629,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.name = "Marvell 88E6390X",
 		.num_databases = 4096,
 		.num_ports = 11,	/* 10 + Z80 */
+		.num_gpio = 16,
 		.max_vid = 8191,
 		.port_base_addr = 0x0,
 		.global1_addr = 0x1b,
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 648bd50..5f132e2 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -41,6 +41,8 @@
 #define MV88E6XXX_MAX_PVT_SWITCHES	32
 #define MV88E6XXX_MAX_PVT_PORTS		16
 
+#define MV88E6XXX_MAX_GPIO	16
+
 enum mv88e6xxx_egress_mode {
 	MV88E6XXX_EGRESS_MODE_UNMODIFIED,
 	MV88E6XXX_EGRESS_MODE_UNTAGGED,
@@ -107,6 +109,7 @@ struct mv88e6xxx_info {
 	const char *name;
 	unsigned int num_databases;
 	unsigned int num_ports;
+	unsigned int num_gpio;
 	unsigned int max_vid;
 	unsigned int port_base_addr;
 	unsigned int global1_addr;
@@ -417,6 +420,11 @@ static inline u16 mv88e6xxx_port_mask(struct mv88e6xxx_chip *chip)
 	return GENMASK(mv88e6xxx_num_ports(chip) - 1, 0);
 }
 
+static inline unsigned int mv88e6xxx_num_gpio(struct mv88e6xxx_chip *chip)
+{
+	return chip->info->num_gpio;
+}
+
 int mv88e6xxx_read(struct mv88e6xxx_chip *chip, int addr, int reg, u16 *val);
 int mv88e6xxx_write(struct mv88e6xxx_chip *chip, int addr, int reg, u16 val);
 int mv88e6xxx_update(struct mv88e6xxx_chip *chip, int addr, int reg,
diff --git a/drivers/net/dsa/mv88e6xxx/global2.c b/drivers/net/dsa/mv88e6xxx/global2.c
index b6d0c71..fe2c970 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.c
+++ b/drivers/net/dsa/mv88e6xxx/global2.c
@@ -971,6 +971,77 @@ int mv88e6xxx_g2_smi_phy_write(struct mv88e6xxx_chip *chip, struct mii_bus *bus,
 						   val);
 }
 
+/* Offset 0x1A: Scratch and Misc. Register */
+static int mv88e6xxx_g2_scratch_reg_read(struct mv88e6xxx_chip *chip,
+					 int reg, u8 *data)
+{
+	int err;
+	u16 value;
+
+	err = mv88e6xxx_g2_write(chip, MV88E6XXX_G2_SCRATCH_MISC_MISC,
+				 reg << 8);
+	if (err)
+		return err;
+
+	err = mv88e6xxx_g2_read(chip, MV88E6XXX_G2_SCRATCH_MISC_MISC, &value);
+	if (err)
+		return err;
+
+	*data = (value & MV88E6XXX_G2_SCRATCH_MISC_DATA_MASK);
+
+	return 0;
+}
+
+static int mv88e6xxx_g2_scratch_reg_write(struct mv88e6xxx_chip *chip,
+					  int reg, u8 data)
+{
+	u16 value = (reg << 8) | data;
+
+	return mv88e6xxx_g2_update(chip, MV88E6XXX_G2_SCRATCH_MISC_MISC, value);
+}
+
+/* Configures the specified pin for the specified function. This function
+ * does not unset other pins configured for the same function. If multiple
+ * pins are configured for the same function, the lower-index pin gets
+ * that function and the higher-index pin goes back to being GPIO.
+ */
+int mv88e6xxx_g2_set_gpio_config(struct mv88e6xxx_chip *chip, int pin,
+				 int func, int dir)
+{
+	int mode_reg = MV88E6XXX_G2_SCRATCH_GPIO_MODE(pin);
+	int dir_reg = MV88E6XXX_G2_SCRATCH_GPIO_DIR(pin);
+	int err;
+	u8 val;
+
+	if (pin < 0 || pin >= mv88e6xxx_num_gpio(chip))
+		return -ERANGE;
+
+	/* Set function first */
+	err = mv88e6xxx_g2_scratch_reg_read(chip, mode_reg, &val);
+	if (err)
+		return err;
+
+	/* Zero bits in the field for this GPIO and OR in new config */
+	val &= ~MV88E6XXX_G2_SCRATCH_GPIO_MODE_MASK(pin);
+	val |= (func << MV88E6XXX_G2_SCRATCH_GPIO_MODE_OFFSET(pin));
+
+	err = mv88e6xxx_g2_scratch_reg_write(chip, mode_reg, val);
+	if (err)
+		return err;
+
+	/* Set direction */
+	err = mv88e6xxx_g2_scratch_reg_read(chip, dir_reg, &val);
+	if (err)
+		return err;
+
+	/* Zero bits in the field for this GPIO and OR in new config */
+	val &= ~MV88E6XXX_G2_SCRATCH_GPIO_DIR_MASK(pin);
+	val |= (dir << MV88E6XXX_G2_SCRATCH_GPIO_DIR_OFFSET(pin));
+
+	return mv88e6xxx_g2_scratch_reg_write(chip, dir_reg, val);
+}
+
+/* Offset 0x1B: Watchdog Control */
 static int mv88e6097_watchdog_action(struct mv88e6xxx_chip *chip, int irq)
 {
 	u16 reg;
diff --git a/drivers/net/dsa/mv88e6xxx/global2.h b/drivers/net/dsa/mv88e6xxx/global2.h
index 642d2b0..f192fc6 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.h
+++ b/drivers/net/dsa/mv88e6xxx/global2.h
@@ -242,6 +242,29 @@
 #define MV88E6352_G2_NOEGR_POLICY	0x2000
 #define MV88E6390_G2_LAG_ID_4		0x2000
 
+/* Scratch/Misc registers accessed through MV88E6XXX_G2_SCRATCH_MISC */
+#define MV88E6XXX_G2_SCRATCH_GPIO_CONFIG_LO	0x60
+#define MV88E6XXX_G2_SCRATCH_GPIO_CONFIG_HI	0x61
+#define MV88E6XXX_G2_SCRATCH_GPIO_DIR(pin)	(0x62 + ((pin) / 8))
+#define MV88E6XXX_G2_SCRATCH_GPIO_DIR_OFFSET(pin) \
+			((pin) & 0x7)
+#define MV88E6XXX_G2_SCRATCH_GPIO_DIR_MASK(pin)	\
+			(1 << MV88E6XXX_G2_SCRATCH_GPIO_DIR_OFFSET(pin))
+#define MV88E6XXX_G2_SCRATCH_GPIO_DIR_OUT	0
+#define MV88E6XXX_G2_SCRATCH_GPIO_DIR_IN	1
+#define MV88E6XXX_G2_SCRATCH_GPIO_DATA(pin)	(0x64 + ((pin) / 8))
+#define MV88E6XXX_G2_SCRATCH_GPIO_MODE(pin)	(0x68 + ((pin) / 2))
+#define MV88E6XXX_G2_SCRATCH_GPIO_MODE_OFFSET(pin) \
+			((pin) & 0x1 ? 4 : 0)
+#define MV88E6XXX_G2_SCRATCH_GPIO_MODE_MASK(pin) \
+			(0x7 << MV88E6XXX_G2_SCRATCH_GPIO_MODE_OFFSET(pin))
+#define MV88E6XXX_G2_SCRATCH_GPIO_MODE_GPIO	0
+#define MV88E6XXX_G2_SCRATCH_GPIO_MODE_TRIG	1
+#define MV88E6XXX_G2_SCRATCH_GPIO_MODE_EVREQ	2
+#define MV88E6XXX_G2_SCRATCH_GPIO_MODE_EXTCLK	3
+#define MV88E6XXX_G2_SCRATCH_GPIO_MODE_RXCLK0	4
+#define MV88E6XXX_G2_SCRATCH_GPIO_MODE_RXCLK1	5
+
 #ifdef CONFIG_NET_DSA_MV88E6XXX_GLOBAL2
 
 static inline int mv88e6xxx_g2_require(struct mv88e6xxx_chip *chip)
@@ -270,6 +293,9 @@ int mv88e6xxx_g2_get_eeprom16(struct mv88e6xxx_chip *chip,
 int mv88e6xxx_g2_set_eeprom16(struct mv88e6xxx_chip *chip,
 			      struct ethtool_eeprom *eeprom, u8 *data);
 
+int mv88e6xxx_g2_set_gpio_config(struct mv88e6xxx_chip *chip, int pin,
+				 int func, int dir);
+
 int mv88e6xxx_g2_pvt_write(struct mv88e6xxx_chip *chip, int src_dev,
 			   int src_port, u16 data);
 int mv88e6xxx_g2_misc_4_bit_port(struct mv88e6xxx_chip *chip);
@@ -361,6 +387,12 @@ static inline int mv88e6xxx_g2_set_eeprom16(struct mv88e6xxx_chip *chip,
 	return -EOPNOTSUPP;
 }
 
+static inline int mv88e6xxx_g2_set_gpio_config(struct mv88e6xxx_chip *chip,
+					       int pin, int func, int dir)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int mv88e6xxx_g2_pvt_write(struct mv88e6xxx_chip *chip,
 					 int src_dev, int src_port, u16 data)
 {
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next RFC 4/9] net: dsa: mv88e6xxx: add support for event capture
From: Brandon Streiff @ 2017-09-28 15:25 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, David S. Miller, Florian Fainelli, Andrew Lunn,
	Vivien Didelot, Richard Cochran, Erik Hons, Brandon Streiff
In-Reply-To: <1506612341-18061-1-git-send-email-brandon.streiff@ni.com>

This patch adds support for configuring mv88e6xxx GPIO lines as PTP
pins, so that they may be used for time stamping external events or for
periodic output.

Signed-off-by: Brandon Streiff <brandon.streiff@ni.com>
---
 drivers/net/dsa/mv88e6xxx/chip.h |   4 +
 drivers/net/dsa/mv88e6xxx/ptp.c  | 317 ++++++++++++++++++++++++++++++++++++++-
 drivers/net/dsa/mv88e6xxx/ptp.h  |  16 ++
 3 files changed, 335 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 5f132e2..b763778 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -227,6 +227,10 @@ struct mv88e6xxx_chip {
 
 	struct ptp_clock	*ptp_clock;
 	struct ptp_clock_info	ptp_clock_info;
+	struct delayed_work	tai_event_work;
+	struct ptp_pin_desc	pin_config[MV88E6XXX_MAX_GPIO];
+	u16 trig_config;
+	u16 evcap_config;
 };
 
 struct mv88e6xxx_bus_ops {
diff --git a/drivers/net/dsa/mv88e6xxx/ptp.c b/drivers/net/dsa/mv88e6xxx/ptp.c
index 155f8e9..1422d85 100644
--- a/drivers/net/dsa/mv88e6xxx/ptp.c
+++ b/drivers/net/dsa/mv88e6xxx/ptp.c
@@ -18,6 +18,8 @@
 #include "global2.h"
 #include "ptp.h"
 
+#define TAI_EVENT_WORK_INTERVAL msecs_to_jiffies(100)
+
 static int mv88e6xxx_tai_read(struct mv88e6xxx_chip *chip, int addr,
 			      u16 *data, int len)
 {
@@ -27,6 +29,14 @@ static int mv88e6xxx_tai_read(struct mv88e6xxx_chip *chip, int addr,
 	return chip->info->ops->avb_ops->tai_read(chip, addr, data, len);
 }
 
+static int mv88e6xxx_tai_write(struct mv88e6xxx_chip *chip, int addr, u16 data)
+{
+	if (!chip->info->ops->avb_ops->tai_write)
+		return -EOPNOTSUPP;
+
+	return chip->info->ops->avb_ops->tai_write(chip, addr, data);
+}
+
 static u64 mv88e6xxx_ptp_clock_read(const struct cyclecounter *cc)
 {
 	struct mv88e6xxx_chip *chip =
@@ -42,6 +52,144 @@ static u64 mv88e6xxx_ptp_clock_read(const struct cyclecounter *cc)
 		return ((u32)phc_time[1] << 16) | phc_time[0];
 }
 
+static int mv88e6xxx_disable_trig(struct mv88e6xxx_chip *chip)
+{
+	int err;
+	u16 global_config;
+
+	chip->trig_config = 0;
+	global_config = (chip->evcap_config | chip->trig_config);
+	err = mv88e6xxx_tai_write(chip, MV88E6XXX_TAI_CFG, global_config);
+
+	return err;
+}
+
+static int mv88e6xxx_config_periodic_trig(struct mv88e6xxx_chip *chip,
+					  u32 ns, u16 picos)
+{
+	int err;
+	u16 global_config;
+
+	if (picos >= 1000)
+		return -ERANGE;
+
+	/* TRIG generation is in units of 8 ns clock periods. Convert ns
+	 * and ps into 8 ns clock periods and up to 8000 additional ps
+	 */
+	picos += (ns & 0x7) * 1000;
+	ns = ns >> 3;
+
+	err = mv88e6xxx_tai_write(chip, MV88E6XXX_TAI_TRIG_GEN_AMOUNT_LO,
+				  ns & 0xffff);
+	if (err)
+		return err;
+
+	err = mv88e6xxx_tai_write(chip, MV88E6XXX_TAI_TRIG_GEN_AMOUNT_HI,
+				  ns >> 16);
+	if (err)
+		return err;
+
+	err = mv88e6xxx_tai_write(chip, MV88E6XXX_TAI_TRIG_CLOCK_COMP,
+				  picos);
+	if (err)
+		return err;
+
+	chip->trig_config = MV88E6XXX_TAI_CFG_TRIG_ENABLE;
+	global_config = (chip->evcap_config | chip->trig_config);
+	err = mv88e6xxx_tai_write(chip, MV88E6XXX_TAI_CFG, global_config);
+
+	return err;
+}
+
+/* mv88e6xxx_config_eventcap - configure TAI event capture
+ * @event: PTP_CLOCK_PPS (internal) or PTP_CLOCK_EXTTS (external)
+ * @rising: zero for falling-edge trigger, else rising-edge trigger
+ *
+ * This will also reset the capture sequence counter.
+ */
+static int mv88e6xxx_config_eventcap(struct mv88e6xxx_chip *chip, int event,
+				     int rising)
+{
+	u16 global_config;
+	u16 cap_config;
+	int err;
+
+	chip->evcap_config = MV88E6XXX_TAI_CFG_CAP_OVERWRITE |
+			     MV88E6XXX_TAI_CFG_CAP_CTR_START;
+	if (!rising)
+		chip->evcap_config |= MV88E6XXX_TAI_CFG_EVREQ_FALLING;
+
+	global_config = (chip->evcap_config | chip->trig_config);
+	err = mv88e6xxx_tai_write(chip, MV88E6XXX_TAI_CFG, global_config);
+	if (err)
+		return err;
+
+	if (event == PTP_CLOCK_PPS) {
+		cap_config = MV88E6XXX_TAI_EVENT_STATUS_CAP_TRIG;
+	} else if (event == PTP_CLOCK_EXTTS) {
+		/* if STATUS_CAP_TRIG is unset we capture PTP_EVREQ events */
+		cap_config = 0;
+	} else {
+		return -EINVAL;
+	}
+
+	/* Write the capture config; this also clears the capture counter */
+	err = mv88e6xxx_tai_write(chip, MV88E6XXX_TAI_EVENT_STATUS,
+				  cap_config);
+
+	return err;
+}
+
+static void mv88e6xxx_tai_event_work(struct work_struct *ugly)
+{
+	struct delayed_work *dw = to_delayed_work(ugly);
+	struct mv88e6xxx_chip *chip =
+		container_of(dw, struct mv88e6xxx_chip, tai_event_work);
+	u16 ev_status[4];
+	int err;
+
+	mutex_lock(&chip->reg_lock);
+
+	err = mv88e6xxx_tai_read(chip, MV88E6XXX_TAI_EVENT_STATUS,
+				 ev_status, ARRAY_SIZE(ev_status));
+	if (err) {
+		mutex_unlock(&chip->reg_lock);
+		return;
+	}
+
+	if (ev_status[0] & MV88E6XXX_TAI_EVENT_STATUS_ERROR)
+		dev_warn(chip->dev, "missed event capture\n");
+
+	if (ev_status[0] & MV88E6XXX_TAI_EVENT_STATUS_VALID) {
+		struct ptp_clock_event ev;
+		u32 raw_ts = ((u32)ev_status[2] << 16) | ev_status[1];
+
+		/* Clear the valid bit so the next timestamp can come in */
+		ev_status[0] &= ~MV88E6XXX_TAI_EVENT_STATUS_VALID;
+		err = mv88e6xxx_tai_write(chip, MV88E6XXX_TAI_EVENT_STATUS,
+					  ev_status[0]);
+
+		if (ev_status[0] & MV88E6XXX_TAI_EVENT_STATUS_CAP_TRIG) {
+			/* TAI is configured to timestamp internal events.
+			 * This will be a PPS event.
+			 */
+			ev.type = PTP_CLOCK_PPS;
+		} else {
+			/* Otherwise this is an external timestamp */
+			ev.type = PTP_CLOCK_EXTTS;
+		}
+		/* We only have one timestamping channel. */
+		ev.index = 0;
+		ev.timestamp = timecounter_cyc2time(&chip->tstamp_tc, raw_ts);
+
+		ptp_clock_event(chip->ptp_clock, &ev);
+	}
+
+	mutex_unlock(&chip->reg_lock);
+
+	schedule_delayed_work(&chip->tai_event_work, TAI_EVENT_WORK_INTERVAL);
+}
+
 static int mv88e6xxx_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
 {
 	if (scaled_ppm == 0)
@@ -95,16 +243,163 @@ static int mv88e6xxx_ptp_settime(struct ptp_clock_info *ptp,
 	return 0;
 }
 
+static int mv88e6xxx_ptp_enable_extts(struct mv88e6xxx_chip *chip,
+				      struct ptp_clock_request *rq, int on)
+{
+	int rising = (rq->extts.flags & PTP_RISING_EDGE);
+	int pin;
+	int err;
+
+	pin = ptp_find_pin(chip->ptp_clock, PTP_PF_EXTTS, rq->extts.index);
+
+	if (pin < 0)
+		return -EBUSY;
+
+	mutex_lock(&chip->reg_lock);
+
+	if (on) {
+		err = mv88e6xxx_g2_set_gpio_config(
+			chip, pin, MV88E6XXX_G2_SCRATCH_GPIO_MODE_EVREQ,
+			MV88E6XXX_G2_SCRATCH_GPIO_DIR_IN);
+		if (err)
+			goto out;
+
+		schedule_delayed_work(&chip->tai_event_work,
+				      TAI_EVENT_WORK_INTERVAL);
+
+		err = mv88e6xxx_config_eventcap(chip, PTP_CLOCK_EXTTS,
+						rising);
+	} else {
+		err = mv88e6xxx_g2_set_gpio_config(
+			chip, pin, MV88E6XXX_G2_SCRATCH_GPIO_MODE_GPIO,
+			MV88E6XXX_G2_SCRATCH_GPIO_DIR_IN);
+
+		cancel_delayed_work_sync(&chip->tai_event_work);
+	}
+
+out:
+	mutex_unlock(&chip->reg_lock);
+
+	return err;
+}
+
+static int mv88e6xxx_ptp_enable_perout(struct mv88e6xxx_chip *chip,
+				       struct ptp_clock_request *rq, int on)
+{
+	struct timespec ts;
+	u64 ns;
+	int pin;
+	int err;
+
+	pin = ptp_find_pin(chip->ptp_clock, PTP_PF_PEROUT, rq->extts.index);
+
+	if (pin < 0)
+		return -EBUSY;
+
+	ts.tv_sec = rq->perout.period.sec;
+	ts.tv_nsec = rq->perout.period.nsec;
+	ns = timespec_to_ns(&ts);
+
+	if (ns > U32_MAX)
+		return -ERANGE;
+
+	mutex_lock(&chip->reg_lock);
+
+	err = mv88e6xxx_config_periodic_trig(chip, (u32)ns, 0);
+	if (err)
+		goto out;
+
+	if (on) {
+		err = mv88e6xxx_g2_set_gpio_config(
+			chip, pin, MV88E6XXX_G2_SCRATCH_GPIO_MODE_TRIG,
+			MV88E6XXX_G2_SCRATCH_GPIO_DIR_OUT);
+	} else {
+		err = mv88e6xxx_g2_set_gpio_config(
+			chip, pin, MV88E6XXX_G2_SCRATCH_GPIO_MODE_GPIO,
+			MV88E6XXX_G2_SCRATCH_GPIO_DIR_IN);
+	}
+
+out:
+	mutex_unlock(&chip->reg_lock);
+
+	return err;
+}
+
+static int mv88e6xxx_ptp_enable_pps(struct mv88e6xxx_chip *chip,
+				    struct ptp_clock_request *rq, int on)
+{
+	int pin;
+	int err;
+
+	pin = ptp_find_pin(chip->ptp_clock, PTP_PF_PEROUT, rq->extts.index);
+
+	if (pin < 0)
+		return -EBUSY;
+
+	mutex_lock(&chip->reg_lock);
+
+	if (on) {
+		err = mv88e6xxx_g2_set_gpio_config(
+			chip, pin, MV88E6XXX_G2_SCRATCH_GPIO_MODE_TRIG,
+			MV88E6XXX_G2_SCRATCH_GPIO_DIR_OUT);
+		if (err)
+			goto out;
+		err = mv88e6xxx_config_periodic_trig(chip,
+						     NSEC_PER_SEC, 0);
+		if (err)
+			goto out;
+
+		schedule_delayed_work(&chip->tai_event_work, 0);
+
+		err = mv88e6xxx_config_eventcap(chip, PTP_CLOCK_PPS, 1);
+	} else {
+		err = mv88e6xxx_g2_set_gpio_config(
+			chip, pin, MV88E6XXX_G2_SCRATCH_GPIO_MODE_GPIO,
+			MV88E6XXX_G2_SCRATCH_GPIO_DIR_IN);
+		if (err)
+			goto out;
+
+		err = mv88e6xxx_disable_trig(chip);
+
+		cancel_delayed_work_sync(&chip->tai_event_work);
+	}
+
+out:
+	mutex_unlock(&chip->reg_lock);
+
+	return err;
+}
+
 static int mv88e6xxx_ptp_enable(struct ptp_clock_info *ptp,
 				struct ptp_clock_request *rq, int on)
 {
-	return -EOPNOTSUPP;
+	struct mv88e6xxx_chip *chip =
+		container_of(ptp, struct mv88e6xxx_chip, ptp_clock_info);
+
+	switch (rq->type) {
+	case PTP_CLK_REQ_EXTTS:
+		return mv88e6xxx_ptp_enable_extts(chip, rq, on);
+	case PTP_CLK_REQ_PEROUT:
+		return mv88e6xxx_ptp_enable_perout(chip, rq, on);
+	case PTP_CLK_REQ_PPS:
+		return mv88e6xxx_ptp_enable_pps(chip, rq, on);
+	default:
+		return -EOPNOTSUPP;
+	}
 }
 
 static int mv88e6xxx_ptp_verify(struct ptp_clock_info *ptp, unsigned int pin,
 				enum ptp_pin_function func, unsigned int chan)
 {
-	return -EOPNOTSUPP;
+	switch (func) {
+	case PTP_PF_NONE:
+	case PTP_PF_EXTTS:
+	case PTP_PF_PEROUT:
+		break;
+	case PTP_PF_PHYSYNC:
+		return -EOPNOTSUPP;
+	}
+	return 0;
 }
 
 /* The 32-bit timestamp counter overflows every ~34.3 seconds; this task
@@ -132,6 +427,8 @@ static void mv88e6xxx_ptp_overflow_check(struct work_struct *work)
 
 int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip)
 {
+	int i;
+
 	/* Set up the cycle counter */
 	memset(&chip->tstamp_cc, 0, sizeof(chip->tstamp_cc));
 	chip->tstamp_cc.read	= mv88e6xxx_ptp_clock_read;
@@ -146,12 +443,27 @@ int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip)
 	chip->last_overflow_check = jiffies;
 
 	INIT_DELAYED_WORK(&chip->overflow_work, mv88e6xxx_ptp_overflow_check);
+	INIT_DELAYED_WORK(&chip->tai_event_work, mv88e6xxx_tai_event_work);
 
 	chip->ptp_clock_info.owner = THIS_MODULE;
 	snprintf(chip->ptp_clock_info.name, sizeof(chip->ptp_clock_info.name),
 		 dev_name(chip->dev));
 	chip->ptp_clock_info.max_adj	= 0;
 
+	chip->ptp_clock_info.n_ext_ts	= 1;
+	chip->ptp_clock_info.n_per_out	= 1;
+	chip->ptp_clock_info.n_pins	= mv88e6xxx_num_gpio(chip);
+	chip->ptp_clock_info.pps	= 1;
+
+	for (i = 0; i < chip->ptp_clock_info.n_pins; ++i) {
+		struct ptp_pin_desc *ppd = &chip->pin_config[i];
+
+		snprintf(ppd->name, sizeof(ppd->name), "mv88e6xxx_gpio%d", i);
+		ppd->index = i;
+		ppd->func = PTP_PF_NONE;
+	}
+	chip->ptp_clock_info.pin_config = chip->pin_config;
+
 	chip->ptp_clock_info.adjfine	= mv88e6xxx_ptp_adjfine;
 	chip->ptp_clock_info.adjtime	= mv88e6xxx_ptp_adjtime;
 	chip->ptp_clock_info.gettime64	= mv88e6xxx_ptp_gettime;
@@ -173,6 +485,7 @@ void mv88e6xxx_ptp_free(struct mv88e6xxx_chip *chip)
 {
 	if (chip->ptp_clock) {
 		cancel_delayed_work_sync(&chip->overflow_work);
+		cancel_delayed_work_sync(&chip->tai_event_work);
 
 		ptp_clock_unregister(chip->ptp_clock);
 		chip->ptp_clock = NULL;
diff --git a/drivers/net/dsa/mv88e6xxx/ptp.h b/drivers/net/dsa/mv88e6xxx/ptp.h
index c662953..33b1842 100644
--- a/drivers/net/dsa/mv88e6xxx/ptp.h
+++ b/drivers/net/dsa/mv88e6xxx/ptp.h
@@ -21,6 +21,18 @@
 
 /* Offset 0x00: TAI Global Config */
 #define MV88E6XXX_TAI_CFG			0x00
+#define MV88E6XXX_TAI_CFG_CAP_OVERWRITE		0x8000
+#define MV88E6XXX_TAI_CFG_CAP_CTR_START		0x4000
+#define MV88E6XXX_TAI_CFG_EVREQ_FALLING		0x2000
+#define MV88E6XXX_TAI_CFG_TRIG_ACTIVE_LO	0x1000
+#define MV88E6XXX_TAI_CFG_IRL_ENABLE		0x0400
+#define MV88E6XXX_TAI_CFG_TRIG_IRQ_EN		0x0200
+#define MV88E6XXX_TAI_CFG_EVREQ_IRQ_EN		0x0100
+#define MV88E6XXX_TAI_CFG_TRIG_LOCK		0x0080
+#define MV88E6XXX_TAI_CFG_BLOCK_UPDATE		0x0008
+#define MV88E6XXX_TAI_CFG_MULTI_PTP		0x0004
+#define MV88E6XXX_TAI_CFG_TRIG_MODE_ONESHOT	0x0002
+#define MV88E6XXX_TAI_CFG_TRIG_ENABLE		0x0001
 
 /* Offset 0x01: Timestamp Clock Period (ps) */
 #define MV88E6XXX_TAI_CLOCK_PERIOD		0x01
@@ -46,6 +58,10 @@
 
 /* Offset 0x09: Event Status */
 #define MV88E6XXX_TAI_EVENT_STATUS		0x09
+#define MV88E6XXX_TAI_EVENT_STATUS_CAP_TRIG	0x4000
+#define MV88E6XXX_TAI_EVENT_STATUS_ERROR	0x0200
+#define MV88E6XXX_TAI_EVENT_STATUS_VALID	0x0100
+#define MV88E6XXX_TAI_EVENT_STATUS_CTR_MASK	0x00ff
 
 /* Offset 0x0A/0x0B: Event Time */
 #define MV88E6XXX_TAI_EVENT_TIME_LO		0x0a
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next RFC 5/9] net: dsa: forward hardware timestamping ioctls to switch driver
From: Brandon Streiff @ 2017-09-28 15:25 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, David S. Miller, Florian Fainelli, Andrew Lunn,
	Vivien Didelot, Richard Cochran, Erik Hons, Brandon Streiff
In-Reply-To: <1506612341-18061-1-git-send-email-brandon.streiff@ni.com>

This patch adds support to the dsa slave network device so that
switch drivers can implement the SIOC[GS]HWTSTAMP ioctls and the
ethtool timestamp-info interface.

Signed-off-by: Brandon Streiff <brandon.streiff@ni.com>
---
 include/net/dsa.h | 15 +++++++++++++++
 net/dsa/slave.c   | 39 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 8dee216..1163af1 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -19,6 +19,7 @@
 #include <linux/workqueue.h>
 #include <linux/of.h>
 #include <linux/ethtool.h>
+#include <linux/net_tstamp.h>
 #include <net/devlink.h>
 #include <net/switchdev.h>
 
@@ -329,6 +330,12 @@ struct dsa_switch_ops {
 			   struct ethtool_wolinfo *w);
 
 	/*
+	 * ethtool timestamp info
+	 */
+	int	(*get_ts_info)(struct dsa_switch *ds, int port,
+			       struct ethtool_ts_info *ts);
+
+	/*
 	 * Suspend and resume
 	 */
 	int	(*suspend)(struct dsa_switch *ds);
@@ -434,6 +441,14 @@ struct dsa_switch_ops {
 					 int port, struct net_device *br);
 	void	(*crosschip_bridge_leave)(struct dsa_switch *ds, int sw_index,
 					  int port, struct net_device *br);
+
+	/*
+	 * PTP functionality
+	 */
+	int	(*port_hwtstamp_get)(struct dsa_switch *ds, int port,
+				     struct ifreq *ifr);
+	int	(*port_hwtstamp_set)(struct dsa_switch *ds, int port,
+				     struct ifreq *ifr);
 };
 
 struct dsa_switch_driver {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index bf8800d..2cf6a83 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -264,10 +264,34 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 
 static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->dp->ds;
+	int port = p->dp->index;
+
 	if (!dev->phydev)
 		return -ENODEV;
 
-	return phy_mii_ioctl(dev->phydev, ifr, cmd);
+	switch (cmd) {
+	case SIOCGMIIPHY:
+	case SIOCGMIIREG:
+	case SIOCSMIIREG:
+		if (dev->phydev)
+			return phy_mii_ioctl(dev->phydev, ifr, cmd);
+		else
+			return -EOPNOTSUPP;
+	case SIOCGHWTSTAMP:
+		if (ds->ops->port_hwtstamp_get)
+			return ds->ops->port_hwtstamp_get(ds, port, ifr);
+		else
+			return -EOPNOTSUPP;
+	case SIOCSHWTSTAMP:
+		if (ds->ops->port_hwtstamp_set)
+			return ds->ops->port_hwtstamp_set(ds, port, ifr);
+		else
+			return -EOPNOTSUPP;
+	default:
+		return -EOPNOTSUPP;
+	}
 }
 
 static int dsa_slave_port_attr_set(struct net_device *dev,
@@ -876,6 +900,18 @@ static int dsa_slave_set_rxnfc(struct net_device *dev,
 	return ds->ops->set_rxnfc(ds, p->dp->index, nfc);
 }
 
+static int dsa_slave_get_ts_info(struct net_device *dev,
+				 struct ethtool_ts_info *ts)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->dp->ds;
+
+	if (!ds->ops->get_ts_info)
+		return -EOPNOTSUPP;
+
+	return ds->ops->get_ts_info(ds, p->dp->index, ts);
+}
+
 static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_drvinfo		= dsa_slave_get_drvinfo,
 	.get_regs_len		= dsa_slave_get_regs_len,
@@ -896,6 +932,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.set_link_ksettings	= phy_ethtool_set_link_ksettings,
 	.get_rxnfc		= dsa_slave_get_rxnfc,
 	.set_rxnfc		= dsa_slave_set_rxnfc,
+	.get_ts_info		= dsa_slave_get_ts_info,
 };
 
 static const struct net_device_ops dsa_slave_netdev_ops = {
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next RFC 6/9] net: dsa: forward timestamping callbacks to switch drivers
From: Brandon Streiff @ 2017-09-28 15:25 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, David S. Miller, Florian Fainelli, Andrew Lunn,
	Vivien Didelot, Richard Cochran, Erik Hons, Brandon Streiff
In-Reply-To: <1506612341-18061-1-git-send-email-brandon.streiff@ni.com>

Forward the rx/tx timestamp machinery from the dsa infrastructure to the
switch driver.

On the rx side, defer delivery of skbs until we have an rx timestamp.
This mimicks the behavior of skb_defer_rx_timestamp. The implementation
does have to thread through the tagging protocol handlers because
it is where that we know which switch and port the skb goes to.

On the tx side, identify PTP packets, clone them, and pass them to the
underlying switch driver before we transmit. This mimicks the behavior
of skb_tx_timestamp.

Signed-off-by: Brandon Streiff <brandon.streiff@ni.com>
---
 include/net/dsa.h     | 13 +++++++++++--
 net/dsa/dsa.c         | 39 ++++++++++++++++++++++++++++++++++++++-
 net/dsa/slave.c       | 25 +++++++++++++++++++++++++
 net/dsa/tag_brcm.c    |  6 +++++-
 net/dsa/tag_dsa.c     |  6 +++++-
 net/dsa/tag_edsa.c    |  6 +++++-
 net/dsa/tag_ksz.c     |  6 +++++-
 net/dsa/tag_lan9303.c |  6 +++++-
 net/dsa/tag_mtk.c     |  6 +++++-
 net/dsa/tag_qca.c     |  6 +++++-
 net/dsa/tag_trailer.c |  6 +++++-
 11 files changed, 114 insertions(+), 11 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 1163af1..4daf7f7 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -101,11 +101,14 @@ struct dsa_platform_data {
 };
 
 struct packet_type;
+struct dsa_switch;
 
 struct dsa_device_ops {
 	struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
 	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev,
-			       struct packet_type *pt);
+			       struct packet_type *pt,
+			       struct dsa_switch **src_dev,
+			       int *src_port);
 	int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
 			    int *offset);
 };
@@ -134,7 +137,9 @@ struct dsa_switch_tree {
 	/* Copy of tag_ops->rcv for faster access in hot path */
 	struct sk_buff *	(*rcv)(struct sk_buff *skb,
 				       struct net_device *dev,
-				       struct packet_type *pt);
+				       struct packet_type *pt,
+				       struct dsa_switch **src_dev,
+				       int *src_port);
 
 	/*
 	 * The switch port to which the CPU is attached.
@@ -449,6 +454,10 @@ struct dsa_switch_ops {
 				     struct ifreq *ifr);
 	int	(*port_hwtstamp_set)(struct dsa_switch *ds, int port,
 				     struct ifreq *ifr);
+	void	(*port_txtstamp)(struct dsa_switch *ds, int port,
+				 struct sk_buff *clone, unsigned int type);
+	bool	(*port_rxtstamp)(struct dsa_switch *ds, int port,
+				 struct sk_buff *skb, unsigned int type);
 };
 
 struct dsa_switch_driver {
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 81c852e..42e7286 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -22,6 +22,7 @@
 #include <linux/netdevice.h>
 #include <linux/sysfs.h>
 #include <linux/phy_fixed.h>
+#include <linux/ptp_classify.h>
 #include <linux/gpio/consumer.h>
 #include <linux/etherdevice.h>
 
@@ -157,6 +158,37 @@ struct net_device *dsa_dev_to_net_device(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dsa_dev_to_net_device);
 
+/* Determine if we should defer delivery of skb until we have a rx timestamp.
+ *
+ * Called from dsa_switch_rcv. For now, this will only work if tagging is
+ * enabled on the switch. Normally the MAC driver would retrieve the hardware
+ * timestamp when it reads the packet out of the hardware. However in a DSA
+ * switch, the DSA driver owning the interface to which the packet is
+ * delivered is never notified unless we do so here.
+ */
+static bool dsa_skb_defer_rx_timestamp(struct dsa_switch *ds, int port,
+				       struct sk_buff *skb)
+{
+	unsigned int type;
+
+	if (skb_headroom(skb) < ETH_HLEN)
+		return false;
+
+	__skb_push(skb, ETH_HLEN);
+
+	type = ptp_classify_raw(skb);
+
+	__skb_pull(skb, ETH_HLEN);
+
+	if (type == PTP_CLASS_NONE)
+		return false;
+
+	if (likely(ds->ops->port_rxtstamp))
+		return ds->ops->port_rxtstamp(ds, port, skb, type);
+
+	return false;
+}
+
 static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 			  struct packet_type *pt, struct net_device *unused)
 {
@@ -164,6 +196,8 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 	struct sk_buff *nskb = NULL;
 	struct pcpu_sw_netstats *s;
 	struct dsa_slave_priv *p;
+	struct dsa_switch *ds = NULL;
+	int source_port;
 
 	if (unlikely(dst == NULL)) {
 		kfree_skb(skb);
@@ -174,7 +208,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!skb)
 		return 0;
 
-	nskb = dst->rcv(skb, dev, pt);
+	nskb = dst->rcv(skb, dev, pt, &ds, &source_port);
 	if (!nskb) {
 		kfree_skb(skb);
 		return 0;
@@ -192,6 +226,9 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 	s->rx_bytes += skb->len;
 	u64_stats_update_end(&s->syncp);
 
+	if (dsa_skb_defer_rx_timestamp(ds, source_port, skb))
+		return 0;
+
 	netif_receive_skb(skb);
 
 	return 0;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 2cf6a83..a278335 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -22,6 +22,7 @@
 #include <net/tc_act/tc_mirred.h>
 #include <linux/if_bridge.h>
 #include <linux/netpoll.h>
+#include <linux/ptp_classify.h>
 
 #include "dsa_priv.h"
 
@@ -407,6 +408,25 @@ static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev,
 	return NETDEV_TX_OK;
 }
 
+static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p,
+				 struct sk_buff *skb)
+{
+	struct dsa_switch *ds = p->dp->ds;
+	struct sk_buff *clone;
+	unsigned int type;
+
+	type = ptp_classify_raw(skb);
+	if (type == PTP_CLASS_NONE)
+		return;
+
+	if (ds->ops->port_txtstamp) {
+		clone = skb_clone_sk(skb);
+		if (!clone)
+			return;
+		ds->ops->port_txtstamp(ds, p->dp->index, clone, type);
+	}
+}
+
 static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
@@ -419,6 +439,11 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 	s->tx_bytes += skb->len;
 	u64_stats_update_end(&s->syncp);
 
+	/* Identify PTP protocol packets, clone them, and pass them to the
+	 * switch driver
+	 */
+	dsa_skb_tx_timestamp(p, skb);
+
 	/* Transmit function may have to reallocate the original SKB,
 	 * in which case it must have freed it. Only free it here on error.
 	 */
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index dbb0164..1c8b1c9 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -90,7 +90,8 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev
 }
 
 static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
-				    struct packet_type *pt)
+				    struct packet_type *pt,
+				    struct dsa_switch **src_dev, int *src_port)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
 	struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
@@ -131,6 +132,9 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	skb->dev = ds->ports[source_port].netdev;
 
+	*src_dev = ds;
+	*src_port = source_port;
+
 	return skb;
 }
 
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index fbf9ca9..2abcd01 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -65,7 +65,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
-			       struct packet_type *pt)
+			       struct packet_type *pt,
+			       struct dsa_switch **src_dev, int *src_port)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
 	struct dsa_switch *ds;
@@ -155,6 +156,9 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	skb->dev = ds->ports[source_port].netdev;
 
+	*src_dev = ds;
+	*src_port = source_port;
+
 	return skb;
 }
 
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 76367ba..bc529e5 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -78,7 +78,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev,
-				struct packet_type *pt)
+				struct packet_type *pt,
+				struct dsa_switch **src_dev, int *src_port)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
 	struct dsa_switch *ds;
@@ -174,6 +175,9 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	skb->dev = ds->ports[source_port].netdev;
 
+	*src_dev = ds;
+	*src_port = source_port;
+
 	return skb;
 }
 
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 010ca0a..7760862 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -78,7 +78,8 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev,
-			       struct packet_type *pt)
+			       struct packet_type *pt,
+			       struct dsa_switch **src_dev, int *src_port)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
 	struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
@@ -96,6 +97,9 @@ static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	skb->dev = ds->ports[source_port].netdev;
 
+	*src_dev = ds;
+	*src_port = source_port;
+
 	return skb;
 }
 
diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c
index 0b98261..adabd48 100644
--- a/net/dsa/tag_lan9303.c
+++ b/net/dsa/tag_lan9303.c
@@ -68,7 +68,8 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev,
-			struct packet_type *pt)
+			struct packet_type *pt,
+			struct dsa_switch **src_dev, int *src_port)
 {
 	u16 *lan9303_tag;
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
@@ -123,6 +124,9 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev,
 	/* forward the packet to the dedicated interface */
 	skb->dev = ds->ports[source_port].netdev;
 
+	*src_dev = ds;
+	*src_port = source_port;
+
 	return skb;
 }
 
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index ec8ee5f..321d1a6 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -44,7 +44,8 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 }
 
 static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
-				   struct packet_type *pt)
+				   struct packet_type *pt,
+				   struct dsa_switch **src_dev, int *src_port)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
 	struct dsa_switch *ds;
@@ -83,6 +84,9 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	skb->dev = ds->ports[port].netdev;
 
+	*src_dev = ds;
+	*src_port = port;
+
 	return skb;
 }
 
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 1d4c707..de51fe6e 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -63,7 +63,8 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
-				   struct packet_type *pt)
+				   struct packet_type *pt,
+				   struct dsa_switch **src_dev, int *src_port)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
 	struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
@@ -107,6 +108,9 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	/* Update skb & forward the frame accordingly */
 	skb->dev = ds->ports[port].netdev;
 
+	*src_dev = ds;
+	*src_port = port;
+
 	return skb;
 }
 
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index d2fd492..ca5960f 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -56,7 +56,8 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
-				   struct packet_type *pt)
+				   struct packet_type *pt,
+				   struct dsa_switch **src_dev, int *src_port)
 {
 	struct dsa_switch_tree *dst = dev->dsa_ptr;
 	struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
@@ -80,6 +81,9 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	skb->dev = ds->ports[source_port].netdev;
 
+	*src_dev = ds;
+	*src_port = source_port;
+
 	return skb;
 }
 
-- 
2.1.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox