Netdev List
 help / color / mirror / Atom feed
* [PATCH iproute2] xfrm: add support of ESN and anti-replay window
From: Nicolas Dichtel @ 2014-10-20  9:23 UTC (permalink / raw)
  To: shemminger; +Cc: netdev, dingzhi, Adrien Mazarguil, Nicolas Dichtel

From: dingzhi <zhi.ding@6wind.com>

This patch allows to configure ESN and anti-replay window.

Signed-off-by: dingzhi <zhi.ding@6wind.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 ip/ipxfrm.c     | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ip/xfrm_state.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 109 insertions(+), 7 deletions(-)

diff --git a/ip/ipxfrm.c b/ip/ipxfrm.c
index f5f78ca6b968..659fa6b64579 100644
--- a/ip/ipxfrm.c
+++ b/ip/ipxfrm.c
@@ -806,6 +806,62 @@ void xfrm_xfrma_print(struct rtattr *tb[], __u16 family,
 		fprintf(fp, "%s", _SL_);
 	}
 
+	if (tb[XFRMA_REPLAY_VAL]) {
+		struct xfrm_replay_state *replay;
+
+		if (prefix)
+			fputs(prefix, fp);
+		fprintf(fp, "anti-replay context: ");
+
+		if (RTA_PAYLOAD(tb[XFRMA_REPLAY_VAL]) < sizeof(*replay)) {
+			fprintf(fp, "(ERROR truncated)");
+			fprintf(fp, "%s", _SL_);
+			return;
+		}
+
+		replay = (struct xfrm_replay_state *)RTA_DATA(tb[XFRMA_REPLAY_VAL]);
+		fprintf(fp, "seq 0x%x, oseq 0x%x, bitmap 0x%08x",
+			replay->seq, replay->oseq, replay->bitmap);
+		fprintf(fp, "%s", _SL_);
+	}
+
+	if (tb[XFRMA_REPLAY_ESN_VAL]) {
+		struct xfrm_replay_state_esn *replay;
+		unsigned int i, j;
+
+		if (prefix)
+			fputs(prefix, fp);
+		fprintf(fp, "anti-replay esn context:");
+
+		if (RTA_PAYLOAD(tb[XFRMA_REPLAY_ESN_VAL]) < sizeof(*replay)) {
+			fprintf(fp, "(ERROR truncated)");
+			fprintf(fp, "%s", _SL_);
+			return;
+		}
+		fprintf(fp, "%s", _SL_);
+
+		replay = (struct xfrm_replay_state_esn *)RTA_DATA(tb[XFRMA_REPLAY_ESN_VAL]);
+		if (prefix)
+			fputs(prefix, fp);
+		fprintf(fp, " seq-hi 0x%x, seq 0x%x, oseq-hi 0x%0x, oseq 0x%0x",
+			replay->seq_hi, replay->seq, replay->oseq_hi,
+			replay->oseq);
+		fprintf(fp, "%s", _SL_);
+		if (prefix)
+			fputs(prefix, fp);
+		fprintf(fp, " replay_window %u, bitmap-length %u",
+			replay->replay_window, replay->bmp_len);
+		for (i = replay->bmp_len, j = 0; i; i--) {
+			if (j++ % 8 == 0) {
+				fprintf(fp, "%s", _SL_);
+				if (prefix)
+					fputs(prefix, fp);
+				fprintf(fp, " ");
+			}
+			fprintf(fp, "%08x ", replay->bmp[i - 1]);
+		}
+		fprintf(fp, "%s", _SL_);
+	}
 }
 
 static int xfrm_selector_iszero(struct xfrm_selector *s)
@@ -849,6 +905,7 @@ void xfrm_state_info_print(struct xfrm_usersa_info *xsinfo,
 		XFRM_FLAG_PRINT(fp, flags, XFRM_STATE_ICMP, "icmp");
 		XFRM_FLAG_PRINT(fp, flags, XFRM_STATE_AF_UNSPEC, "af-unspec");
 		XFRM_FLAG_PRINT(fp, flags, XFRM_STATE_ALIGN4, "align4");
+		XFRM_FLAG_PRINT(fp, flags, XFRM_STATE_ESN, "esn");
 		if (flags)
 			fprintf(fp, "%x", flags);
 	}
diff --git a/ip/xfrm_state.c b/ip/xfrm_state.c
index fe7708e533f3..2ad3d8d37fbe 100644
--- a/ip/xfrm_state.c
+++ b/ip/xfrm_state.c
@@ -58,6 +58,7 @@ static void usage(void)
 	fprintf(stderr, "Usage: ip xfrm state { add | update } ID [ ALGO-LIST ] [ mode MODE ]\n");
 	fprintf(stderr, "        [ mark MARK [ mask MASK ] ] [ reqid REQID ] [ seq SEQ ]\n");
 	fprintf(stderr, "        [ replay-window SIZE ] [ replay-seq SEQ ] [ replay-oseq SEQ ]\n");
+	fprintf(stderr, "        [ replay-seq-hi SEQ ] [ replay-oseq-hi SEQ ]\n");
 	fprintf(stderr, "        [ flag FLAG-LIST ] [ sel SELECTOR ] [ LIMIT-LIST ] [ encap ENCAP ]\n");
 	fprintf(stderr, "        [ coa ADDR[/PLEN] ] [ ctx CTX ] [ extra-flag EXTRA-FLAG-LIST ]\n");
 	fprintf(stderr, "Usage: ip xfrm state allocspi ID [ mode MODE ] [ mark MARK [ mask MASK ] ]\n");
@@ -87,7 +88,7 @@ static void usage(void)
 	fprintf(stderr, " ALGO-NAME\n");
 	fprintf(stderr, "MODE := transport | tunnel | beet | ro | in_trigger\n");
 	fprintf(stderr, "FLAG-LIST := [ FLAG-LIST ] FLAG\n");
-	fprintf(stderr, "FLAG := noecn | decap-dscp | nopmtudisc | wildrecv | icmp | af-unspec | align4\n");
+	fprintf(stderr, "FLAG := noecn | decap-dscp | nopmtudisc | wildrecv | icmp | af-unspec | align4 | esn\n");
 	fprintf(stderr, "EXTRA-FLAG-LIST := [ EXTRA-FLAG-LIST ] EXTRA-FLAG\n");
 	fprintf(stderr, "EXTRA-FLAG := dont-encap-dscp\n");
 	fprintf(stderr, "SELECTOR := [ src ADDR[/PLEN] ] [ dst ADDR[/PLEN] ] [ dev DEV ] [ UPSPEC ]\n");
@@ -214,6 +215,8 @@ static int xfrm_state_flag_parse(__u8 *flags, int *argcp, char ***argvp)
 				*flags |= XFRM_STATE_AF_UNSPEC;
 			else if (strcmp(*argv, "align4") == 0)
 				*flags |= XFRM_STATE_ALIGN4;
+			else if (strcmp(*argv, "esn") == 0)
+				*flags |= XFRM_STATE_ESN;
 			else {
 				PREV_ARG(); /* back track */
 				break;
@@ -273,6 +276,9 @@ static int xfrm_state_modify(int cmd, unsigned flags, int argc, char **argv)
 		char  			buf[RTA_BUF_SIZE];
 	} req;
 	struct xfrm_replay_state replay;
+	struct xfrm_replay_state_esn replay_esn;
+	__u32 replay_window = 0;
+	__u32 seq = 0, oseq = 0, seq_hi = 0, oseq_hi = 0;
 	char *idp = NULL;
 	char *aeadop = NULL;
 	char *ealgop = NULL;
@@ -289,6 +295,7 @@ static int xfrm_state_modify(int cmd, unsigned flags, int argc, char **argv)
 
 	memset(&req, 0, sizeof(req));
 	memset(&replay, 0, sizeof(replay));
+	memset(&replay_esn, 0, sizeof(replay_esn));
 	memset(&ctx, 0, sizeof(ctx));
 
 	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.xsinfo));
@@ -315,16 +322,24 @@ static int xfrm_state_modify(int cmd, unsigned flags, int argc, char **argv)
 			xfrm_seq_parse(&req.xsinfo.seq, &argc, &argv);
 		} else if (strcmp(*argv, "replay-window") == 0) {
 			NEXT_ARG();
-			if (get_u8(&req.xsinfo.replay_window, *argv, 0))
+			if (get_u32(&replay_window, *argv, 0))
 				invarg("value after \"replay-window\" is invalid", *argv);
 		} else if (strcmp(*argv, "replay-seq") == 0) {
 			NEXT_ARG();
-			if (get_u32(&replay.seq, *argv, 0))
+			if (get_u32(&seq, *argv, 0))
 				invarg("value after \"replay-seq\" is invalid", *argv);
+		} else if (strcmp(*argv, "replay-seq-hi") == 0) {
+			NEXT_ARG();
+			if (get_u32(&seq_hi, *argv, 0))
+				invarg("value after \"replay-seq-hi\" is invalid", *argv);
 		} else if (strcmp(*argv, "replay-oseq") == 0) {
 			NEXT_ARG();
-			if (get_u32(&replay.oseq, *argv, 0))
+			if (get_u32(&oseq, *argv, 0))
 				invarg("value after \"replay-oseq\" is invalid", *argv);
+		} else if (strcmp(*argv, "replay-oseq-hi") == 0) {
+			NEXT_ARG();
+			if (get_u32(&oseq_hi, *argv, 0))
+				invarg("value after \"replay-oseq-hi\" is invalid", *argv);
 		} else if (strcmp(*argv, "flag") == 0) {
 			NEXT_ARG();
 			xfrm_state_flag_parse(&req.xsinfo.flags, &argc, &argv);
@@ -514,9 +529,39 @@ static int xfrm_state_modify(int cmd, unsigned flags, int argc, char **argv)
 		argc--; argv++;
 	}
 
-	if (replay.seq || replay.oseq)
-		addattr_l(&req.n, sizeof(req.buf), XFRMA_REPLAY_VAL,
-			  (void *)&replay, sizeof(replay));
+	if (req.xsinfo.flags & XFRM_STATE_ESN &&
+	    replay_window == 0) {
+		fprintf(stderr, "Error: esn flag set without replay-window.\n");
+		exit(-1);
+	}
+
+	if (replay_window > XFRMA_REPLAY_ESN_MAX) {
+		fprintf(stderr,
+			"Error: replay-window (%u) > XFRMA_REPLAY_ESN_MAX (%u).\n",
+			replay_window, XFRMA_REPLAY_ESN_MAX);
+		exit(-1);
+	}
+
+	if (req.xsinfo.flags & XFRM_STATE_ESN ||
+	    replay_window > (sizeof(replay.bitmap) * 8)) {
+		replay_esn.seq = seq;
+		replay_esn.oseq = oseq;
+		replay_esn.seq_hi = seq_hi;
+		replay_esn.oseq_hi = oseq_hi;
+		replay_esn.replay_window = replay_window;
+		replay_esn.bmp_len = (replay_window + sizeof(__u32) * 8 - 1) /
+				     (sizeof(__u32) * 8);
+		addattr_l(&req.n, sizeof(req.buf), XFRMA_REPLAY_ESN_VAL,
+			  &replay_esn, sizeof(replay_esn));
+	} else {
+		if (seq || oseq) {
+			replay.seq = seq;
+			replay.oseq = oseq;
+			addattr_l(&req.n, sizeof(req.buf), XFRMA_REPLAY_VAL,
+				  &replay, sizeof(replay));
+		}
+		req.xsinfo.replay_window = replay_window;
+	}
 
 	if (extra_flags)
 		addattr32(&req.n, sizeof(req.buf), XFRMA_SA_EXTRA_FLAGS,
-- 
2.1.0

^ permalink raw reply related

* [PATCH iproute2] ip monitor: Allow to filter events by dev
From: Vadim Kochan @ 2014-10-20  9:25 UTC (permalink / raw)
  To: netdev; +Cc: Vadim Kochan

Added 'dev' option to allow filtering events by device.

Signed-off-by: Vadim Kochan <vadim4j@gmail.com>
---
 ip/ip_common.h        | 13 ++++++++-----
 ip/ipaddress.c        |  5 +++--
 ip/ipmonitor.c        | 20 +++++++++++++++-----
 ip/ipmroute.c         |  5 +++--
 ip/ipneigh.c          |  5 +++--
 ip/ipnetconf.c        |  5 +++--
 ip/iproute.c          |  9 ++++++---
 man/man8/ip-monitor.8 |  9 +++++++++
 8 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/ip/ip_common.h b/ip/ip_common.h
index 8351463..2fb3aa6 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -13,14 +13,17 @@ extern int print_ntable(const struct sockaddr_nl *who,
 			struct nlmsghdr *n, void *arg);
 extern int ipaddr_list(int argc, char **argv);
 extern int ipaddr_list_link(int argc, char **argv);
+void ipaddr_get_vf_rate(int, int *, int *, int);
 extern int iproute_monitor(int argc, char **argv);
 extern void iplink_usage(void) __attribute__((noreturn));
-extern void iproute_reset_filter(void);
-extern void ipmroute_reset_filter(void);
-void ipaddr_get_vf_rate(int, int *, int *, int);
-extern void ipaddr_reset_filter(int);
-extern void ipneigh_reset_filter(void);
+
+extern void iproute_reset_filter(int ifindex);
+extern void ipmroute_reset_filter(int ifindex);
+extern void ipaddr_reset_filter(int oneline, int ifindex);
+extern void ipneigh_reset_filter(int ifindex);
 extern void ipntable_reset_filter(void);
+extern void ipnetconf_reset_filter(int ifindex);
+
 extern int print_route(const struct sockaddr_nl *who,
 		       struct nlmsghdr *n, void *arg);
 extern int print_mroute(const struct sockaddr_nl *who,
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 45729d8..6708c2c 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -1135,7 +1135,7 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action)
 	char *filter_dev = NULL;
 	int no_link = 0;
 
-	ipaddr_reset_filter(oneline);
+	ipaddr_reset_filter(oneline, 0);
 	filter.showqueue = 1;
 
 	if (filter.family == AF_UNSPEC)
@@ -1380,10 +1380,11 @@ int ipaddr_list_link(int argc, char **argv)
 	return ipaddr_list_flush_or_save(argc, argv, IPADD_LIST);
 }
 
-void ipaddr_reset_filter(int oneline)
+void ipaddr_reset_filter(int oneline, int ifindex)
 {
 	memset(&filter, 0, sizeof(filter));
 	filter.oneline = oneline;
+	filter.ifindex = ifindex;
 }
 
 static int default_scope(inet_prefix *lcl)
diff --git a/ip/ipmonitor.c b/ip/ipmonitor.c
index b7b2d98..4cc75f4 100644
--- a/ip/ipmonitor.c
+++ b/ip/ipmonitor.c
@@ -29,7 +29,8 @@ int prefix_banner;
 
 static void usage(void)
 {
-	fprintf(stderr, "Usage: ip monitor [ all | LISTofOBJECTS ] [ FILE ] [ label ]\n");
+	fprintf(stderr, "Usage: ip monitor [ all | LISTofOBJECTS ] [ FILE ]"
+			"[ label ] [dev DEVICE]\n");
 	fprintf(stderr, "LISTofOBJECTS := link | address | route | mroute | prefix |\n");
 	fprintf(stderr, "                 neigh | netconf\n");
 	fprintf(stderr, "FILE := file FILENAME\n");
@@ -162,12 +163,9 @@ int do_ipmonitor(int argc, char **argv)
 	int lprefix=0;
 	int lneigh=0;
 	int lnetconf=0;
+	int ifindex=0;
 
 	rtnl_close(&rth);
-	ipaddr_reset_filter(1);
-	iproute_reset_filter();
-	ipmroute_reset_filter();
-	ipneigh_reset_filter();
 
 	while (argc > 0) {
 		if (matches(*argv, "file") == 0) {
@@ -201,6 +199,12 @@ int do_ipmonitor(int argc, char **argv)
 			prefix_banner=1;
 		} else if (matches(*argv, "help") == 0) {
 			usage();
+		} else if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+
+			ifindex = ll_name_to_index(*argv);
+			if (!ifindex)
+				invarg("Device does not exist\n", *argv);
 		} else {
 			fprintf(stderr, "Argument \"%s\" is unknown, try \"ip monitor help\".\n", *argv);
 			exit(-1);
@@ -208,6 +212,12 @@ int do_ipmonitor(int argc, char **argv)
 		argc--;	argv++;
 	}
 
+	ipaddr_reset_filter(1, ifindex);
+	iproute_reset_filter(ifindex);
+	ipmroute_reset_filter(ifindex);
+	ipneigh_reset_filter(ifindex);
+	ipnetconf_reset_filter(ifindex);
+
 	if (llink)
 		groups |= nl_mgrp(RTNLGRP_LINK);
 	if (laddr) {
diff --git a/ip/ipmroute.c b/ip/ipmroute.c
index be93a98..b4ed9f1 100644
--- a/ip/ipmroute.c
+++ b/ip/ipmroute.c
@@ -174,11 +174,12 @@ int print_mroute(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 	return 0;
 }
 
-void ipmroute_reset_filter(void)
+void ipmroute_reset_filter(int ifindex)
 {
 	memset(&filter, 0, sizeof(filter));
 	filter.mdst.bitlen = -1;
 	filter.msrc.bitlen = -1;
+	filter.iif = ifindex;
 }
 
 static int mroute_list(int argc, char **argv)
@@ -186,7 +187,7 @@ static int mroute_list(int argc, char **argv)
 	char *id = NULL;
 	int family;
 
-	ipmroute_reset_filter();
+	ipmroute_reset_filter(0);
 	if (preferred_family == AF_UNSPEC)
 		family = AF_INET;
 	else
diff --git a/ip/ipneigh.c b/ip/ipneigh.c
index 71a4100..6be79e1 100644
--- a/ip/ipneigh.c
+++ b/ip/ipneigh.c
@@ -313,10 +313,11 @@ int print_neigh(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 	return 0;
 }
 
-void ipneigh_reset_filter(void)
+void ipneigh_reset_filter(int ifindex)
 {
 	memset(&filter, 0, sizeof(filter));
 	filter.state = ~0;
+	filter.index = ifindex;
 }
 
 static int do_show_or_flush(int argc, char **argv, int flush)
@@ -325,7 +326,7 @@ static int do_show_or_flush(int argc, char **argv, int flush)
 	int state_given = 0;
 	struct ndmsg ndm = { 0 };
 
-	ipneigh_reset_filter();
+	ipneigh_reset_filter(0);
 
 	if (!filter.family)
 		filter.family = preferred_family;
diff --git a/ip/ipnetconf.c b/ip/ipnetconf.c
index 0e44cc8..aa31ead 100644
--- a/ip/ipnetconf.c
+++ b/ip/ipnetconf.c
@@ -123,9 +123,10 @@ int print_netconf(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 	return 0;
 }
 
-static void ipnetconf_reset_filter(void)
+void ipnetconf_reset_filter(int ifindex)
 {
 	memset(&filter, 0, sizeof(filter));
+	filter.ifindex = ifindex;
 }
 
 static int do_show(int argc, char **argv)
@@ -136,7 +137,7 @@ static int do_show(int argc, char **argv)
 		char			buf[1024];
 	} req;
 
-	ipnetconf_reset_filter();
+	ipnetconf_reset_filter(0);
 	filter.family = preferred_family;
 	if (filter.family == AF_UNSPEC)
 		filter.family = AF_INET;
diff --git a/ip/iproute.c b/ip/iproute.c
index d77b1e3..c9cf5d6 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -1137,7 +1137,7 @@ static int iproute_list_flush_or_save(int argc, char **argv, int action)
 	} else
 		filter_fn = print_route;
 
-	iproute_reset_filter();
+	iproute_reset_filter(0);
 	filter.tb = RT_TABLE_MAIN;
 
 	if ((action == IPROUTE_FLUSH) && argc <= 0) {
@@ -1385,7 +1385,7 @@ static int iproute_get(int argc, char **argv)
 
 	memset(&req, 0, sizeof(req));
 
-	iproute_reset_filter();
+	iproute_reset_filter(0);
 	filter.cloned = 2;
 
 	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
@@ -1590,11 +1590,14 @@ static int iproute_showdump(void)
 	exit(rtnl_from_file(stdin, &show_handler, NULL));
 }
 
-void iproute_reset_filter(void)
+void iproute_reset_filter(int ifindex)
 {
 	memset(&filter, 0, sizeof(filter));
 	filter.mdst.bitlen = -1;
 	filter.msrc.bitlen = -1;
+	filter.oif = ifindex;
+	if (filter.oif > 0)
+		filter.oifmask = -1;
 }
 
 int do_iproute(int argc, char **argv)
diff --git a/man/man8/ip-monitor.8 b/man/man8/ip-monitor.8
index b6e8d1d..68e83f1 100644
--- a/man/man8/ip-monitor.8
+++ b/man/man8/ip-monitor.8
@@ -11,6 +11,8 @@ ip-monitor, rtmon \- state monitoring
 .BR  "monitor" " [ " all " |"
 .IR OBJECT-LIST " ] ["
 .BI file " FILENAME "
+] [
+.BI dev " DEVICE "
 ]
 .sp
 
@@ -26,6 +28,8 @@ command is the first in the command line and then the object list follows:
 .BR "ip monitor" " [ " all " |"
 .IR OBJECT-LIST " ] ["
 .BI file " FILENAME "
+] [
+.BI dev " DEVICE "
 ]
 
 .I OBJECT-LIST
@@ -69,6 +73,11 @@ at any time.
 It prepends the history with the state snapshot dumped at the moment
 of starting.
 
+.P
+If the
+.BI dev
+option is given, the program prints only events related to this device.
+
 .SH SEE ALSO
 .br
 .BR ip (8)
-- 
2.1.0

^ permalink raw reply related

* Re: [PATCH net-next] Allow to set net namespace for wireless device via RTM_LINK
From: Johannes Berg @ 2014-10-20  9:47 UTC (permalink / raw)
  To: vadim4j-Re5JQEeQqe8AvxtiuMwx3w
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20141014121627.GA5115-RsE+ugaVo768flhn69MzTw@public.gmane.org>

On Tue, 2014-10-14 at 15:16 +0300, vadim4j-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org wrote:

> The reason for this was to make possible to change netns for wireless
> dev via 'ip link' too like for 'iw' util. I just think that changing
> namespace for netdev should have the generic way. May be you can suggest
> a better way

That's a respectable goal, but I think you're way overshooting it and
thus getting it wrong. You're changing the semantics from

 "please switch this interface to that other netns"

to
 "please switch this interface *and all others on this HW* to that other
netns"

which is, in my opinion, something so much more unexpected and prone to
breaking people's setups than returning "not supported" here.

johannes

--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 2/2] xfrm6: fix a potential use after free in xfrm6_policy.c
From: Sergei Shtylyov @ 2014-10-20 10:36 UTC (permalink / raw)
  To: roy.qing.li, netdev; +Cc: steffen.klassert
In-Reply-To: <1413794954-16967-2-git-send-email-roy.qing.li@gmail.com>

Hello.

On 10/20/2014 12:49 PM, roy.qing.li@gmail.com wrote:

> From: Li RongQing <roy.qing.li@gmail.com>

> pskb_may_pull() maybe change skb->data and make nh and exthdr pointer
> oboslete, so recompute the nd and exthdr

> Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
> ---
>   net/ipv6/xfrm6_policy.c |   11 ++++++++---
>   1 file changed, 8 insertions(+), 3 deletions(-)

> diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
> index ac49f84..115fd3b 100644
> --- a/net/ipv6/xfrm6_policy.c
> +++ b/net/ipv6/xfrm6_policy.c
> @@ -170,8 +170,10 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
>   		case IPPROTO_DCCP:
>   			if (!onlyproto && (nh + offset + 4 < skb->data ||
>   			     pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
> -				__be16 *ports = (__be16 *)exthdr;
> +				__be16 *ports;
>
> +				nh = skb_network_header(skb);
> +				ports = (__be16*)(nh + offset);

    Please insert a space between '__be16' and * like it was done in the 
deleted assignment.

[...]
> @@ -180,8 +182,10 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
>
>   		case IPPROTO_ICMPV6:
>   			if (!onlyproto && pskb_may_pull(skb, nh + offset + 2 - skb->data)) {
> -				u8 *icmp = (u8 *)exthdr;
> +				u8 *icmp;
>
> +				nh = skb_network_header(skb);
> +				icmp = (u8*)(nh + offset);

    Likewise.

[...]
> @@ -192,8 +196,9 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
>   		case IPPROTO_MH:
>   			if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) {
>   				struct ip6_mh *mh;
> -				mh = (struct ip6_mh *)exthdr;
>
> +				nh = skb_network_header(skb);
> +				mh = (struct ip6_mh*)(nh + offset);

    Likewise.

[...]

WBR, Sergei

^ permalink raw reply

* Re: [PATCH net-next] Allow to set net namespace for wireless device via RTM_LINK
From: Marcel Holtmann @ 2014-10-20 10:46 UTC (permalink / raw)
  To: Johannes Berg
  Cc: vadim4j-Re5JQEeQqe8AvxtiuMwx3w, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1413798437.10246.12.camel-8Nb76shvtaUJvtFkdXX2HixXY32XiHfO@public.gmane.org>

Hi Johannes,

>> The reason for this was to make possible to change netns for wireless
>> dev via 'ip link' too like for 'iw' util. I just think that changing
>> namespace for netdev should have the generic way. May be you can suggest
>> a better way
> 
> That's a respectable goal, but I think you're way overshooting it and
> thus getting it wrong. You're changing the semantics from
> 
> "please switch this interface to that other netns"
> 
> to
> "please switch this interface *and all others on this HW* to that other
> netns"
> 
> which is, in my opinion, something so much more unexpected and prone to
> breaking people's setups than returning "not supported" here.

this is just me thinking out loud and by no means any recommendation on doing this. I am not even sure this is a good idea.

Maybe relaxing the check and allow ip link to move a wireless netdev into a namespace (and having the wiphy follow) could be allowed if it is the only netdev or the original wlan0 that each wiphy creates. I really do not know if this is worth it, but for some simpler container cases it could be beneficial if RTNL can be used instead of having to go through nl80211.

Regards

Marcel

--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 1/7] netfilter: missing module license in the nf_reject_ipvX modules
From: Sergei Shtylyov @ 2014-10-20 10:49 UTC (permalink / raw)
  To: Pablo Neira Ayuso, netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1413792639-3954-2-git-send-email-pablo@netfilter.org>

Hello.

On 10/20/2014 12:10 PM, Pablo Neira Ayuso wrote:

> [   23.545204] nf_reject_ipv4: module license 'unspecified' taints kernel.

> Fixes: c8d7b98 ("netfilter: move nf_send_resetX() code to nf_reject_ipvX modules")
> Reported-by: Dave Young <dyoung@redhat.com>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
>   net/ipv4/netfilter/nf_reject_ipv4.c |    3 +++
>   net/ipv6/netfilter/nf_reject_ipv6.c |    4 ++++
>   2 files changed, 7 insertions(+)

[...]
> diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
> index 5f5f043..20d9def 100644
> --- a/net/ipv6/netfilter/nf_reject_ipv6.c
> +++ b/net/ipv6/netfilter/nf_reject_ipv6.c
> @@ -5,6 +5,8 @@
>    * it under the terms of the GNU General Public License version 2 as
>    * published by the Free Software Foundation.
>    */
> +
> +#include <linux/module.h>
>   #include <net/ipv6.h>
>   #include <net/ip6_route.h>
>   #include <net/ip6_fib.h>
> @@ -161,3 +163,5 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
>   		ip6_local_out(nskb);
>   }
>   EXPORT_SYMBOL_GPL(nf_send_reset6);
> +
> +MODULE_LICENSE("GPL");

    Actually, "GPL v2" as follows from the comment above.

WBR, Sergei

^ permalink raw reply

* Re: [PATCH net-next] Allow to set net namespace for wireless device via RTM_LINK
From: Johannes Berg @ 2014-10-20 10:52 UTC (permalink / raw)
  To: Marcel Holtmann
  Cc: vadim4j-Re5JQEeQqe8AvxtiuMwx3w, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <128188DF-5A9B-47A5-8A89-974CF7CF9064-kz+m5ild9QBg9hUCZPvPmw@public.gmane.org>

On Mon, 2014-10-20 at 12:46 +0200, Marcel Holtmann wrote:

> Maybe relaxing the check and allow ip link to move a wireless netdev
> into a namespace (and having the wiphy follow) could be allowed if it
> is the only netdev or the original wlan0 that each wiphy creates. I
> really do not know if this is worth it, but for some simpler container
> cases it could be beneficial if RTNL can be used instead of having to
> go through nl80211.

The thought crossed my mind, but

1) it's relatively complex, though by no means impossible
2) it still moves more than you bargained for, since in theory the wiphy
could be
   used to create new interfaces etc.

That said, I'm much more inclined to believe such a patch would be
worthwhile than the original.

johannes

--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 1/7] netfilter: missing module license in the nf_reject_ipvX modules
From: Pablo Neira Ayuso @ 2014-10-20 10:54 UTC (permalink / raw)
  To: Sergei Shtylyov; +Cc: netfilter-devel, davem, netdev
In-Reply-To: <5444E8BF.20904@cogentembedded.com>

On Mon, Oct 20, 2014 at 02:49:35PM +0400, Sergei Shtylyov wrote:
> Hello.
> 
> On 10/20/2014 12:10 PM, Pablo Neira Ayuso wrote:
> 
> >[   23.545204] nf_reject_ipv4: module license 'unspecified' taints kernel.
> 
> >Fixes: c8d7b98 ("netfilter: move nf_send_resetX() code to nf_reject_ipvX modules")
> >Reported-by: Dave Young <dyoung@redhat.com>
> >Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> >---
> >  net/ipv4/netfilter/nf_reject_ipv4.c |    3 +++
> >  net/ipv6/netfilter/nf_reject_ipv6.c |    4 ++++
> >  2 files changed, 7 insertions(+)
> 
> [...]
> >diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
> >index 5f5f043..20d9def 100644
> >--- a/net/ipv6/netfilter/nf_reject_ipv6.c
> >+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
> >@@ -5,6 +5,8 @@
> >   * it under the terms of the GNU General Public License version 2 as
> >   * published by the Free Software Foundation.
> >   */
> >+
> >+#include <linux/module.h>
> >  #include <net/ipv6.h>
> >  #include <net/ip6_route.h>
> >  #include <net/ip6_fib.h>
> >@@ -161,3 +163,5 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
> >  		ip6_local_out(nskb);
> >  }
> >  EXPORT_SYMBOL_GPL(nf_send_reset6);
> >+
> >+MODULE_LICENSE("GPL");
> 
>    Actually, "GPL v2" as follows from the comment above.

We use that in all of the netfilter modules.

^ permalink raw reply

* [PATCH v2 0/3] net: minor gso encapsulation fixes
From: Florian Westphal @ 2014-10-20 11:49 UTC (permalink / raw)
  To: netdev; +Cc: edumazet, therbert

The following series fixes a minor bug in the gso segmentation handlers
when encapsulation offload is used.

Theoretically this could cause kernel panic when the stack tries
to software-segment such a GRE offload packet, but it looks like there
is only one affected call site (tbf scheduler) and it handles NULL
return value.

I've included a followup patch to add IS_ERR_OR_NULL checks where needed.

While looking into this, I also found that size computation of the individual
segments is incorrect if skb->encapsulation is set.

Please see individual patches for delta vs. v1.

 core/skbuff.c                    |   13 ++++++++++---
 ipv4/af_inet.c                   |    2 +-
 ipv4/gre_offload.c               |    2 +-
 ipv4/ip_output.c                 |    2 +-
 ipv4/udp_offload.c               |    2 +-
 ipv6/ip6_offload.c               |    2 +-
 mpls/mpls_gso.c                  |    2 +-
 netfilter/nfnetlink_queue_core.c |    2 +-
 openvswitch/datapath.c           |    2 ++
 xfrm/xfrm_output.c               |    2 ++
 10 files changed, 21 insertions(+), 10 deletions(-)

^ permalink raw reply

* [PATCH v2 1/3] net: gso: use feature flag argument in all protocol gso handlers
From: Florian Westphal @ 2014-10-20 11:49 UTC (permalink / raw)
  To: netdev; +Cc: edumazet, therbert, Florian Westphal, Pravin B Shelar
In-Reply-To: <1413805758-30026-1-git-send-email-fw@strlen.de>

skb_gso_segment() has a 'features' argument representing offload features
available to the output path.

A few handlers, e.g. GRE, instead re-fetch the features of skb->dev and use
those instead of the provided ones when handing encapsulation/tunnels.

Depending on dev->hw_enc_features of the output device skb_gso_segment() can
then return NULL even when the caller has disabled all GSO feature bits,
as segmentation of inner header thinks device will take care of segmentation.

This e.g. affects the tbf scheduler, which will silently drop GRE-encap GSO skbs
that did not fit the remaining token quota as the segmentation does not work
when device supports corresponding hw offload capabilities.

Cc: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 No changes since v1.

 net/ipv4/af_inet.c     | 2 +-
 net/ipv4/gre_offload.c | 2 +-
 net/ipv4/udp_offload.c | 2 +-
 net/ipv6/ip6_offload.c | 2 +-
 net/mpls/mpls_gso.c    | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 92db7a6..8b7fe5b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1246,7 +1246,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 
 	encap = SKB_GSO_CB(skb)->encap_level > 0;
 	if (encap)
-		features = skb->dev->hw_enc_features & netif_skb_features(skb);
+		features &= skb->dev->hw_enc_features;
 	SKB_GSO_CB(skb)->encap_level += ihl;
 
 	skb_reset_transport_header(skb);
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index ccda096..f6e345c 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -68,7 +68,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	skb->mac_len = skb_inner_network_offset(skb);
 
 	/* segment inner packet. */
-	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+	enc_features = skb->dev->hw_enc_features & features;
 	segs = skb_mac_gso_segment(skb, enc_features);
 	if (IS_ERR_OR_NULL(segs)) {
 		skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 507310e..6480cea 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -58,7 +58,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		skb->encap_hdr_csum = 1;
 
 	/* segment inner packet. */
-	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+	enc_features = skb->dev->hw_enc_features & features;
 	segs = gso_inner_segment(skb, enc_features);
 	if (IS_ERR_OR_NULL(segs)) {
 		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 91014d3..a071563 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -90,7 +90,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 
 	encap = SKB_GSO_CB(skb)->encap_level > 0;
 	if (encap)
-		features = skb->dev->hw_enc_features & netif_skb_features(skb);
+		features &= skb->dev->hw_enc_features;
 	SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h);
 
 	ipv6h = ipv6_hdr(skb);
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index e28ed2e..f0f5309 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -48,7 +48,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
 	__skb_push(skb, skb->mac_len);
 
 	/* Segment inner packet. */
-	mpls_features = skb->dev->mpls_features & netif_skb_features(skb);
+	mpls_features = skb->dev->mpls_features & features;
 	segs = skb_mac_gso_segment(skb, mpls_features);
 
 
-- 
2.0.4

^ permalink raw reply related

* [PATCH v2 2/3] net: make skb_gso_segment error handling more robust
From: Florian Westphal @ 2014-10-20 11:49 UTC (permalink / raw)
  To: netdev; +Cc: edumazet, therbert, Florian Westphal
In-Reply-To: <1413805758-30026-1-git-send-email-fw@strlen.de>

skb_gso_segment has three possible return values:
1. a pointer to the first segmented skb
2. an errno value (IS_ERR())
3. NULL.  This can happen when GSO is used for header verification.

However, several callers currently test IS_ERR instead of IS_ERR_OR_NULL
and would oops when NULL is returned.

Note that these call sites should never actually see such a NULL return
value; all callers mask out the GSO bits in the feature argument.

However, there have been issues with some protocol handlers erronously not
respecting the specified feature mask in some cases.

It is preferable to get 'have to turn off hw offloading, else slow' reports
rather than 'kernel crashes'.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 Changes since v1:
  - don't return PTR_ERR(NULL).
  - update changelog to clarify that segs == NULL 'should not happen'.

 net/ipv4/ip_output.c                 | 2 +-
 net/netfilter/nfnetlink_queue_core.c | 2 +-
 net/openvswitch/datapath.c           | 2 ++
 net/xfrm/xfrm_output.c               | 2 ++
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 88e5ef2..bc6471d 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -231,7 +231,7 @@ static int ip_finish_output_gso(struct sk_buff *skb)
 	 */
 	features = netif_skb_features(skb);
 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
-	if (IS_ERR(segs)) {
+	if (IS_ERR_OR_NULL(segs)) {
 		kfree_skb(skb);
 		return -ENOMEM;
 	}
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index a82077d..7c60ccd 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -665,7 +665,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 	 * returned by nf_queue.  For instance, callers rely on -ECANCELED to
 	 * mean 'ignore this hook'.
 	 */
-	if (IS_ERR(segs))
+	if (IS_ERR_OR_NULL(segs))
 		goto out_err;
 	queued = 0;
 	err = 0;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 2e31d9e..e6d7255 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -324,6 +324,8 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
 	segs = __skb_gso_segment(skb, NETIF_F_SG, false);
 	if (IS_ERR(segs))
 		return PTR_ERR(segs);
+	if (segs == NULL)
+		return -EINVAL;
 
 	/* Queue all of the segments. */
 	skb = segs;
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 499d6c1..7c53285 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -157,6 +157,8 @@ static int xfrm_output_gso(struct sk_buff *skb)
 	kfree_skb(skb);
 	if (IS_ERR(segs))
 		return PTR_ERR(segs);
+	if (segs == NULL)
+		return -EINVAL;
 
 	do {
 		struct sk_buff *nskb = segs->next;
-- 
2.0.4

^ permalink raw reply related

* [PATCH v2 3/3] net: core: handle encapsulation offloads when computing segment lengths
From: Florian Westphal @ 2014-10-20 11:49 UTC (permalink / raw)
  To: netdev; +Cc: edumazet, therbert, Florian Westphal
In-Reply-To: <1413805758-30026-1-git-send-email-fw@strlen.de>

if ->encapsulation is set we have to use inner_tcp_hdrlen and add the
size of the inner network headers too.

This is 'mostly harmless'; tbf might send skb that is slightly over
quota or drop skb even if it would have fit.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 No changes since v1.

 net/core/skbuff.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 61059a0..c16615b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4070,15 +4070,22 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet);
 unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
 {
 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
+	unsigned int thlen = 0;
 
-	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
-		return tcp_hdrlen(skb) + shinfo->gso_size;
+	if (skb->encapsulation) {
+		thlen = skb_inner_transport_header(skb) -
+			skb_transport_header(skb);
 
+		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+			thlen += inner_tcp_hdrlen(skb);
+	} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+		thlen = tcp_hdrlen(skb);
+	}
 	/* UFO sets gso_size to the size of the fragmentation
 	 * payload, i.e. the size of the L4 (UDP) header is already
 	 * accounted for.
 	 */
-	return shinfo->gso_size;
+	return thlen + shinfo->gso_size;
 }
 EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
 
-- 
2.0.4

^ permalink raw reply related

* Re: [PATCH v4 13/25] virtio_console: enable VQs early
From: Thomas Graf @ 2014-10-20 12:07 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kernel, Rusty Russell, virtualization, linux-scsi,
	linux-s390, v9fs-developer, netdev, kvm, Amit Shah, Cornelia Huck,
	Christian Borntraeger, David S. Miller, Paolo Bonzini,
	Arnd Bergmann, Greg Kroah-Hartman
In-Reply-To: <1413114332-626-14-git-send-email-mst-v4@redhat.com>

On 10/13/14 at 10:50am, Michael S. Tsirkin wrote:
> virtio spec requires drivers to set DRIVER_OK before using VQs.
> This is set automatically after probe returns, virtio console violated this
> rule by adding inbufs, which causes the VQ to be used directly within
> probe.
> 
> To fix, call virtio_device_ready before using VQs.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  drivers/char/virtio_console.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
> index b585b47..6ebe8f6 100644
> --- a/drivers/char/virtio_console.c
> +++ b/drivers/char/virtio_console.c
> @@ -1449,6 +1449,8 @@ static int add_port(struct ports_device *portdev, u32 id)
>  	spin_lock_init(&port->outvq_lock);
>  	init_waitqueue_head(&port->waitqueue);
>  
> +	virtio_device_ready(portdev->vdev);
> +
>  	/* Fill the in_vq with buffers so the host can send us data. */
>  	nr_added_bufs = fill_queue(port->in_vq, &port->inbuf_lock);
>  	if (!nr_added_bufs) {

Seems like probe and add_port() now both set VIRTIO_CONFIG_S_DRIVER_OK

    1.839658] kernel BUG at include/linux/virtio_config.h:125!
[    1.839995] invalid opcode: 0000 [#1] SMP 
[    1.840169] Modules linked in: serio_raw virtio_balloon pcspkr virtio_net virtio_console soundcore parport_pc floppy pvpanic parport i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc qxl drm_kms_helper ttm drm virtio_blk i2c_core virtio_pci ata_generic virtio_ring virtio pata_acpi
[    1.840169] CPU: 2 PID: 266 Comm: kworker/2:2 Not tainted 3.17.0+ #1
[    1.840169] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[    1.840169] Workqueue: events control_work_handler [virtio_console]
[    1.840169] task: ffff8800364bc840 ti: ffff880078490000 task.ti: ffff880078490000
[    1.840169] RIP: 0010:[<ffffffffa01d92c6>]  [<ffffffffa01d92c6>] virtio_device_ready.part.12+0x4/0x6 [virtio_console]
[    1.840169] RSP: 0018:ffff880078493c78  EFLAGS: 00010202
[    1.840169] RAX: 0000000000000007 RBX: ffff880036406200 RCX: 0000000000006e39
[    1.840169] RDX: 000000000000c0b2 RSI: 0000000000000000 RDI: 000000000001c0b2
[    1.840169] RBP: ffff880078493c78 R08: ffffffff81d2c6f8 R09: 0000000000000000
[    1.840169] R10: 0000000000000001 R11: ffff8800364bd000 R12: ffff880036618400
[    1.840169] R13: 0000000000000001 R14: ffff8800368c6800 R15: ffff880036406220
[    1.840169] FS:  0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000
[    1.840169] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[    1.840169] CR2: 00007f1c31c90000 CR3: 0000000001c14000 CR4: 00000000000006e0
[    1.840169] Stack:
[    1.840169]  ffff880078493ce8 ffffffffa01d886a ffff880000000001 ffffffff810e20cd
[    1.840169]  ffff880078493cb8 0000000000000282 0000000000000000 0000000087f90194
[    1.840169]  ffff880078493ce8 ffff88007ab1d4e0 ffff880036618498 ffff880036618410
[    1.840169] Call Trace:
[    1.840169]  [<ffffffffa01d886a>] add_port+0x40a/0x440 [virtio_console]
[    1.840169]  [<ffffffff810e20cd>] ? trace_hardirqs_on+0xd/0x10
[    1.840169]  [<ffffffffa01d8c6a>] control_work_handler+0x3ca/0x420 [virtio_console]
[    1.840169]  [<ffffffff810b0e7b>] ? process_one_work+0x15b/0x530
[    1.840169]  [<ffffffff810b0ef4>] process_one_work+0x1d4/0x530
[    1.840169]  [<ffffffff810b0e7b>] ? process_one_work+0x15b/0x530
[    1.840169]  [<ffffffff810b136b>] worker_thread+0x11b/0x490
[    1.840169]  [<ffffffff810b1250>] ? process_one_work+0x530/0x530
[    1.840169]  [<ffffffff810b67c3>] kthread+0xf3/0x110
[    1.840169]  [<ffffffff81788f00>] ? _raw_spin_unlock_irq+0x30/0x50
[    1.840169]  [<ffffffff810b66d0>] ? kthread_create_on_node+0x240/0x240
[    1.840169]  [<ffffffff81789a7c>] ret_from_fork+0x7c/0xb0
[    1.840169]  [<ffffffff810b66d0>] ? kthread_create_on_node+0x240/0x240
[    1.840169] Code: ff 49 89 c4 4d 85 e4 0f 8f 25 ff ff ff eb ad 48 c7 c0 f4 ff ff ff e9 17 ff ff ff e8 f5 cd eb e0 90 55 48 89 e5 0f 0b 55 48 89 e5 <0f> 0b 55 48 89 e5 0f 0b 55 48 89 e5 e8 99 e2 ff ff 48 c7 c7 c0 
[    1.840169] RIP  [<ffffffffa01d92c6>] virtio_device_ready.part.12+0x4/0x6 [virtio_console]
[    1.840169]  RSP <ffff880078493c78>

^ permalink raw reply

* Re: [PATCH stable v3.2 v3.4] ipv4: disable bh while doing route gc
From: Ben Hutchings @ 2014-10-20 12:23 UTC (permalink / raw)
  To: David Miller; +Cc: mleitner, stable, netdev, hannes
In-Reply-To: <20141020.002308.1276148768580405795.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 1188 bytes --]

On Mon, 2014-10-20 at 00:23 -0400, David Miller wrote:
> From: Ben Hutchings <ben@decadent.org.uk>
> Date: Mon, 20 Oct 2014 04:09:41 +0100
> 
> > I've appplied this and the previous two patches mentioned ('ipv4: move
> > route garbage collector to work queue' and 'ipv4: avoid parallel route
> > cache gc executions').  But I didn't get the other two from you.  The
> > last batch of networking fixes I received and applied was dated
> > 2014-08-07, and the next one I've seen is dated 2014-10-11 and has
> > nothing for 3.2 or 3.4.  Did I miss one between these?
> 
> I'm at the point where I'm personally not going to go back more than
> four releases, anything more than that is rediculous.
> 
> And this time that was 3.17, 3.16, 3.14, and 3.10

OK.  I appreciate all the work you've done to backport to 3.2, but would
also have appreciated an explicit note to say you were dropping the
earlier versions.

Thanks, Ben.

-- 
Ben Hutchings
[W]e found...that it wasn't as easy to get programs right as we had thought.
... I realized that a large part of my life from then on was going to be spent
in finding mistakes in my own programs. - Maurice Wilkes, 1949

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 811 bytes --]

^ permalink raw reply

* Re: [PATCH v4 13/25] virtio_console: enable VQs early
From: Cornelia Huck @ 2014-10-20 12:42 UTC (permalink / raw)
  To: Thomas Graf
  Cc: linux-s390, kvm, linux-scsi, Michael S. Tsirkin, netdev,
	linux-kernel, virtualization, Christian Borntraeger,
	Greg Kroah-Hartman, Arnd Bergmann, Paolo Bonzini, Amit Shah,
	v9fs-developer, David S. Miller
In-Reply-To: <20141020120750.GA18561@casper.infradead.org>

On Mon, 20 Oct 2014 13:07:50 +0100
Thomas Graf <tgraf@suug.ch> wrote:

> On 10/13/14 at 10:50am, Michael S. Tsirkin wrote:
> > virtio spec requires drivers to set DRIVER_OK before using VQs.
> > This is set automatically after probe returns, virtio console violated this
> > rule by adding inbufs, which causes the VQ to be used directly within
> > probe.
> > 
> > To fix, call virtio_device_ready before using VQs.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> >  drivers/char/virtio_console.c | 2 ++
> >  1 file changed, 2 insertions(+)
> > 
> > diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
> > index b585b47..6ebe8f6 100644
> > --- a/drivers/char/virtio_console.c
> > +++ b/drivers/char/virtio_console.c
> > @@ -1449,6 +1449,8 @@ static int add_port(struct ports_device *portdev, u32 id)
> >  	spin_lock_init(&port->outvq_lock);
> >  	init_waitqueue_head(&port->waitqueue);
> >  
> > +	virtio_device_ready(portdev->vdev);
> > +
> >  	/* Fill the in_vq with buffers so the host can send us data. */
> >  	nr_added_bufs = fill_queue(port->in_vq, &port->inbuf_lock);
> >  	if (!nr_added_bufs) {
> 
> Seems like probe and add_port() now both set VIRTIO_CONFIG_S_DRIVER_OK

I think we need to set this in the probe function instead, otherwise we
fail for multiqueue (which also wants to use the control queue early).

Completely untested patch below; I can send this with proper s-o-b if
it helps.

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index bfa6400..cf7a561 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -1449,8 +1449,6 @@ static int add_port(struct ports_device *portdev, u32 id)
 	spin_lock_init(&port->outvq_lock);
 	init_waitqueue_head(&port->waitqueue);
 
-	virtio_device_ready(portdev->vdev);
-
 	/* Fill the in_vq with buffers so the host can send us data. */
 	nr_added_bufs = fill_queue(port->in_vq, &port->inbuf_lock);
 	if (!nr_added_bufs) {
@@ -2026,6 +2024,8 @@ static int virtcons_probe(struct virtio_device *vdev)
 	spin_lock_init(&portdev->ports_lock);
 	INIT_LIST_HEAD(&portdev->ports);
 
+	virtio_device_ready(portdev->vdev);
+
 	if (multiport) {
 		unsigned int nr_added_bufs;

^ permalink raw reply related

* Re: [PATCH v4 13/25] virtio_console: enable VQs early
From: Michael S. Tsirkin @ 2014-10-20 13:10 UTC (permalink / raw)
  To: Thomas Graf
  Cc: linux-s390, kvm, linux-scsi, Christian Borntraeger, netdev,
	linux-kernel, virtualization, Greg Kroah-Hartman, Arnd Bergmann,
	Paolo Bonzini, Amit Shah, v9fs-developer, David S. Miller
In-Reply-To: <20141020120750.GA18561@casper.infradead.org>

On Mon, Oct 20, 2014 at 01:07:50PM +0100, Thomas Graf wrote:
> On 10/13/14 at 10:50am, Michael S. Tsirkin wrote:
> > virtio spec requires drivers to set DRIVER_OK before using VQs.
> > This is set automatically after probe returns, virtio console violated this
> > rule by adding inbufs, which causes the VQ to be used directly within
> > probe.
> > 
> > To fix, call virtio_device_ready before using VQs.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> >  drivers/char/virtio_console.c | 2 ++
> >  1 file changed, 2 insertions(+)
> > 
> > diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
> > index b585b47..6ebe8f6 100644
> > --- a/drivers/char/virtio_console.c
> > +++ b/drivers/char/virtio_console.c
> > @@ -1449,6 +1449,8 @@ static int add_port(struct ports_device *portdev, u32 id)
> >  	spin_lock_init(&port->outvq_lock);
> >  	init_waitqueue_head(&port->waitqueue);
> >  
> > +	virtio_device_ready(portdev->vdev);
> > +
> >  	/* Fill the in_vq with buffers so the host can send us data. */
> >  	nr_added_bufs = fill_queue(port->in_vq, &port->inbuf_lock);
> >  	if (!nr_added_bufs) {

I see Cornelia sent a patch already.
I'd like to reproduce this though - could you send me
the command line please?


> Seems like probe and add_port() now both set VIRTIO_CONFIG_S_DRIVER_OK
> 
>     1.839658] kernel BUG at include/linux/virtio_config.h:125!
> [    1.839995] invalid opcode: 0000 [#1] SMP 
> [    1.840169] Modules linked in: serio_raw virtio_balloon pcspkr virtio_net virtio_console soundcore parport_pc floppy pvpanic parport i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc qxl drm_kms_helper ttm drm virtio_blk i2c_core virtio_pci ata_generic virtio_ring virtio pata_acpi
> [    1.840169] CPU: 2 PID: 266 Comm: kworker/2:2 Not tainted 3.17.0+ #1
> [    1.840169] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> [    1.840169] Workqueue: events control_work_handler [virtio_console]
> [    1.840169] task: ffff8800364bc840 ti: ffff880078490000 task.ti: ffff880078490000
> [    1.840169] RIP: 0010:[<ffffffffa01d92c6>]  [<ffffffffa01d92c6>] virtio_device_ready.part.12+0x4/0x6 [virtio_console]
> [    1.840169] RSP: 0018:ffff880078493c78  EFLAGS: 00010202
> [    1.840169] RAX: 0000000000000007 RBX: ffff880036406200 RCX: 0000000000006e39
> [    1.840169] RDX: 000000000000c0b2 RSI: 0000000000000000 RDI: 000000000001c0b2
> [    1.840169] RBP: ffff880078493c78 R08: ffffffff81d2c6f8 R09: 0000000000000000
> [    1.840169] R10: 0000000000000001 R11: ffff8800364bd000 R12: ffff880036618400
> [    1.840169] R13: 0000000000000001 R14: ffff8800368c6800 R15: ffff880036406220
> [    1.840169] FS:  0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000
> [    1.840169] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [    1.840169] CR2: 00007f1c31c90000 CR3: 0000000001c14000 CR4: 00000000000006e0
> [    1.840169] Stack:
> [    1.840169]  ffff880078493ce8 ffffffffa01d886a ffff880000000001 ffffffff810e20cd
> [    1.840169]  ffff880078493cb8 0000000000000282 0000000000000000 0000000087f90194
> [    1.840169]  ffff880078493ce8 ffff88007ab1d4e0 ffff880036618498 ffff880036618410
> [    1.840169] Call Trace:
> [    1.840169]  [<ffffffffa01d886a>] add_port+0x40a/0x440 [virtio_console]
> [    1.840169]  [<ffffffff810e20cd>] ? trace_hardirqs_on+0xd/0x10
> [    1.840169]  [<ffffffffa01d8c6a>] control_work_handler+0x3ca/0x420 [virtio_console]
> [    1.840169]  [<ffffffff810b0e7b>] ? process_one_work+0x15b/0x530
> [    1.840169]  [<ffffffff810b0ef4>] process_one_work+0x1d4/0x530
> [    1.840169]  [<ffffffff810b0e7b>] ? process_one_work+0x15b/0x530
> [    1.840169]  [<ffffffff810b136b>] worker_thread+0x11b/0x490
> [    1.840169]  [<ffffffff810b1250>] ? process_one_work+0x530/0x530
> [    1.840169]  [<ffffffff810b67c3>] kthread+0xf3/0x110
> [    1.840169]  [<ffffffff81788f00>] ? _raw_spin_unlock_irq+0x30/0x50
> [    1.840169]  [<ffffffff810b66d0>] ? kthread_create_on_node+0x240/0x240
> [    1.840169]  [<ffffffff81789a7c>] ret_from_fork+0x7c/0xb0
> [    1.840169]  [<ffffffff810b66d0>] ? kthread_create_on_node+0x240/0x240
> [    1.840169] Code: ff 49 89 c4 4d 85 e4 0f 8f 25 ff ff ff eb ad 48 c7 c0 f4 ff ff ff e9 17 ff ff ff e8 f5 cd eb e0 90 55 48 89 e5 0f 0b 55 48 89 e5 <0f> 0b 55 48 89 e5 0f 0b 55 48 89 e5 e8 99 e2 ff ff 48 c7 c7 c0 
> [    1.840169] RIP  [<ffffffffa01d92c6>] virtio_device_ready.part.12+0x4/0x6 [virtio_console]
> [    1.840169]  RSP <ffff880078493c78>

^ permalink raw reply

* Re: [PATCH v4 13/25] virtio_console: enable VQs early
From: Thomas Graf @ 2014-10-20 13:10 UTC (permalink / raw)
  To: Cornelia Huck
  Cc: Michael S. Tsirkin, linux-kernel, Rusty Russell, virtualization,
	linux-scsi, linux-s390, v9fs-developer, netdev, kvm, Amit Shah,
	Christian Borntraeger, David S. Miller, Paolo Bonzini,
	Arnd Bergmann, Greg Kroah-Hartman
In-Reply-To: <20141020144223.5655dcbc.cornelia.huck@de.ibm.com>

On 10/20/14 at 02:42pm, Cornelia Huck wrote:
> On Mon, 20 Oct 2014 13:07:50 +0100
> Thomas Graf <tgraf@suug.ch> wrote:
> 
> > On 10/13/14 at 10:50am, Michael S. Tsirkin wrote:
> > > virtio spec requires drivers to set DRIVER_OK before using VQs.
> > > This is set automatically after probe returns, virtio console violated this
> > > rule by adding inbufs, which causes the VQ to be used directly within
> > > probe.
> > > 
> > > To fix, call virtio_device_ready before using VQs.
> > > 
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > >  drivers/char/virtio_console.c | 2 ++
> > >  1 file changed, 2 insertions(+)
> > > 
> > > diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
> > > index b585b47..6ebe8f6 100644
> > > --- a/drivers/char/virtio_console.c
> > > +++ b/drivers/char/virtio_console.c
> > > @@ -1449,6 +1449,8 @@ static int add_port(struct ports_device *portdev, u32 id)
> > >  	spin_lock_init(&port->outvq_lock);
> > >  	init_waitqueue_head(&port->waitqueue);
> > >  
> > > +	virtio_device_ready(portdev->vdev);
> > > +
> > >  	/* Fill the in_vq with buffers so the host can send us data. */
> > >  	nr_added_bufs = fill_queue(port->in_vq, &port->inbuf_lock);
> > >  	if (!nr_added_bufs) {
> > 
> > Seems like probe and add_port() now both set VIRTIO_CONFIG_S_DRIVER_OK
> 
> I think we need to set this in the probe function instead, otherwise we
> fail for multiqueue (which also wants to use the control queue early).
> 
> Completely untested patch below; I can send this with proper s-o-b if
> it helps.

virtio_dev_probe() already sets DRIVER_OK if ->probe() returned
without an error. I assume Michael added it to add_port() because
probing doesn't always occur first. Can we just remove the BUG_ON()?

^ permalink raw reply

* Re: [PATCH v4 13/25] virtio_console: enable VQs early
From: Thomas Graf @ 2014-10-20 13:12 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kernel, Rusty Russell, virtualization, linux-scsi,
	linux-s390, v9fs-developer, netdev, kvm, Amit Shah, Cornelia Huck,
	Christian Borntraeger, David S. Miller, Paolo Bonzini,
	Arnd Bergmann, Greg Kroah-Hartman
In-Reply-To: <20141020131016.GA20489@redhat.com>

On 10/20/14 at 04:10pm, Michael S. Tsirkin wrote:
> On Mon, Oct 20, 2014 at 01:07:50PM +0100, Thomas Graf wrote:
> > On 10/13/14 at 10:50am, Michael S. Tsirkin wrote:
> > > virtio spec requires drivers to set DRIVER_OK before using VQs.
> > > This is set automatically after probe returns, virtio console violated this
> > > rule by adding inbufs, which causes the VQ to be used directly within
> > > probe.
> > > 
> > > To fix, call virtio_device_ready before using VQs.
> > > 
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > >  drivers/char/virtio_console.c | 2 ++
> > >  1 file changed, 2 insertions(+)
> > > 
> > > diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
> > > index b585b47..6ebe8f6 100644
> > > --- a/drivers/char/virtio_console.c
> > > +++ b/drivers/char/virtio_console.c
> > > @@ -1449,6 +1449,8 @@ static int add_port(struct ports_device *portdev, u32 id)
> > >  	spin_lock_init(&port->outvq_lock);
> > >  	init_waitqueue_head(&port->waitqueue);
> > >  
> > > +	virtio_device_ready(portdev->vdev);
> > > +
> > >  	/* Fill the in_vq with buffers so the host can send us data. */
> > >  	nr_added_bufs = fill_queue(port->in_vq, &port->inbuf_lock);
> > >  	if (!nr_added_bufs) {
> 
> I see Cornelia sent a patch already.
> I'd like to reproduce this though - could you send me
> the command line please?

1. Invoke qemu:

/usr/bin/qemu-system-x86_64 -machine accel=kvm -name f20-2 -S -machine
pc-i440fx-1.4,accel=kvm,usb=off -m 2048 -realtime mlock=off -smp
4,sockets=4,cores=1,threads=1 -uuid
dd45ec4c-7c26-4b73-9b6b-81f2912c5235 -no-user-config -nodefaults
-chardev
socket,id=charmonitor,path=/var/lib/libvirt/qemu/f20-2.monitor,server,nowait
-mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc
-no-shutdown -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2
-device virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x5 -drive
file=/virt/f20n2-clone,if=none,id=drive-virtio-disk0,format=raw
-device
virtio-blk-pci,scsi=off,bus=pci.0,addr=0x6,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
-netdev tap,fd=24,id=hostnet0,vhost=on,vhostfd=25 -device
virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:ef:72:4e,bus=pci.0,addr=0x3
-chardev pty,id=charserial0 -device
isa-serial,chardev=charserial0,id=serial0 -chardev
spicevmc,id=charchannel0,name=vdagent -device
virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,id=channel0,name=com.redhat.spice.0
-device usb-tablet,id=input0 -spice
port=5900,addr=127.0.0.1,disable-ticketing,seamless-migration=on
-device
qxl-vga,id=video0,ram_size=67108864,vram_size=67108864,bus=pci.0,addr=0x2
-device intel-hda,id=sound0,bus=pci.0,addr=0x4 -device
hda-duplex,id=sound0-codec0,bus=sound0.0,cad=0 -device
virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x7

2. Attach console right away: virsh console f20-2

^ permalink raw reply

* Re: [PATCH v4 13/25] virtio_console: enable VQs early
From: Michael S. Tsirkin @ 2014-10-20 13:14 UTC (permalink / raw)
  To: Thomas Graf
  Cc: linux-s390, kvm, linux-scsi, Christian Borntraeger, netdev,
	linux-kernel, virtualization, Greg Kroah-Hartman, Arnd Bergmann,
	Paolo Bonzini, Amit Shah, v9fs-developer, David S. Miller
In-Reply-To: <20141020131016.GA20489@redhat.com>

On Mon, Oct 20, 2014 at 04:10:16PM +0300, Michael S. Tsirkin wrote:
> On Mon, Oct 20, 2014 at 01:07:50PM +0100, Thomas Graf wrote:
> > On 10/13/14 at 10:50am, Michael S. Tsirkin wrote:
> > > virtio spec requires drivers to set DRIVER_OK before using VQs.
> > > This is set automatically after probe returns, virtio console violated this
> > > rule by adding inbufs, which causes the VQ to be used directly within
> > > probe.
> > > 
> > > To fix, call virtio_device_ready before using VQs.
> > > 
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > >  drivers/char/virtio_console.c | 2 ++
> > >  1 file changed, 2 insertions(+)
> > > 
> > > diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
> > > index b585b47..6ebe8f6 100644
> > > --- a/drivers/char/virtio_console.c
> > > +++ b/drivers/char/virtio_console.c
> > > @@ -1449,6 +1449,8 @@ static int add_port(struct ports_device *portdev, u32 id)
> > >  	spin_lock_init(&port->outvq_lock);
> > >  	init_waitqueue_head(&port->waitqueue);
> > >  
> > > +	virtio_device_ready(portdev->vdev);
> > > +
> > >  	/* Fill the in_vq with buffers so the host can send us data. */
> > >  	nr_added_bufs = fill_queue(port->in_vq, &port->inbuf_lock);
> > >  	if (!nr_added_bufs) {
> 
> I see Cornelia sent a patch already.
> I'd like to reproduce this though - could you send me
> the command line please?

Nevermind, the trick is to add a port it seems:

-device virtio-serial -chardev socket,path=/tmp/c1,server,nowait,id=foo
-device virtserialport,chardev=foo,name=org.fedoraproject.port.0

works fine without -device virtserialport.

-- 
MST

^ permalink raw reply

* Re: [PATCH v4 13/25] virtio_console: enable VQs early
From: Michael S. Tsirkin @ 2014-10-20 13:35 UTC (permalink / raw)
  To: Cornelia Huck
  Cc: linux-s390, kvm, linux-scsi, Christian Borntraeger, netdev,
	linux-kernel, virtualization, Amit Shah, Greg Kroah-Hartman,
	Arnd Bergmann, Paolo Bonzini, Thomas Graf, v9fs-developer,
	David S. Miller
In-Reply-To: <20141020144223.5655dcbc.cornelia.huck@de.ibm.com>

On Mon, Oct 20, 2014 at 02:42:23PM +0200, Cornelia Huck wrote:
> On Mon, 20 Oct 2014 13:07:50 +0100
> Thomas Graf <tgraf@suug.ch> wrote:
> 
> > On 10/13/14 at 10:50am, Michael S. Tsirkin wrote:
> > > virtio spec requires drivers to set DRIVER_OK before using VQs.
> > > This is set automatically after probe returns, virtio console violated this
> > > rule by adding inbufs, which causes the VQ to be used directly within
> > > probe.
> > > 
> > > To fix, call virtio_device_ready before using VQs.
> > > 
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > >  drivers/char/virtio_console.c | 2 ++
> > >  1 file changed, 2 insertions(+)
> > > 
> > > diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
> > > index b585b47..6ebe8f6 100644
> > > --- a/drivers/char/virtio_console.c
> > > +++ b/drivers/char/virtio_console.c
> > > @@ -1449,6 +1449,8 @@ static int add_port(struct ports_device *portdev, u32 id)
> > >  	spin_lock_init(&port->outvq_lock);
> > >  	init_waitqueue_head(&port->waitqueue);
> > >  
> > > +	virtio_device_ready(portdev->vdev);
> > > +
> > >  	/* Fill the in_vq with buffers so the host can send us data. */
> > >  	nr_added_bufs = fill_queue(port->in_vq, &port->inbuf_lock);
> > >  	if (!nr_added_bufs) {
> > 
> > Seems like probe and add_port() now both set VIRTIO_CONFIG_S_DRIVER_OK
> 
> I think we need to set this in the probe function instead, otherwise we
> fail for multiqueue (which also wants to use the control queue early).
> 
> Completely untested patch below; I can send this with proper s-o-b if
> it helps.
> 
> diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
> index bfa6400..cf7a561 100644
> --- a/drivers/char/virtio_console.c
> +++ b/drivers/char/virtio_console.c
> @@ -1449,8 +1449,6 @@ static int add_port(struct ports_device *portdev, u32 id)
>  	spin_lock_init(&port->outvq_lock);
>  	init_waitqueue_head(&port->waitqueue);
>  
> -	virtio_device_ready(portdev->vdev);
> -
>  	/* Fill the in_vq with buffers so the host can send us data. */
>  	nr_added_bufs = fill_queue(port->in_vq, &port->inbuf_lock);
>  	if (!nr_added_bufs) {
> @@ -2026,6 +2024,8 @@ static int virtcons_probe(struct virtio_device *vdev)
>  	spin_lock_init(&portdev->ports_lock);
>  	INIT_LIST_HEAD(&portdev->ports);
>  
> +	virtio_device_ready(portdev->vdev);
> +
>  	if (multiport) {
>  		unsigned int nr_added_bufs;
>  

I wanted to set DRIVER_OK as late as possible, to both reduce
the chance it can fail after DRIVER_OK and to reduce  the risk of
introducing a regression since old qemu might only start sending
interrupts after DRIVER_OK is set.

So I wanted everything completely initialized before DRIVER_OK.

You patch makes sense to me since nothing can fail,
and we won't get interrupts before we add inbufs.

Reviewed-by: Michael S. Tsirkin <mst@redhat.com>

Testig will report shortly, pls send with sob line.

-- 
MST

^ permalink raw reply

* Re: Queue with wait-free enqueue, blocking dequeue, splice
From: Jesper Dangaard Brouer @ 2014-10-20 14:02 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: brouer, Paul E. McKenney, netdev, Jamal Hadi Salim
In-Reply-To: <412768308.11171.1413632892841.JavaMail.zimbra@efficios.com>


On Sat, 18 Oct 2014 11:48:12 +0000 (UTC) Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> Following our LPC discussion on lock-free queue algorithms
> for qdisc, here is some info on the wfcqueue implementation
> found in Userspace RCU. See http://urcu.so for info and
> git repository.

Thank for following up on our very interesting discussions.

I've started with the more simple variant "urcu/static/wfqueue.h" to
understand the concepts.  And I'm now reading wfcqueue.h, which I guess
it replacing wfqueue.h.

 
> Here is the wfcqueue ported to the Linux kernel I sent last
> year as RFC:
> https://lkml.org/lkml/2013/3/14/289
> 
> I'm very interested to learn if it fits well for your
> use-case,

Does this wfcqueue API support bulk dequeue?  (A feature needed for the
lock-less qdisc implementation, else it cannot compete with our new
bulk dequeue strategy).

AFAIK your queue implementation is a CAS-based, Wait-Free on enqueue,
but Lock-Free on dequeue with the potential for waiting/blocking on
a enqueue processes.
 I'm not 100% sure, that we want this behavior for the qdisc system.

I can certainly use the wfcq_empty() check, but I guess I need to
maintain a separate counter to maintain the qdisc limit, right?
(I would use the approximate/split counter API percpu_counter to keep
this scalable, and wfcq_empty() would provide an accurate empty check)


I think, we/I should start micro benchmarking the different approaches.
As our time budget is only 67.2ns 
 http://netoptimizer.blogspot.dk/2014/05/the-calculations-10gbits-wirespeed.html
(or bulking tricks artificially "increase" this budget)


The motivation behind this lockless qdisc is, the current qdisc locking
cost 48ns, see slide 9/16 titled "Qdisc locking is nasty":
 http://people.netfilter.org/hawk/presentations/LinuxPlumbers2014/performance_tx_qdisc_bulk_LPC2014.pdf

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH v4 13/25] virtio_console: enable VQs early
From: Michael S. Tsirkin @ 2014-10-20 14:04 UTC (permalink / raw)
  To: Cornelia Huck
  Cc: linux-s390, kvm, linux-scsi, Christian Borntraeger, netdev,
	linux-kernel, virtualization, Amit Shah, Greg Kroah-Hartman,
	Arnd Bergmann, Paolo Bonzini, Thomas Graf, v9fs-developer,
	David S. Miller
In-Reply-To: <20141020133555.GA27111@redhat.com>

On Mon, Oct 20, 2014 at 04:35:55PM +0300, Michael S. Tsirkin wrote:
> On Mon, Oct 20, 2014 at 02:42:23PM +0200, Cornelia Huck wrote:
> > On Mon, 20 Oct 2014 13:07:50 +0100
> > Thomas Graf <tgraf@suug.ch> wrote:
> > 
> > > On 10/13/14 at 10:50am, Michael S. Tsirkin wrote:
> > > > virtio spec requires drivers to set DRIVER_OK before using VQs.
> > > > This is set automatically after probe returns, virtio console violated this
> > > > rule by adding inbufs, which causes the VQ to be used directly within
> > > > probe.
> > > > 
> > > > To fix, call virtio_device_ready before using VQs.
> > > > 
> > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > ---
> > > >  drivers/char/virtio_console.c | 2 ++
> > > >  1 file changed, 2 insertions(+)
> > > > 
> > > > diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
> > > > index b585b47..6ebe8f6 100644
> > > > --- a/drivers/char/virtio_console.c
> > > > +++ b/drivers/char/virtio_console.c
> > > > @@ -1449,6 +1449,8 @@ static int add_port(struct ports_device *portdev, u32 id)
> > > >  	spin_lock_init(&port->outvq_lock);
> > > >  	init_waitqueue_head(&port->waitqueue);
> > > >  
> > > > +	virtio_device_ready(portdev->vdev);
> > > > +
> > > >  	/* Fill the in_vq with buffers so the host can send us data. */
> > > >  	nr_added_bufs = fill_queue(port->in_vq, &port->inbuf_lock);
> > > >  	if (!nr_added_bufs) {
> > > 
> > > Seems like probe and add_port() now both set VIRTIO_CONFIG_S_DRIVER_OK
> > 
> > I think we need to set this in the probe function instead, otherwise we
> > fail for multiqueue (which also wants to use the control queue early).
> > 
> > Completely untested patch below; I can send this with proper s-o-b if
> > it helps.
> > 
> > diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
> > index bfa6400..cf7a561 100644
> > --- a/drivers/char/virtio_console.c
> > +++ b/drivers/char/virtio_console.c
> > @@ -1449,8 +1449,6 @@ static int add_port(struct ports_device *portdev, u32 id)
> >  	spin_lock_init(&port->outvq_lock);
> >  	init_waitqueue_head(&port->waitqueue);
> >  
> > -	virtio_device_ready(portdev->vdev);
> > -
> >  	/* Fill the in_vq with buffers so the host can send us data. */
> >  	nr_added_bufs = fill_queue(port->in_vq, &port->inbuf_lock);
> >  	if (!nr_added_bufs) {
> > @@ -2026,6 +2024,8 @@ static int virtcons_probe(struct virtio_device *vdev)
> >  	spin_lock_init(&portdev->ports_lock);
> >  	INIT_LIST_HEAD(&portdev->ports);
> >  
> > +	virtio_device_ready(portdev->vdev);
> > +
> >  	if (multiport) {
> >  		unsigned int nr_added_bufs;
> >  
> 
> I wanted to set DRIVER_OK as late as possible, to both reduce
> the chance it can fail after DRIVER_OK and to reduce  the risk of
> introducing a regression since old qemu might only start sending
> interrupts after DRIVER_OK is set.
> 
> So I wanted everything completely initialized before DRIVER_OK.
> 
> You patch makes sense to me since nothing can fail,
> and we won't get interrupts before we add inbufs.
> 
> Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> 
> Testig will report shortly, pls send with sob line.

Sure enough, this helps:

Tested-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>

Pls repost as a top-level patch.

> -- 
> MST

^ permalink raw reply

* Re: [PATCH RFT 0/8] Marvell PXA168 libphy handling and Berlin Ethernet
From: Sebastian Hesselbarth @ 2014-10-20 14:37 UTC (permalink / raw)
  To: Antoine Tenart
  Cc: David S. Miller, Florian Fainelli, Eric Miao, Haojian Zhuang,
	linux-arm-kernel, netdev, devicetree, linux-kernel
In-Reply-To: <20141016095323.GA1061@kwain>

On 16.10.2014 11:53, Antoine Tenart wrote:
> On Thu, Oct 09, 2014 at 02:38:58PM +0200, Sebastian Hesselbarth wrote:
>> This patch series deals with a removing a IP feature that can be found
>> on all currently supported Marvell Ethernet IP (pxa168_eth, mv643xx_eth,
>> mvneta). The MAC IP allows to automatically perform PHY auto-negotiation
>> without software interaction.
>>
>> However, this feature (a) fundamentally clashes with the way libphy works
>> and (b) is unable to deal with quirky PHYs that require special treatment.
>> In this series, pxa168_eth driver is rewritten to completely disable that
>> feature and properly deal with libphy provided PHYs. The other two drivers
>> are suspect to future patch sets, also removing the code related with it.
>>
>> Currently, the patches are based on next-20141009 and will be resent once
>> v3.18-rc1 drops. This is a Request-For-Test on both BG2Q and MMP/gplug as
>
> I tested the series on a BG2Q, it worked well.

Antoine,

Thanks for testing! I assume you have added a phy-connection-type
property to BG2Q's ethernet node?

I doubt there will be any Tested-by from MMP guys anytime soon, so
I'll resend this with the minor remarks to be merged for 3.19.

Sebastian

^ permalink raw reply

* Re: [PATCH v4 13/25] virtio_console: enable VQs early
From: Thomas Graf @ 2014-10-20 14:44 UTC (permalink / raw)
  To: Michael S. Tsirkin, Cornelia Huck
  Cc: wlinux-kernel, Rusty Russell, virtualization, linux-scsi,
	linux-s390, v9fs-developer, netdev, kvm, Amit Shah,
	Christian Borntraeger, David S. Miller, Paolo Bonzini,
	Arnd Bergmann, Greg Kroah-Hartman
In-Reply-To: <20141020140410.GA10994@redhat.com>

On 10/20/14 at 05:04pm, Michael S. Tsirkin wrote:
> Sure enough, this helps:
> 
> Tested-by: Michael S. Tsirkin <mst@redhat.com>
> Acked-by: Michael S. Tsirkin <mst@redhat.com>
> 
> Pls repost as a top-level patch.

Thanks for fixing this so promptly.

^ permalink raw reply

* Re: [PATCH RFT 0/8] Marvell PXA168 libphy handling and Berlin Ethernet
From: Antoine Tenart @ 2014-10-20 15:10 UTC (permalink / raw)
  To: Sebastian Hesselbarth
  Cc: Antoine Tenart, David S. Miller, Florian Fainelli, Eric Miao,
	Haojian Zhuang, linux-arm-kernel, netdev, devicetree,
	linux-kernel
In-Reply-To: <54451E3F.1030700@gmail.com>

Sebastian,

On Mon, Oct 20, 2014 at 04:37:51PM +0200, Sebastian Hesselbarth wrote:
> On 16.10.2014 11:53, Antoine Tenart wrote:
> >On Thu, Oct 09, 2014 at 02:38:58PM +0200, Sebastian Hesselbarth wrote:
> >>This patch series deals with a removing a IP feature that can be found
> >>on all currently supported Marvell Ethernet IP (pxa168_eth, mv643xx_eth,
> >>mvneta). The MAC IP allows to automatically perform PHY auto-negotiation
> >>without software interaction.
> >>
> >>However, this feature (a) fundamentally clashes with the way libphy works
> >>and (b) is unable to deal with quirky PHYs that require special treatment.
> >>In this series, pxa168_eth driver is rewritten to completely disable that
> >>feature and properly deal with libphy provided PHYs. The other two drivers
> >>are suspect to future patch sets, also removing the code related with it.
> >>
> >>Currently, the patches are based on next-20141009 and will be resent once
> >>v3.18-rc1 drops. This is a Request-For-Test on both BG2Q and MMP/gplug as
> >
> >I tested the series on a BG2Q, it worked well.
> 
> Thanks for testing! I assume you have added a phy-connection-type
> property to BG2Q's ethernet node?

Yes, I added the following property to the ethernet-phy node:

	phy-connection-type = "mii";

Feel free to add this to your series, or I can also send a patch.


Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox