Netdev List
 help / color / mirror / Atom feed
* [PATCH v3] iproute2: add support for tcp_metrics
From: Julian Anastasov @ 2012-10-03 22:07 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev

	ip tcp_metrics/tcpmetrics

	We support get/del for single entry and dump for
show/flush.

v3:
 - fix rtt/rttvar shifts as suggested by Eric Dumazet
 - show rtt/rttvar usecs as suggested by David Laight

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---

	Stephen, I see correct values for rtt/rttvar,
tcp_metric_set_msecs keeps them into u32 as msecs, so there is
no problem with HZ value.

v2: put family in req.n.nlmsg_type

 include/linux/tcp_metrics.h |   54 ++++++
 ip/Makefile                 |    2 +-
 ip/ip.c                     |    4 +-
 ip/ip_common.h              |    1 +
 ip/tcp_metrics.c            |  429 +++++++++++++++++++++++++++++++++++++++++++
 man/man8/Makefile           |    3 +-
 man/man8/ip-tcp_metrics.8   |  143 ++++++++++++++
 man/man8/ip.8               |    7 +-
 8 files changed, 639 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/tcp_metrics.h
 create mode 100644 ip/tcp_metrics.c
 create mode 100644 man/man8/ip-tcp_metrics.8

diff --git a/include/linux/tcp_metrics.h b/include/linux/tcp_metrics.h
new file mode 100644
index 0000000..cb5157b
--- /dev/null
+++ b/include/linux/tcp_metrics.h
@@ -0,0 +1,54 @@
+/* tcp_metrics.h - TCP Metrics Interface */
+
+#ifndef _LINUX_TCP_METRICS_H
+#define _LINUX_TCP_METRICS_H
+
+#include <linux/types.h>
+
+/* NETLINK_GENERIC related info
+ */
+#define TCP_METRICS_GENL_NAME		"tcp_metrics"
+#define TCP_METRICS_GENL_VERSION	0x1
+
+enum tcp_metric_index {
+	TCP_METRIC_RTT,
+	TCP_METRIC_RTTVAR,
+	TCP_METRIC_SSTHRESH,
+	TCP_METRIC_CWND,
+	TCP_METRIC_REORDERING,
+
+	/* Always last.  */
+	__TCP_METRIC_MAX,
+};
+
+#define TCP_METRIC_MAX	(__TCP_METRIC_MAX - 1)
+
+enum {
+	TCP_METRICS_ATTR_UNSPEC,
+	TCP_METRICS_ATTR_ADDR_IPV4,		/* u32 */
+	TCP_METRICS_ATTR_ADDR_IPV6,		/* binary */
+	TCP_METRICS_ATTR_AGE,			/* msecs */
+	TCP_METRICS_ATTR_TW_TSVAL,		/* u32, raw, rcv tsval */
+	TCP_METRICS_ATTR_TW_TS_STAMP,		/* s32, sec age */
+	TCP_METRICS_ATTR_VALS,			/* nested +1, u32 */
+	TCP_METRICS_ATTR_FOPEN_MSS,		/* u16 */
+	TCP_METRICS_ATTR_FOPEN_SYN_DROPS,	/* u16, count of drops */
+	TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,	/* msecs age */
+	TCP_METRICS_ATTR_FOPEN_COOKIE,		/* binary */
+
+	__TCP_METRICS_ATTR_MAX,
+};
+
+#define TCP_METRICS_ATTR_MAX	(__TCP_METRICS_ATTR_MAX - 1)
+
+enum {
+	TCP_METRICS_CMD_UNSPEC,
+	TCP_METRICS_CMD_GET,
+	TCP_METRICS_CMD_DEL,
+
+	__TCP_METRICS_CMD_MAX,
+};
+
+#define TCP_METRICS_CMD_MAX	(__TCP_METRICS_CMD_MAX - 1)
+
+#endif /* _LINUX_TCP_METRICS_H */
diff --git a/ip/Makefile b/ip/Makefile
index 3bc1516..dfe2e71 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -4,7 +4,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o \
     iplink_vlan.o link_veth.o link_gre.o iplink_can.o \
     iplink_macvlan.o iplink_macvtap.o ipl2tp.o link_vti.o \
-    iplink_vxlan.o
+    iplink_vxlan.o tcp_metrics.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/ip.c b/ip/ip.c
index df06d3e..e0f7e60 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -45,7 +45,7 @@ static void usage(void)
 "       ip [ -force ] -batch filename\n"
 "where  OBJECT := { link | addr | addrlabel | route | rule | neigh | ntable |\n"
 "                   tunnel | tuntap | maddr | mroute | mrule | monitor | xfrm |\n"
-"                   netns | l2tp }\n"
+"                   netns | l2tp | tcp_metrics }\n"
 "       OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n"
 "                    -f[amily] { inet | inet6 | ipx | dnet | bridge | link } |\n"
 "                    -l[oops] { maximum-addr-flush-attempts } |\n"
@@ -78,6 +78,8 @@ static const struct cmd {
 	{ "tunl",	do_iptunnel },
 	{ "tuntap",	do_iptuntap },
 	{ "tap",	do_iptuntap },
+	{ "tcpmetrics",	do_tcp_metrics },
+	{ "tcp_metrics",do_tcp_metrics },
 	{ "monitor",	do_ipmonitor },
 	{ "xfrm",	do_xfrm },
 	{ "mroute",	do_multiroute },
diff --git a/ip/ip_common.h b/ip/ip_common.h
index 5fa2cc0..2fd66b7 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -42,6 +42,7 @@ extern int do_multirule(int argc, char **argv);
 extern int do_netns(int argc, char **argv);
 extern int do_xfrm(int argc, char **argv);
 extern int do_ipl2tp(int argc, char **argv);
+extern int do_tcp_metrics(int argc, char **argv);
 
 static inline int rtm_get_table(struct rtmsg *r, struct rtattr **tb)
 {
diff --git a/ip/tcp_metrics.c b/ip/tcp_metrics.c
new file mode 100644
index 0000000..34e1d8e
--- /dev/null
+++ b/ip/tcp_metrics.c
@@ -0,0 +1,429 @@
+/*
+ * tcp_metrics.c	"ip tcp_metrics/tcpmetrics"
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		version 2 as published by the Free Software Foundation;
+ *
+ * Authors:	Julian Anastasov <ja@ssi.bg>, August 2012
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <sys/ioctl.h>
+#include <linux/if.h>
+
+#include <linux/genetlink.h>
+#include <linux/tcp_metrics.h>
+
+#include "utils.h"
+#include "ip_common.h"
+#include "libgenl.h"
+
+static void usage(void)
+{
+	fprintf(stderr, "Usage: ip tcp_metrics/tcpmetrics { COMMAND | help }\n");
+	fprintf(stderr, "       ip tcp_metrics { show | flush } SELECTOR\n");
+	fprintf(stderr, "       ip tcp_metrics delete [ address ] ADDRESS\n");
+	fprintf(stderr, "SELECTOR := [ [ address ] PREFIX ]\n");
+	exit(-1);
+}
+
+/* netlink socket */
+static struct rtnl_handle grth = { .fd = -1 };
+static int genl_family = -1;
+
+#define TCPM_REQUEST(_req, _bufsiz, _cmd, _flags) \
+	GENL_REQUEST(_req, _bufsiz, genl_family, 0, \
+		     TCP_METRICS_GENL_VERSION, _cmd, _flags)
+
+#define CMD_LIST	0x0001	/* list, lst, show		*/
+#define CMD_DEL		0x0002	/* delete, remove		*/
+#define CMD_FLUSH	0x0004	/* flush			*/
+
+static struct {
+	char	*name;
+	int	code;
+} cmds[] = {
+	{	"list",		CMD_LIST	},
+	{	"lst",		CMD_LIST	},
+	{	"show",		CMD_LIST	},
+	{	"delete",	CMD_DEL		},
+	{	"remove",	CMD_DEL		},
+	{	"flush",	CMD_FLUSH	},
+};
+
+static char *metric_name[TCP_METRIC_MAX + 1] = {
+	[TCP_METRIC_RTT]		= "rtt",
+	[TCP_METRIC_RTTVAR]		= "rttvar",
+	[TCP_METRIC_SSTHRESH]		= "ssthresh",
+	[TCP_METRIC_CWND]		= "cwnd",
+	[TCP_METRIC_REORDERING]		= "reordering",
+};
+
+static struct
+{
+	int flushed;
+	char *flushb;
+	int flushp;
+	int flushe;
+	int cmd;
+	inet_prefix addr;
+} f;
+
+static int flush_update(void)
+{
+	if (rtnl_send_check(&grth, f.flushb, f.flushp) < 0) {
+		perror("Failed to send flush request\n");
+		return -1;
+	}
+	f.flushp = 0;
+	return 0;
+}
+
+static int process_msg(const struct sockaddr_nl *who, struct nlmsghdr *n,
+		       void *arg)
+{
+	FILE *fp = (FILE *) arg;
+	struct genlmsghdr *ghdr;
+	struct rtattr *attrs[TCP_METRICS_ATTR_MAX + 1], *a;
+	int len = n->nlmsg_len;
+	char abuf[256];
+	inet_prefix addr;
+	int family, i, atype;
+
+	if (n->nlmsg_type != genl_family)
+		return -1;
+
+	len -= NLMSG_LENGTH(GENL_HDRLEN);
+	if (len < 0)
+		return -1;
+
+	ghdr = NLMSG_DATA(n);
+	if (ghdr->cmd != TCP_METRICS_CMD_GET)
+		return 0;
+
+	parse_rtattr(attrs, TCP_METRICS_ATTR_MAX, (void *) ghdr + GENL_HDRLEN,
+		     len);
+
+	a = attrs[TCP_METRICS_ATTR_ADDR_IPV4];
+	if (a) {
+		if (f.addr.family && f.addr.family != AF_INET)
+			return 0;
+		memcpy(&addr.data, RTA_DATA(a), 4);
+		addr.bytelen = 4;
+		family = AF_INET;
+		atype = TCP_METRICS_ATTR_ADDR_IPV4;
+	} else {
+		a = attrs[TCP_METRICS_ATTR_ADDR_IPV6];
+		if (a) {
+			if (f.addr.family && f.addr.family != AF_INET6)
+				return 0;
+			memcpy(&addr.data, RTA_DATA(a), 16);
+			addr.bytelen = 16;
+			family = AF_INET6;
+			atype = TCP_METRICS_ATTR_ADDR_IPV6;
+		} else
+			return 0;
+	}
+
+	if (f.addr.family && f.addr.bitlen >= 0 &&
+	    inet_addr_match(&addr, &f.addr, f.addr.bitlen))
+		return 0;
+
+	if (f.flushb) {
+		struct nlmsghdr *fn;
+		TCPM_REQUEST(req2, 128, TCP_METRICS_CMD_DEL, NLM_F_REQUEST);
+
+		addattr_l(&req2.n, sizeof(req2), atype, &addr.data,
+			  addr.bytelen);
+
+		if (NLMSG_ALIGN(f.flushp) + req2.n.nlmsg_len > f.flushe) {
+			if (flush_update())
+				return -1;
+		}
+		fn = (struct nlmsghdr *) (f.flushb + NLMSG_ALIGN(f.flushp));
+		memcpy(fn, &req2.n, req2.n.nlmsg_len);
+		fn->nlmsg_seq = ++grth.seq;
+		f.flushp = (((char *) fn) + req2.n.nlmsg_len) - f.flushb;
+		f.flushed++;
+		if (show_stats < 2)
+			return 0;
+	}
+
+	if (f.cmd & (CMD_DEL | CMD_FLUSH))
+		fprintf(fp, "Deleted ");
+
+	fprintf(fp, "%s",
+		format_host(family, RTA_PAYLOAD(a), &addr.data,
+			    abuf, sizeof(abuf)));
+
+	a = attrs[TCP_METRICS_ATTR_AGE];
+	if (a) {
+		__u64 val = rta_getattr_u64(a);
+
+		fprintf(fp, " age %llu.%03llusec",
+			val / 1000, val % 1000);
+	}
+
+	a = attrs[TCP_METRICS_ATTR_TW_TS_STAMP];
+	if (a) {
+		__s32 val = (__s32) rta_getattr_u32(a);
+		__u32 tsval;
+
+		a = attrs[TCP_METRICS_ATTR_TW_TSVAL];
+		tsval = a ? rta_getattr_u32(a) : 0;
+		fprintf(fp, " tw_ts %u/%dsec ago", tsval, val);
+	}
+
+	a = attrs[TCP_METRICS_ATTR_VALS];
+	if (a) {
+		struct rtattr *m[TCP_METRIC_MAX + 1 + 1];
+
+		parse_rtattr_nested(m, TCP_METRIC_MAX + 1, a);
+
+		for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
+			__u32 val;
+
+			a = m[i + 1];
+			if (!a)
+				continue;
+			if (metric_name[i])
+				fprintf(fp, " %s ", metric_name[i]);
+			else
+				fprintf(fp, " metric_%d ", i);
+			val = rta_getattr_u32(a);
+			switch (i) {
+			case TCP_METRIC_RTT:
+				fprintf(fp, "%lluus", (val * 1000ULL) >> 3);
+				break;
+			case TCP_METRIC_RTTVAR:
+				fprintf(fp, "%lluus", (val * 1000ULL) >> 2);
+				break;
+			case TCP_METRIC_SSTHRESH:
+			case TCP_METRIC_CWND:
+			case TCP_METRIC_REORDERING:
+			default:
+				fprintf(fp, "%u", val);
+				break;
+			}
+		}
+	}
+
+	a = attrs[TCP_METRICS_ATTR_FOPEN_MSS];
+	if (a)
+		fprintf(fp, " fo_mss %u", rta_getattr_u16(a));
+
+	a = attrs[TCP_METRICS_ATTR_FOPEN_SYN_DROPS];
+	if (a) {
+		__u16 syn_loss = rta_getattr_u16(a);
+		__u64 ts;
+
+		a = attrs[TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS];
+		ts = a ? rta_getattr_u64(a) : 0;
+
+		fprintf(fp, " fo_syn_drops %u/%llu.%03llusec ago",
+			syn_loss, ts / 1000, ts % 1000);
+	}
+
+	a = attrs[TCP_METRICS_ATTR_FOPEN_COOKIE];
+	if (a) {
+		char cookie[32 + 1];
+		unsigned char *ptr = RTA_DATA(a);
+		int i, max = RTA_PAYLOAD(a);
+
+		if (max > 16)
+			max = 16;
+		cookie[0] = 0;
+		for (i = 0; i < max; i++)
+			sprintf(cookie + i + i, "%02x", ptr[i]);
+		fprintf(fp, " fo_cookie %s", cookie);
+	}
+
+	fprintf(fp, "\n");
+
+	fflush(fp);
+	return 0;
+}
+
+static int tcpm_do_cmd(int cmd, int argc, char **argv)
+{
+	TCPM_REQUEST(req, 1024, TCP_METRICS_CMD_GET, NLM_F_REQUEST);
+	int atype = -1;
+	int ack;
+
+	memset(&f, 0, sizeof(f));
+	f.addr.bitlen = -1;
+	f.addr.family = preferred_family;
+
+	switch (preferred_family) {
+	case AF_UNSPEC:
+	case AF_INET:
+	case AF_INET6:
+		break;
+	default:
+		fprintf(stderr, "Unsupported family:%d\n", preferred_family);
+		return -1;
+	}
+
+	for (; argc > 0; argc--, argv++) {
+		char *who = "address";
+
+		if (strcmp(*argv, "addr") == 0 ||
+		    strcmp(*argv, "address") == 0) {
+			who = *argv;
+			NEXT_ARG();
+		}
+		if (matches(*argv, "help") == 0)
+			usage();
+		if (f.addr.bitlen >= 0)
+			duparg2(who, *argv);
+
+		get_prefix(&f.addr, *argv, preferred_family);
+		if (f.addr.bytelen && f.addr.bytelen * 8 == f.addr.bitlen) {
+			if (f.addr.family == AF_INET)
+				atype = TCP_METRICS_ATTR_ADDR_IPV4;
+			else if (f.addr.family == AF_INET6)
+				atype = TCP_METRICS_ATTR_ADDR_IPV6;
+		}
+		if ((CMD_DEL & cmd) && atype < 0) {
+			fprintf(stderr, "Error: a specific IP address is expected rather than \"%s\"\n",
+				*argv);
+			return -1;
+		}
+
+		argc--; argv++;
+	}
+
+	if (cmd == CMD_DEL && atype < 0)
+		missarg("address");
+
+	/* flush for exact address ? Single del */
+	if (cmd == CMD_FLUSH && atype >= 0)
+		cmd = CMD_DEL;
+
+	/* flush for all addresses ? Single del without address */
+	if (cmd == CMD_FLUSH && f.addr.bitlen <= 0 &&
+	    preferred_family == AF_UNSPEC) {
+		cmd = CMD_DEL;
+		req.g.cmd = TCP_METRICS_CMD_DEL;
+		ack = 1;
+	} else if (cmd == CMD_DEL) {
+		req.g.cmd = TCP_METRICS_CMD_DEL;
+		ack = 1;
+	} else {	/* CMD_FLUSH, CMD_LIST */
+		ack = 0;
+	}
+
+	if (genl_family < 0) {
+		if (rtnl_open_byproto(&grth, 0, NETLINK_GENERIC) < 0) {
+			fprintf(stderr, "Cannot open generic netlink socket\n");
+			exit(1);
+		}
+		genl_family = genl_resolve_family(&grth,
+						  TCP_METRICS_GENL_NAME);
+		if (genl_family < 0)
+			exit(1);
+		req.n.nlmsg_type = genl_family;
+	}
+
+	if (!(cmd & CMD_FLUSH) && (atype >= 0 || (cmd & CMD_DEL))) {
+		if (ack)
+			req.n.nlmsg_flags |= NLM_F_ACK;
+		if (atype >= 0)
+			addattr_l(&req.n, sizeof(req), atype, &f.addr.data,
+				  f.addr.bytelen);
+	} else {
+		req.n.nlmsg_flags |= NLM_F_DUMP;
+	}
+
+	f.cmd = cmd;
+	if (cmd & CMD_FLUSH) {
+		int round = 0;
+		char flushb[4096-512];
+
+		f.flushb = flushb;
+		f.flushp = 0;
+		f.flushe = sizeof(flushb);
+
+		for (;;) {
+			req.n.nlmsg_seq = grth.dump = ++grth.seq;
+			if (rtnl_send(&grth, &req, req.n.nlmsg_len) < 0) {
+				perror("Failed to send flush request");
+				exit(1);
+			}
+			f.flushed = 0;
+			if (rtnl_dump_filter(&grth, process_msg, stdout) < 0) {
+				fprintf(stderr, "Flush terminated\n");
+				exit(1);
+			}
+			if (f.flushed == 0) {
+				if (round == 0) {
+					fprintf(stderr, "Nothing to flush.\n");
+				} else if (show_stats)
+					printf("*** Flush is complete after %d round%s ***\n",
+					       round, round > 1 ? "s" : "");
+				fflush(stdout);
+				return 0;
+			}
+			round++;
+			if (flush_update() < 0)
+				exit(1);
+			if (show_stats) {
+				printf("\n*** Round %d, deleting %d entries ***\n",
+				       round, f.flushed);
+				fflush(stdout);
+			}
+		}
+		return 0;
+	}
+
+	if (ack) {
+		if (rtnl_talk(&grth, &req.n, 0, 0, NULL) < 0)
+			return -2;
+	} else if (atype >= 0) {
+		if (rtnl_talk(&grth, &req.n, 0, 0, &req.n) < 0)
+			return -2;
+		if (process_msg(NULL, &req.n, stdout) < 0) {
+			fprintf(stderr, "Dump terminated\n");
+			exit(1);
+		}
+	} else {
+		req.n.nlmsg_seq = grth.dump = ++grth.seq;
+		if (rtnl_send(&grth, &req, req.n.nlmsg_len) < 0) {
+			perror("Failed to send dump request");
+			exit(1);
+		}
+
+		if (rtnl_dump_filter(&grth, process_msg, stdout) < 0) {
+			fprintf(stderr, "Dump terminated\n");
+			exit(1);
+		}
+	}
+	return 0;
+}
+
+int do_tcp_metrics(int argc, char **argv)
+{
+	int i;
+
+	if (argc < 1)
+		return tcpm_do_cmd(CMD_LIST, 0, NULL);
+	for (i = 0; i < ARRAY_SIZE(cmds); i++) {
+		if (matches(argv[0], cmds[i].name) == 0)
+			return tcpm_do_cmd(cmds[i].code, argc-1, argv+1);
+	}
+	if (matches(argv[0], "help") == 0)
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, "
+			"try \"ip tcp_metrics help\".\n", *argv);
+	exit(-1);
+}
+
diff --git a/man/man8/Makefile b/man/man8/Makefile
index 4ed3eab..aaf1729 100644
--- a/man/man8/Makefile
+++ b/man/man8/Makefile
@@ -8,7 +8,8 @@ MAN8PAGES = $(TARGETS) ip.8 arpd.8 lnstat.8 routel.8 rtacct.8 rtmon.8 ss.8 \
 	bridge.8 rtstat.8 ctstat.8 nstat.8 routef.8 \
 	ip-address.8 ip-addrlabel.8 ip-l2tp.8 ip-link.8 \
 	ip-maddress.8 ip-monitor.8 ip-mroute.8 ip-neighbour.8 \
-	ip-netns.8 ip-ntable.8 ip-route.8 ip-rule.8 ip-tunnel.8 ip-xfrm.8
+	ip-netns.8 ip-ntable.8 ip-route.8 ip-rule.8 ip-tunnel.8 ip-xfrm.8 \
+	ip-tcp_metrics.8
 
 all: $(TARGETS)
 
diff --git a/man/man8/ip-tcp_metrics.8 b/man/man8/ip-tcp_metrics.8
new file mode 100644
index 0000000..5d2dac8
--- /dev/null
+++ b/man/man8/ip-tcp_metrics.8
@@ -0,0 +1,143 @@
+.TH "IP\-TCP_METRICS" 8 "23 Aug 2012" "iproute2" "Linux"
+.SH "NAME"
+ip-tcp_metrics \- management for TCP Metrics
+.SH "SYNOPSIS"
+.sp
+.ad l
+.in +8
+.ti -8
+.B ip
+.RI "[ " OPTIONS " ]"
+.B tcp_metrics
+.RI "{ " COMMAND " | "
+.BR help " }"
+.sp
+
+.ti -8
+.BR "ip tcp_metrics" " { " show " | " flush " }
+.IR SELECTOR
+
+.ti -8
+.BR "ip tcp_metrics delete " [ " address " ]
+.IR ADDRESS
+
+.ti -8
+.IR SELECTOR " := "
+.RB "[ [ " address " ] "
+.IR PREFIX " ]"
+
+.SH "DESCRIPTION"
+.B ip tcp_metrics
+is used to manipulate entries in the kernel that keep TCP information
+for IPv4 and IPv6 destinations. The entries are created when
+TCP sockets want to share information for destinations and are
+stored in a cache keyed by the destination address. The saved
+information may include values for metrics (initially obtained from
+routes), recent TSVAL for TIME-WAIT recycling purposes, state for the
+Fast Open feature, etc.
+For performance reasons the cache can not grow above configured limit
+and the older entries are replaced with fresh information, sometimes
+reclaimed and used for new destinations. The kernel never removes
+entries, they can be flushed only with this tool.
+
+.SS ip tcp_metrics show - show cached entries
+
+.TP
+.BI address " PREFIX " (default)
+IPv4/IPv6 prefix or address. If no prefix is provided all entries are shown.
+
+.LP
+The output may contain the following information:
+
+.BI age " <S.MMM>" sec
+- time after the entry was created, reset or updated with metrics
+from sockets. The entry is reset and refreshed on use with metrics from
+route if the metrics are not updated in last hour. Not all cached values
+reset the age on update.
+
+.BI cwnd " <N>"
+- CWND metric value
+
+.BI fo_cookie " <HEX-STRING>"
+- Cookie value received in SYN-ACK to be used by Fast Open for next SYNs
+
+.BI fo_mss " <N>"
+- MSS value received in SYN-ACK to be used by Fast Open for next SYNs
+
+.BI fo_syn_drops " <N>/<S.MMM>" "sec ago"
+- Number of drops of initial outgoing Fast Open SYNs with data
+detected by monitoring the received SYN-ACK after SYN retransmission.
+The seconds show the time after last SYN drop and together with
+the drop count can be used to disable Fast Open for some time.
+
+.BI reordering " <N>"
+- Reordering metric value
+
+.BI rtt " <N>" us
+- RTT metric value
+
+.BI rttvar " <N>" us
+- RTTVAR metric value
+
+.BI ssthresh " <SSTHRESH>"
+- SSTHRESH metric value
+
+.BI tw_ts " <TSVAL>/<SEC>" "sec ago"
+- recent TSVAL and the seconds after saving it into TIME-WAIT socket
+
+.SS ip tcp_metrics delete - delete single entry
+
+.TP
+.BI address " ADDRESS " (default)
+IPv4/IPv6 address. The address is a required argument.
+
+.SS ip tcp_metrics flush - flush entries
+This command flushes the entries selected by some criteria.
+
+.PP
+This command has the same arguments as
+.B show.
+
+.SH "EXAMPLES"
+.PP
+ip tcp_metrics show address 192.168.0.0/24
+.RS 4
+Shows the entries for destinations from subnet
+.RE
+.PP
+ip tcp_metrics show 192.168.0.0/24
+.RS 4
+The same but address keyword is optional
+.RE
+.PP
+ip tcp_metrics
+.RS 4
+Show all is the default action
+.RE
+.PP
+ip tcp_metrics delete 192.168.0.1
+.RS 4
+Removes the entry for 192.168.0.1 from cache.
+.RE
+.PP
+ip tcp_metrics flush 192.168.0.0/24
+.RS 4
+Removes entries for destinations from subnet
+.RE
+.PP
+ip tcp_metrics flush all
+.RS 4
+Removes all entries from cache
+.RE
+.PP
+ip -6 tcp_metrics flush all
+.RS 4
+Removes all IPv6 entries from cache keeping the IPv4 entries.
+.RE
+
+.SH SEE ALSO
+.br
+.BR ip (8)
+
+.SH AUTHOR
+Original Manpage by Julian Anastasov <ja@ssi.bg>
diff --git a/man/man8/ip.8 b/man/man8/ip.8
index 4db8a67..9063049 100644
--- a/man/man8/ip.8
+++ b/man/man8/ip.8
@@ -15,7 +15,7 @@ ip \- show / manipulate routing, devices, policy routing and tunnels
 .IR OBJECT " := { "
 .BR link " | " addr " | " addrlabel " | " route " | " rule " | " neigh " | "\
  ntable " | " tunnel " | " tuntap " | " maddr " | "  mroute " | " mrule " | "\
- monitor " | " xfrm " | " netns " | "  l2tp " }"
+ monitor " | " xfrm " | " netns " | "  l2tp " | "  tcp_metrics " }"
 .sp
 
 .ti -8
@@ -161,6 +161,10 @@ host addresses.
 - rule in routing policy database.
 
 .TP
+.B tcp_metrics/tcpmetrics
+- manage TCP Metrics
+
+.TP
 .B tunnel
 - tunnel over IP.
 
@@ -220,6 +224,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2.
 .BR ip-ntable (8),
 .BR ip-route (8),
 .BR ip-rule (8),
+.BR ip-tcp_metrics (8),
 .BR ip-tunnel (8),
 .BR ip-xfrm (8)
 .br
-- 
1.7.3.4

^ permalink raw reply related

* Re: RED tc qdisc not dropping?
From: Ken Savage @ 2012-10-03 22:16 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1349298766.10199.5.camel@edumazet-glaptop>

Hi Eric,

I'm guessing there were some improvements in the kernel associated
to this area since 2.6...  ;)

I did manage to chain together a 10ms delay with the netem qdisc at
the root, with RED underneath it, and this has at least resulted in
SOME packets being dropped -- and hence a bandwidth "cap" that forms
(naturally, due to queue fill and drop rates) and congestion control
kicking in, to get below this flow's natural tendency to operate
between 25Mbps and 30.

With the 2.6 kernel and tc tools, this chaining has a lot more
effect -- hence why I said there were probably some improvements in
3.4. (Of course there were, that's the whole point of Linux :)

So perhaps you can offer a better way of setting up basic wansim...

eth1: connected to server that generates data.
eth0: connected to client that receives said data.

Ideally, packets egressing out of eth0 can have some latency delay
and/or loss applied to them, to simulate travel time and loss:

# tc .... netem delay 10ms loss 0.001

AND, to simulate being passed through a company's Cisco router, set
up with WRED:

# tc .... red  ... harddrop probability 0.1

AND, be rate limited to some physical value, such what would happen
if we inadvertently used some old 10Mbps switch, or exceeded some
home employees's ADSL modem's internal speed limit:

# tc ... tbf rate 10mbit burst 50kb 

-----

I noticed 'gred' seems to have a bw rate/limit option present now, but
for the life of me I couldn't get gred to actually WORK: it always
seemed to generate errors.  I'm not interested in multiple classes of
service, or prioritizing some traffic over others.  Just treat all my
outbound packets the same, and I'll be happy.


Any ideas how this could best be set up?


Ken






----- Original Message -----
From: "Eric Dumazet" <eric.dumazet@gmail.com>
To: "Ken Savage" <kens1835@shaw.ca>
Cc: netdev@vger.kernel.org
Sent: Wednesday, October 3, 2012 2:12:46 PM
Subject: Re: RED tc qdisc not dropping?

On Wed, 2012-10-03 at 14:14 -0600, Ken Savage wrote:
> Hi there,
> 
> I'm running openSUSE 12.2, using the machine as a router/WANsim device.
> 
> Previously, I was running an older CentOS installation with a 2.6 kernel,
> and my tc-red commands ran just fine, and imposed some bandwidth constraint
> to the packets upon egress.  In 3.4.6, this doesn't seem to be the case
> any longer.
> 
> Without any restrictions, there would be 25-30Mbps of traffic flowing out
> the interface -- this is to give you a sense of the data rate.
> 
> 
> Now, this said, I did notice that the latest RED code has the 'harddrop'
> option that I didn't have under CentOS with kernel 2.6.  So in my attempt
> to see ANYTHING happening with 3.4.6, I entered:
> 
> tc qdisc add dev eth0 root red limit 40000 min 3000 max 9000 avpkt 1000 burst 5 harddrop probability 1
> 
> 
> Issuing 'tc -s -d qdisc show dev eth0', I obtain:
> 
> qdisc red 8006: root refcnt 2 limit 40000b min 3000b max 9000b harddrop ewma 2 probability 0.73242 Scell_log 12
>  Sent 254028472 bytes 207494 pkt (dropped 0, overlimits 0 requeues 0)
>  backlog 0b 0p requeues 0
>   marked 0 early 0 pdrop 0 other 0
> 
> 
> All those zeroes seem a little amiss to me  ;)
> 

Not sure I understand...

Why RED should drop a packet if there is no backlog ?

if you NIC has Gigabit speed, RED will allow Gigabit speed as well.

^ permalink raw reply

* [net 1/5] ixgbe: Fix PTP X540 SDP alignment code for PPS signal
From: Jeff Kirsher @ 2012-10-03 22:27 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1349303231-28855-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

This patch fixes a bug in the method used for calculating the trigger
alignment for SDP0 when enabling a PPS output on the X540. The alignment math
wasn't properly taking into account the overflow cyclecounter, and was
misaligning the pin triggers so that two X540 devices synced properly had
mis-aligned SDP pins. This patch fixes the math to calculate the correct
seconds alignment for the PPS signal.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c | 188 ++++++++++++++-------------
 1 file changed, 99 insertions(+), 89 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
index 39881cb..58d930d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
@@ -106,6 +106,94 @@ static struct sock_filter ptp_filter[] = {
 };
 
 /**
+ * ixgbe_ptp_enable_sdp
+ * @hw: the hardware private structure
+ * @shift: the clock shift for calculating nanoseconds
+ *
+ * this function enables the clock out feature on the sdp0 for the
+ * X540 device. It will create a 1second periodic output that can be
+ * used as the PPS (via an interrupt).
+ *
+ * It calculates when the systime will be on an exact second, and then
+ * aligns the start of the PPS signal to that value. The shift is
+ * necessary because it can change based on the link speed.
+ */
+static void ixgbe_ptp_enable_sdp(struct ixgbe_adapter *adapter)
+{
+	struct ixgbe_hw *hw = &adapter->hw;
+	int shift = adapter->cc.shift;
+	u32 esdp, tsauxc, clktiml, clktimh, trgttiml, trgttimh, rem;
+	u64 ns = 0, clock_edge = 0;
+
+	switch (hw->mac.type) {
+	case ixgbe_mac_X540:
+		esdp = IXGBE_READ_REG(hw, IXGBE_ESDP);
+
+		/*
+		 * enable the SDP0 pin as output, and connected to the native
+		 * function for Timesync (ClockOut)
+		 */
+		esdp |= (IXGBE_ESDP_SDP0_DIR |
+			 IXGBE_ESDP_SDP0_NATIVE);
+
+		/*
+		 * enable the Clock Out feature on SDP0, and allow interrupts
+		 * to occur when the pin changes
+		 */
+		tsauxc = (IXGBE_TSAUXC_EN_CLK |
+			  IXGBE_TSAUXC_SYNCLK |
+			  IXGBE_TSAUXC_SDP0_INT);
+
+		/* clock period (or pulse length) */
+		clktiml = (u32)(NSECS_PER_SEC << shift);
+		clktimh = (u32)((NSECS_PER_SEC << shift) >> 32);
+
+		/*
+		 * Account for the cyclecounter wrap-around value by
+		 * using the converted ns value of the current time to
+		 * check for when the next aligned second would occur.
+		 */
+		clock_edge |= (u64)IXGBE_READ_REG(hw, IXGBE_SYSTIML);
+		clock_edge |= (u64)IXGBE_READ_REG(hw, IXGBE_SYSTIMH) << 32;
+		ns = timecounter_cyc2time(&adapter->tc, clock_edge);
+
+		div_u64_rem(ns, NSECS_PER_SEC, &rem);
+		clock_edge += ((NSECS_PER_SEC - (u64)rem) << shift);
+
+		/* specify the initial clock start time */
+		trgttiml = (u32)clock_edge;
+		trgttimh = (u32)(clock_edge >> 32);
+
+		IXGBE_WRITE_REG(hw, IXGBE_CLKTIML, clktiml);
+		IXGBE_WRITE_REG(hw, IXGBE_CLKTIMH, clktimh);
+		IXGBE_WRITE_REG(hw, IXGBE_TRGTTIML0, trgttiml);
+		IXGBE_WRITE_REG(hw, IXGBE_TRGTTIMH0, trgttimh);
+
+		IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp);
+		IXGBE_WRITE_REG(hw, IXGBE_TSAUXC, tsauxc);
+
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS, IXGBE_EICR_TIMESYNC);
+		IXGBE_WRITE_FLUSH(hw);
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * ixgbe_ptp_disable_sdp
+ * @hw: the private hardware structure
+ *
+ * this function disables the auxiliary SDP clock out feature
+ */
+static void ixgbe_ptp_disable_sdp(struct ixgbe_hw *hw)
+{
+	IXGBE_WRITE_REG(hw, IXGBE_EIMC, IXGBE_EICR_TIMESYNC);
+	IXGBE_WRITE_REG(hw, IXGBE_TSAUXC, 0);
+	IXGBE_WRITE_FLUSH(hw);
+}
+
+/**
  * ixgbe_ptp_read - read raw cycle counter (to be used by time counter)
  * @cc: the cyclecounter structure
  *
@@ -187,6 +275,7 @@ static int ixgbe_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 	unsigned long flags;
 	u64 now;
 
+	ixgbe_ptp_disable_sdp(&adapter->hw);
 	spin_lock_irqsave(&adapter->tmreg_lock, flags);
 
 	now = timecounter_read(&adapter->tc);
@@ -198,6 +287,8 @@ static int ixgbe_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 			 now);
 
 	spin_unlock_irqrestore(&adapter->tmreg_lock, flags);
+	ixgbe_ptp_enable_sdp(adapter);
+
 	return 0;
 }
 
@@ -246,11 +337,14 @@ static int ixgbe_ptp_settime(struct ptp_clock_info *ptp,
 	ns = ts->tv_sec * 1000000000ULL;
 	ns += ts->tv_nsec;
 
+	ixgbe_ptp_disable_sdp(&adapter->hw);
+
 	/* reset the timecounter */
 	spin_lock_irqsave(&adapter->tmreg_lock, flags);
 	timecounter_init(&adapter->tc, &adapter->cc, ns);
 	spin_unlock_irqrestore(&adapter->tmreg_lock, flags);
 
+	ixgbe_ptp_enable_sdp(adapter);
 	return 0;
 }
 
@@ -323,91 +417,6 @@ void ixgbe_ptp_check_pps_event(struct ixgbe_adapter *adapter, u32 eicr)
 	}
 }
 
-/**
- * ixgbe_ptp_enable_sdp
- * @hw: the hardware private structure
- * @shift: the clock shift for calculating nanoseconds
- *
- * this function enables the clock out feature on the sdp0 for the
- * X540 device. It will create a 1second periodic output that can be
- * used as the PPS (via an interrupt).
- *
- * It calculates when the systime will be on an exact second, and then
- * aligns the start of the PPS signal to that value. The shift is
- * necessary because it can change based on the link speed.
- */
-static void ixgbe_ptp_enable_sdp(struct ixgbe_hw *hw, int shift)
-{
-	u32 esdp, tsauxc, clktiml, clktimh, trgttiml, trgttimh;
-	u64 clock_edge = 0;
-	u32 rem;
-
-	switch (hw->mac.type) {
-	case ixgbe_mac_X540:
-		esdp = IXGBE_READ_REG(hw, IXGBE_ESDP);
-
-		/*
-		 * enable the SDP0 pin as output, and connected to the native
-		 * function for Timesync (ClockOut)
-		 */
-		esdp |= (IXGBE_ESDP_SDP0_DIR |
-			 IXGBE_ESDP_SDP0_NATIVE);
-
-		/*
-		 * enable the Clock Out feature on SDP0, and allow interrupts
-		 * to occur when the pin changes
-		 */
-		tsauxc = (IXGBE_TSAUXC_EN_CLK |
-			  IXGBE_TSAUXC_SYNCLK |
-			  IXGBE_TSAUXC_SDP0_INT);
-
-		/* clock period (or pulse length) */
-		clktiml = (u32)(NSECS_PER_SEC << shift);
-		clktimh = (u32)((NSECS_PER_SEC << shift) >> 32);
-
-		clock_edge |= (u64)IXGBE_READ_REG(hw, IXGBE_SYSTIML);
-		clock_edge |= (u64)IXGBE_READ_REG(hw, IXGBE_SYSTIMH) << 32;
-
-		/*
-		 * account for the fact that we can't do u64 division
-		 * with remainder, by converting the clock values into
-		 * nanoseconds first
-		 */
-		clock_edge >>= shift;
-		div_u64_rem(clock_edge, NSECS_PER_SEC, &rem);
-		clock_edge += (NSECS_PER_SEC - rem);
-		clock_edge <<= shift;
-
-		/* specify the initial clock start time */
-		trgttiml = (u32)clock_edge;
-		trgttimh = (u32)(clock_edge >> 32);
-
-		IXGBE_WRITE_REG(hw, IXGBE_CLKTIML, clktiml);
-		IXGBE_WRITE_REG(hw, IXGBE_CLKTIMH, clktimh);
-		IXGBE_WRITE_REG(hw, IXGBE_TRGTTIML0, trgttiml);
-		IXGBE_WRITE_REG(hw, IXGBE_TRGTTIMH0, trgttimh);
-
-		IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp);
-		IXGBE_WRITE_REG(hw, IXGBE_TSAUXC, tsauxc);
-
-		IXGBE_WRITE_REG(hw, IXGBE_EIMS, IXGBE_EICR_TIMESYNC);
-		break;
-	default:
-		break;
-	}
-}
-
-/**
- * ixgbe_ptp_disable_sdp
- * @hw: the private hardware structure
- *
- * this function disables the auxiliary SDP clock out feature
- */
-static void ixgbe_ptp_disable_sdp(struct ixgbe_hw *hw)
-{
-	IXGBE_WRITE_REG(hw, IXGBE_EIMC, IXGBE_EICR_TIMESYNC);
-	IXGBE_WRITE_REG(hw, IXGBE_TSAUXC, 0);
-}
 
 /**
  * ixgbe_ptp_overflow_check - delayed work to detect SYSTIME overflow
@@ -877,10 +886,6 @@ void ixgbe_ptp_start_cyclecounter(struct ixgbe_adapter *adapter)
 	IXGBE_WRITE_REG(hw, IXGBE_SYSTIMH, 0x00000000);
 	IXGBE_WRITE_FLUSH(hw);
 
-	/* now that the shift has been calculated and the systime
-	 * registers reset, (re-)enable the Clock out feature*/
-	ixgbe_ptp_enable_sdp(hw, shift);
-
 	/* store the new cycle speed */
 	adapter->cycle_speed = cycle_speed;
 
@@ -901,6 +906,11 @@ void ixgbe_ptp_start_cyclecounter(struct ixgbe_adapter *adapter)
 			 ktime_to_ns(ktime_get_real()));
 
 	spin_unlock_irqrestore(&adapter->tmreg_lock, flags);
+
+	/* Now that the shift has been calculated and the systime
+	 * registers reset, (re-)enable the Clock out feature
+	 */
+	ixgbe_ptp_enable_sdp(adapter);
 }
 
 /**
-- 
1.7.11.4

^ permalink raw reply related

* [net 0/5][pull request] Intel Wired LAN Driver Updates
From: Jeff Kirsher @ 2012-10-03 22:27 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, sassmann

This series contains fixes/updates to ixgbe only.  There are three
PTP fixes, polling loop fix and the addition of a device id (X540-AT1).

The following are changes since commit 864499449f256e32815575a9b860267ebefa6d70:
  tg3: Fix sparse warnings.
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net master

Emil Tantilov (1):
  ixgbe: fix poll loop for FDIRCTRL.INIT_DONE bit

Jacob Keller (3):
  ixgbe: Fix PTP X540 SDP alignment code for PPS signal
  ixgbe: (PTP) Fix PPS interrupt code
  ixgbe: fix PTP ethtool timestamping function

joshua.a.hay@intel.com (1):
  ixgbe: add support for X540-AT1

 drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c   |   2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c  |   1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   5 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  15 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c     | 202 +++++++++++------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h    |   1 +
 6 files changed, 112 insertions(+), 114 deletions(-)

-- 
1.7.11.4

^ permalink raw reply

* [net 3/5] ixgbe: fix PTP ethtool timestamping function
From: Jeff Kirsher @ 2012-10-03 22:27 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, stable, Jeff Kirsher
In-Reply-To: <1349303231-28855-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

This patch fixes a development issue that occurred due to invalid modes reported
in the ethtool get_ts_info function. The issue is resolved by removing
unsupported modes from the Rx supported list.

CC: stable <stable@vger.kernel.org> [3.5+]
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 4104ea25..56b20d1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -2690,10 +2690,7 @@ static int ixgbe_get_ts_info(struct net_device *dev,
 			(1 << HWTSTAMP_FILTER_NONE) |
 			(1 << HWTSTAMP_FILTER_PTP_V1_L4_SYNC) |
 			(1 << HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) |
-			(1 << HWTSTAMP_FILTER_PTP_V2_SYNC) |
-			(1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ) |
-			(1 << HWTSTAMP_FILTER_PTP_V2_EVENT) |
-			(1 << HWTSTAMP_FILTER_SOME);
+			(1 << HWTSTAMP_FILTER_PTP_V2_EVENT);
 		break;
 #endif /* CONFIG_IXGBE_PTP */
 	default:
-- 
1.7.11.4

^ permalink raw reply related

* [net 2/5] ixgbe: (PTP) Fix PPS interrupt code
From: Jeff Kirsher @ 2012-10-03 22:27 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, stable, Jeff Kirsher
In-Reply-To: <1349303231-28855-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

Driver was enabling PPS interrupt even when user wasn't enabling it via the
ptp core. This patch fixes the PPS so that it is only enabled explicitly, and
moves the interrupt enabling code into the correct location in the driver

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Cc: Stable <stable@vger.kernel.org> [3.5+]
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 13 +++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c  | 96 +++++++++++----------------
 2 files changed, 48 insertions(+), 61 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 868af69..c407b2f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2322,6 +2322,12 @@ static inline void ixgbe_irq_enable(struct ixgbe_adapter *adapter, bool queues,
 	default:
 		break;
 	}
+
+#ifdef CONFIG_IXGBE_PTP
+	if (adapter->hw.mac.type == ixgbe_mac_X540)
+		mask |= IXGBE_EIMS_TIMESYNC;
+#endif
+
 	if ((adapter->flags & IXGBE_FLAG_FDIR_HASH_CAPABLE) &&
 	    !(adapter->flags2 & IXGBE_FLAG2_FDIR_REQUIRES_REINIT))
 		mask |= IXGBE_EIMS_FLOW_DIR;
@@ -2385,8 +2391,10 @@ static irqreturn_t ixgbe_msix_other(int irq, void *data)
 	}
 
 	ixgbe_check_fan_failure(adapter, eicr);
+
 #ifdef CONFIG_IXGBE_PTP
-	ixgbe_ptp_check_pps_event(adapter, eicr);
+	if (unlikely(eicr & IXGBE_EICR_TIMESYNC))
+		ixgbe_ptp_check_pps_event(adapter, eicr);
 #endif
 
 	/* re-enable the original interrupt state, no lsc, no queues */
@@ -2580,7 +2588,8 @@ static irqreturn_t ixgbe_intr(int irq, void *data)
 
 	ixgbe_check_fan_failure(adapter, eicr);
 #ifdef CONFIG_IXGBE_PTP
-	ixgbe_ptp_check_pps_event(adapter, eicr);
+	if (unlikely(eicr & IXGBE_EICR_TIMESYNC))
+		ixgbe_ptp_check_pps_event(adapter, eicr);
 #endif
 
 	/* would disable interrupts here but EIAM disabled it */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
index 58d930d..d929131 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
@@ -106,39 +106,43 @@ static struct sock_filter ptp_filter[] = {
 };
 
 /**
- * ixgbe_ptp_enable_sdp
+ * ixgbe_ptp_setup_sdp
  * @hw: the hardware private structure
- * @shift: the clock shift for calculating nanoseconds
  *
- * this function enables the clock out feature on the sdp0 for the
- * X540 device. It will create a 1second periodic output that can be
- * used as the PPS (via an interrupt).
+ * this function enables or disables the clock out feature on SDP0 for
+ * the X540 device. It will create a 1second periodic output that can
+ * be used as the PPS (via an interrupt).
  *
  * It calculates when the systime will be on an exact second, and then
  * aligns the start of the PPS signal to that value. The shift is
  * necessary because it can change based on the link speed.
  */
-static void ixgbe_ptp_enable_sdp(struct ixgbe_adapter *adapter)
+static void ixgbe_ptp_setup_sdp(struct ixgbe_adapter *adapter)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
 	int shift = adapter->cc.shift;
 	u32 esdp, tsauxc, clktiml, clktimh, trgttiml, trgttimh, rem;
 	u64 ns = 0, clock_edge = 0;
 
-	switch (hw->mac.type) {
-	case ixgbe_mac_X540:
+	if ((adapter->flags2 & IXGBE_FLAG2_PTP_PPS_ENABLED) &&
+	    (hw->mac.type == ixgbe_mac_X540)) {
+
+		/* disable the pin first */
+		IXGBE_WRITE_REG(hw, IXGBE_TSAUXC, 0x0);
+		IXGBE_WRITE_FLUSH(hw);
+
 		esdp = IXGBE_READ_REG(hw, IXGBE_ESDP);
 
 		/*
-		 * enable the SDP0 pin as output, and connected to the native
-		 * function for Timesync (ClockOut)
+		 * enable the SDP0 pin as output, and connected to the
+		 * native function for Timesync (ClockOut)
 		 */
 		esdp |= (IXGBE_ESDP_SDP0_DIR |
 			 IXGBE_ESDP_SDP0_NATIVE);
 
 		/*
-		 * enable the Clock Out feature on SDP0, and allow interrupts
-		 * to occur when the pin changes
+		 * enable the Clock Out feature on SDP0, and allow
+		 * interrupts to occur when the pin changes
 		 */
 		tsauxc = (IXGBE_TSAUXC_EN_CLK |
 			  IXGBE_TSAUXC_SYNCLK |
@@ -171,25 +175,10 @@ static void ixgbe_ptp_enable_sdp(struct ixgbe_adapter *adapter)
 
 		IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp);
 		IXGBE_WRITE_REG(hw, IXGBE_TSAUXC, tsauxc);
-
-		IXGBE_WRITE_REG(hw, IXGBE_EIMS, IXGBE_EICR_TIMESYNC);
-		IXGBE_WRITE_FLUSH(hw);
-		break;
-	default:
-		break;
+	} else {
+		IXGBE_WRITE_REG(hw, IXGBE_TSAUXC, 0x0);
 	}
-}
 
-/**
- * ixgbe_ptp_disable_sdp
- * @hw: the private hardware structure
- *
- * this function disables the auxiliary SDP clock out feature
- */
-static void ixgbe_ptp_disable_sdp(struct ixgbe_hw *hw)
-{
-	IXGBE_WRITE_REG(hw, IXGBE_EIMC, IXGBE_EICR_TIMESYNC);
-	IXGBE_WRITE_REG(hw, IXGBE_TSAUXC, 0);
 	IXGBE_WRITE_FLUSH(hw);
 }
 
@@ -275,7 +264,6 @@ static int ixgbe_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 	unsigned long flags;
 	u64 now;
 
-	ixgbe_ptp_disable_sdp(&adapter->hw);
 	spin_lock_irqsave(&adapter->tmreg_lock, flags);
 
 	now = timecounter_read(&adapter->tc);
@@ -287,7 +275,8 @@ static int ixgbe_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 			 now);
 
 	spin_unlock_irqrestore(&adapter->tmreg_lock, flags);
-	ixgbe_ptp_enable_sdp(adapter);
+
+	ixgbe_ptp_setup_sdp(adapter);
 
 	return 0;
 }
@@ -337,14 +326,12 @@ static int ixgbe_ptp_settime(struct ptp_clock_info *ptp,
 	ns = ts->tv_sec * 1000000000ULL;
 	ns += ts->tv_nsec;
 
-	ixgbe_ptp_disable_sdp(&adapter->hw);
-
 	/* reset the timecounter */
 	spin_lock_irqsave(&adapter->tmreg_lock, flags);
 	timecounter_init(&adapter->tc, &adapter->cc, ns);
 	spin_unlock_irqrestore(&adapter->tmreg_lock, flags);
 
-	ixgbe_ptp_enable_sdp(adapter);
+	ixgbe_ptp_setup_sdp(adapter);
 	return 0;
 }
 
@@ -375,8 +362,9 @@ static int ixgbe_ptp_enable(struct ptp_clock_info *ptp,
 			if (on)
 				adapter->flags2 |= IXGBE_FLAG2_PTP_PPS_ENABLED;
 			else
-				adapter->flags2 &=
-					~IXGBE_FLAG2_PTP_PPS_ENABLED;
+				adapter->flags2 &= ~IXGBE_FLAG2_PTP_PPS_ENABLED;
+
+			ixgbe_ptp_setup_sdp(adapter);
 			return 0;
 		default:
 			break;
@@ -399,21 +387,12 @@ void ixgbe_ptp_check_pps_event(struct ixgbe_adapter *adapter, u32 eicr)
 	struct ixgbe_hw *hw = &adapter->hw;
 	struct ptp_clock_event event;
 
-	event.type = PTP_CLOCK_PPS;
-
-	/* Make sure ptp clock is valid, and PPS event enabled */
-	if (!adapter->ptp_clock ||
-	    !(adapter->flags2 & IXGBE_FLAG2_PTP_PPS_ENABLED))
-		return;
-
-	if (unlikely(eicr & IXGBE_EICR_TIMESYNC)) {
-		switch (hw->mac.type) {
-		case ixgbe_mac_X540:
-			ptp_clock_event(adapter->ptp_clock, &event);
-			break;
-		default:
-			break;
-		}
+	switch (hw->mac.type) {
+	case ixgbe_mac_X540:
+		ptp_clock_event(adapter->ptp_clock, &event);
+		break;
+	default:
+		break;
 	}
 }
 
@@ -831,9 +810,6 @@ void ixgbe_ptp_start_cyclecounter(struct ixgbe_adapter *adapter)
 	if (adapter->cycle_speed == cycle_speed && timinca)
 		return;
 
-	/* disable the SDP clock out */
-	ixgbe_ptp_disable_sdp(hw);
-
 	/**
 	 * Scale the NIC cycle counter by a large factor so that
 	 * relatively small corrections to the frequency can be added
@@ -907,10 +883,11 @@ void ixgbe_ptp_start_cyclecounter(struct ixgbe_adapter *adapter)
 
 	spin_unlock_irqrestore(&adapter->tmreg_lock, flags);
 
-	/* Now that the shift has been calculated and the systime
+	/*
+	 * Now that the shift has been calculated and the systime
 	 * registers reset, (re-)enable the Clock out feature
 	 */
-	ixgbe_ptp_enable_sdp(adapter);
+	ixgbe_ptp_setup_sdp(adapter);
 }
 
 /**
@@ -989,10 +966,11 @@ void ixgbe_ptp_init(struct ixgbe_adapter *adapter)
  */
 void ixgbe_ptp_stop(struct ixgbe_adapter *adapter)
 {
-	ixgbe_ptp_disable_sdp(&adapter->hw);
-
 	/* stop the overflow check task */
-	adapter->flags2 &= ~IXGBE_FLAG2_OVERFLOW_CHECK_ENABLED;
+	adapter->flags2 &= ~(IXGBE_FLAG2_OVERFLOW_CHECK_ENABLED |
+			     IXGBE_FLAG2_PTP_PPS_ENABLED);
+
+	ixgbe_ptp_setup_sdp(adapter);
 
 	if (adapter->ptp_clock) {
 		ptp_clock_unregister(adapter->ptp_clock);
-- 
1.7.11.4

^ permalink raw reply related

* [net 5/5] ixgbe: add support for X540-AT1
From: Jeff Kirsher @ 2012-10-03 22:27 UTC (permalink / raw)
  To: davem; +Cc: joshua.a.hay@intel.com, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1349303231-28855-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: "joshua.a.hay@intel.com" <joshua.a.hay@intel.com>

This patch adds device support for Ethernet Controller X540-AT1.

Signed-off-by: Josh Hay <joshua.a.hay@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   | 2 ++
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h   | 1 +
 3 files changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index 90e41db..dbf37e4 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -70,6 +70,7 @@ static s32 ixgbe_device_supports_autoneg_fc(struct ixgbe_hw *hw)
 
 	switch (hw->device_id) {
 	case IXGBE_DEV_ID_X540T:
+	case IXGBE_DEV_ID_X540T1:
 		return 0;
 	case IXGBE_DEV_ID_82599_T3_LOM:
 		return 0;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index c407b2f..fa3d552 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -114,6 +114,7 @@ static DEFINE_PCI_DEVICE_TABLE(ixgbe_pci_tbl) = {
 	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_LS), board_82599 },
 	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599EN_SFP), board_82599 },
 	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_SFP_SF_QP), board_82599 },
+	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X540T1), board_X540 },
 	/* required last entry */
 	{0, }
 };
@@ -7054,6 +7055,7 @@ int ixgbe_wol_supported(struct ixgbe_adapter *adapter, u16 device_id,
 		is_wol_supported = 1;
 		break;
 	case IXGBE_DEV_ID_X540T:
+	case IXGBE_DEV_ID_X540T1:
 		/* check eeprom to see if enabled wol */
 		if ((wol_cap == IXGBE_DEVICE_CAPS_WOL_PORT0_1) ||
 		    ((wol_cap == IXGBE_DEVICE_CAPS_WOL_PORT0) &&
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index 400f86a..0722f33 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -65,6 +65,7 @@
 #define IXGBE_DEV_ID_82599_LS            0x154F
 #define IXGBE_DEV_ID_X540T               0x1528
 #define IXGBE_DEV_ID_82599_SFP_SF_QP     0x154A
+#define IXGBE_DEV_ID_X540T1              0x1560
 
 /* VF Device IDs */
 #define IXGBE_DEV_ID_82599_VF           0x10ED
-- 
1.7.11.4

^ permalink raw reply related

* [net 4/5] ixgbe: fix poll loop for FDIRCTRL.INIT_DONE bit
From: Jeff Kirsher @ 2012-10-03 22:27 UTC (permalink / raw)
  To: davem; +Cc: Emil Tantilov, netdev, gospo, sassmann, shashi-sm, Jeff Kirsher
In-Reply-To: <1349303231-28855-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Emil Tantilov <emil.s.tantilov@intel.com>

The loop in ixgbe_reinit_fdir_tables_82599() only polls for up to 100us
resulting in failures to update the FDIR filter table at 1Gbps and 10Gbps
when under load.

The poll times for FDIRCTRL.INIT_DONE are 55us, 550us and 5.5ms for 10Gbps,
1Gbps and 100Mbps respectively.

This patch sets the wait time to be the same as in ixgbe_fdir_enable_82599()

Reported-by: Bhushan <shashi-sm@users.sf.net>
Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
index 18bf08c..1077cb2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
@@ -1099,7 +1099,7 @@ s32 ixgbe_reinit_fdir_tables_82599(struct ixgbe_hw *hw)
 		if (IXGBE_READ_REG(hw, IXGBE_FDIRCTRL) &
 		                   IXGBE_FDIRCTRL_INIT_DONE)
 			break;
-		udelay(10);
+		usleep_range(1000, 2000);
 	}
 	if (i >= IXGBE_FDIR_INIT_DONE_POLL) {
 		hw_dbg(hw, "Flow Director Signature poll time exceeded!\n");
-- 
1.7.11.4

^ permalink raw reply related

* Re: [PATCH 0/3] virtio-net: inline header support
From: Rusty Russell @ 2012-10-04  0:11 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Michael S. Tsirkin, Thomas Lendacky, kvm, netdev, linux-kernel,
	virtualization, avi, Sasha Levin
In-Reply-To: <506C192E.5060700@redhat.com>

Paolo Bonzini <pbonzini@redhat.com> writes:

> Il 03/10/2012 08:44, Rusty Russell ha scritto:
>> There's a reason I haven't done this.  I really, really dislike "my
>> implemention isn't broken" feature bits.  We could have an infinite
>> number of them, for each bug in each device.
>
> However, this bug affects (almost) all implementations and (almost) all
> devices.  It even makes sense to reserve a transport feature bit for it
> instead of a device feature bit.
>
> Paolo

Perhaps, but we have to fix the bugs first!

As I said, my torture patch broke qemu immediately.  Since noone has
leapt onto fixing that, I'll take a look now...

Cheers,
Rusty.

^ permalink raw reply

* linux-next: build failure after merge of the net tree
From: Stephen Rothwell @ 2012-10-04  0:18 UTC (permalink / raw)
  To: David Miller, netdev
  Cc: linux-next, linux-kernel, Vipul Pandya, Jay Hernandez

[-- Attachment #1: Type: text/plain, Size: 1089 bytes --]

Hi all,

After merging the net tree, today's linux-next build (powerpc
ppc64_defconfig) failed like this:

drivers/net/ethernet/chelsio/cxgb4/t4_hw.c: In function 't4_memory_rw':
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c:420:2: error: implicit declaration of function 'vmalloc' [-Werror=implicit-function-declaration]
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c:420:7: warning: assignment makes pointer from integer without a cast [enabled by default]
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c:487:2: error: implicit declaration of function 'vfree' [-Werror=implicit-function-declaration]
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c: In function 'get_vpd_params':
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c:530:6: warning: assignment makes pointer from integer without a cast [enabled by default]

Caused by commit 8c357ebd5693 ("cxgb4: Dynamically allocate memory in
t4_memory_rw() and get_vpd_params()").  Missing include of
linux/vmalloc.h.

I have used the net tree from next-20121003 for today.
-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH 2/3] virtio-net: correct capacity math on ring full
From: Rusty Russell @ 2012-10-04  0:24 UTC (permalink / raw)
  To: Michael S. Tsirkin, Thomas Lendacky
  Cc: Sasha Levin, virtualization, linux-kernel, avi, kvm, netdev
In-Reply-To: <f2f7123534aefb33a7a233d9fec56de6e9eeed73.1348824232.git.mst@redhat.com>

"Michael S. Tsirkin" <mst@redhat.com> writes:
> Capacity math on ring full is wrong: we are
> looking at num_sg but that might be optimistic
> because of indirect buffer use.
>
> The implementation also penalizes fast path
> with extra memory accesses for the benefit of
> ring full condition handling which is slow path.
>
> It's easy to query ring capacity so let's do just that.

This path will reduce the actual queue use to worst-case assumptions.
With bufferbloat maybe that's a good thing, but it's true.

If we do this, the code is now wrong:

	/* This can happen with OOM and indirect buffers. */
	if (unlikely(capacity < 0)) {

Because this should now *never* happen.

But I do like the cleanup; returning capacity from add_buf() was always
hacky.  I've got an idea, we'll see what it looks like...

Cheers,
Rusty.

^ permalink raw reply

* Re: [PATCH 1/2] Fix build error caused by broken PCH_PTP module dependency.
From: David Miller @ 2012-10-04  0:43 UTC (permalink / raw)
  To: bhutchings; +Cc: haicheng.li, netdev, tshimizu818, linux-kernel, haicheng.lee
In-Reply-To: <1349300710.2763.8.camel@bwh-desktop.uk.solarflarecom.com>

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 3 Oct 2012 22:45:10 +0100

> I thought of it as being a peripheral feature (which most Solarflare
> hardware doesn't implement) so it made sense for SFC_PTP to be optional
> like SFC_MTD and so on.  But I'm quite happy to use a select instead, if
> you want that to be the convention for all drivers implementing PHC.

I think that consistency might trump those conerns you mentioned, at
least in this case.

^ permalink raw reply

* policy routing vs dnat replies
From: Andy Lutomirski @ 2012-10-04  0:44 UTC (permalink / raw)
  To: Network Development

I hit an annoying policy routing corner case today.  I have a router
with two WAN interfaces (and no BGP).  I have policy routing set up so
that, if a source address matches either of my public networks, then
outgoing packets use the correct interface.  If neither rule matches
(e.g. the source is 0.0.0.0 for source address selection), then the
default route is whichever one I prefer at the moment.  It looks like
this:

$ ip rule
0:	from all lookup local
32766:	from all lookup main
32767:	from all lookup default
40000:	from <net2> lookup isp2
40001:	from <net1> lookup isp1
40010:	from all lookup real_default

The relevant routes are:

default via <gw1> dev eth0.2  table isp1  src <src1>
default via <gw2> dev eth0.3  table isp2  src <src2>
default via <gw2> dev eth0.3  table real_default  src <src2>  metric 101
default via <gw1> dev eth0.2  table real_default  src <src1>  metric 102

(Yes, this is a bit verbose, but I don't know a more concise way to do this.)

This works nicely: if I specifically bind to one of my public
addresses, the corresponding WAN link is used, and if not or if I'm
coming from a private address, then the metrics determine which link
to use.

DNAT breaks it.  I have a rule:
-A PREROUTING -i eth0.2 -d <ip1> -p tcp --dport <port> -j DNAT --to
<internal host>

<ip1> lives on isp1.  Someone sends a SYN.  It gets routed to the
internal host, and that host sends a SYN/ACK back.  The SYN/ACK has a
source ip that isn't on net1 or net2, so it matches the 'lookup
real_default' rule and gets routed to *gw2*.  iptables rewrites the
source address after the routing decision, and my router sends a
packet with a source address belonging to isp1 to isp2's gateway.  The
packet is then dropped.

Is there any way I can either convince iptables to rewrite the source
address in the prerouting hook or to query the conntrack source
address from the policy rule?  Is there a better solution?  I'm
currently using a somewhat gross combination of MARK and fwmark
matches to work around this problem.  One possibility would be:

Thanks,
Andy

P.S.  Linux 3.2 (at least) appears to have a bug: the SYN/ACK has
ctdir ORIGINAL as seen from the the mangle PREROUTING chain.  I'll
send a real bug report for that if I can reproduce it cleanly on a
newer kernel.

^ permalink raw reply

* Re: linux-next: build failure after merge of the net tree
From: David Miller @ 2012-10-04  0:50 UTC (permalink / raw)
  To: sfr; +Cc: netdev, linux-next, linux-kernel, vipul, jay
In-Reply-To: <20121004101833.019598b7e474c7b74e67ff20@canb.auug.org.au>

From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 4 Oct 2012 10:18:33 +1000

> After merging the net tree, today's linux-next build (powerpc
> ppc64_defconfig) failed like this:
> 
> drivers/net/ethernet/chelsio/cxgb4/t4_hw.c: In function 't4_memory_rw':
> drivers/net/ethernet/chelsio/cxgb4/t4_hw.c:420:2: error: implicit declaration of function 'vmalloc' [-Werror=implicit-function-declaration]
> drivers/net/ethernet/chelsio/cxgb4/t4_hw.c:420:7: warning: assignment makes pointer from integer without a cast [enabled by default]
> drivers/net/ethernet/chelsio/cxgb4/t4_hw.c:487:2: error: implicit declaration of function 'vfree' [-Werror=implicit-function-declaration]
> drivers/net/ethernet/chelsio/cxgb4/t4_hw.c: In function 'get_vpd_params':
> drivers/net/ethernet/chelsio/cxgb4/t4_hw.c:530:6: warning: assignment makes pointer from integer without a cast [enabled by default]
> 
> Caused by commit 8c357ebd5693 ("cxgb4: Dynamically allocate memory in
> t4_memory_rw() and get_vpd_params()").  Missing include of
> linux/vmalloc.h.
> 
> I have used the net tree from next-20121003 for today.

Thanks, I'll fix this as is shown below.

I do have a question though, it is honestly really that much easier to
revert a whole days worth of changes (and therefore not get the code
tested at all) than to simply add the obvious one liner?

It seems to me to be absolutely the wrong tradeoff in these situations.

====================
[PATCH] cxgb4: Fix build error due to missing linux/vmalloc.h include.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 745a1f5..31752b2 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -43,6 +43,7 @@
 #include <linux/pci.h>
 #include <linux/spinlock.h>
 #include <linux/timer.h>
+#include <linux/vmalloc.h>
 #include <asm/io.h>
 #include "cxgb4_uld.h"
 #include "t4_hw.h"
-- 
1.7.11.4

^ permalink raw reply related

* Re: [PATCH 16/20] drivers/net/ethernet/renesas/sh_eth.c: fix error return code
From: Nobuhiro Iwamatsu @ 2012-10-04  0:52 UTC (permalink / raw)
  To: Peter Senna Tschudin
  Cc: davem, kernel-janitors, yoshihiro.shimoda.uh, netdev,
	linux-kernel
In-Reply-To: <1349281090-10013-15-git-send-email-peter.senna@gmail.com>

Peter Senna Tschudin さんは書きました:
> From: Peter Senna Tschudin <peter.senna@gmail.com>
> 
> Convert a nonnegative error return code to a negative one, as returned
> elsewhere in the function.
> 
> A simplified version of the semantic match that finds this problem is as
> follows: (http://coccinelle.lip6.fr/)
> 
> // <smpl>
> (
> if@p1 (\(ret < 0\|ret != 0\))
>  { ... return ret; }
> |
> ret@p1 = 0
> )
> ... when != ret = e1
>     when != &ret
> *if(...)
> {
>   ... when != ret = e2
>       when forall
>  return ret;
> }
> // </smpl>
> 
> Signed-off-by: Peter Senna Tschudin <peter.senna@gmail.com>

Acked-by: Nobuhiro Iwamatsu <nobuhiro.iwamatsu.yj@renesas.com>

^ permalink raw reply

* Re: linux-next: build failure after merge of the net tree
From: Stephen Rothwell @ 2012-10-04  1:06 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-next, linux-kernel, vipul, jay
In-Reply-To: <20121003.205053.1711360628623711137.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 1295 bytes --]

Hi David,

On Wed, 03 Oct 2012 20:50:53 -0400 (EDT) David Miller <davem@davemloft.net> wrote:
>
> I do have a question though, it is honestly really that much easier to
> revert a whole days worth of changes (and therefore not get the code
> tested at all) than to simply add the obvious one liner?

Actually, for me it is.  I have a script that does the "use yesterday's
version" for me.  To fix (even a one liner) means bringing up an editor,
commiting, creating the patch and then recommiting it (an implementation
detail) and recording that I need to keep (automatically) applying the
patch in case the maintainer doesn't react quickly.

In this particular case I have been telling people to include vmalloc.h
(and other things like slab.h) over and over for years ... its a pain
that x86 builds indirectly include so much stuff.

> It seems to me to be absolutely the wrong tradeoff in these situations.

I guess for the "current/fixes" tree during the merge window, you are
right.  For the "normal" trees, does a delay of (usually) one day really
matter?

I used to fix all this stuff and it added considerably to the length of
my work day (which currently can be up to 16 hours long) :-(

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH 0/3] virtio-net: inline header support
From: Anthony Liguori @ 2012-10-04  1:24 UTC (permalink / raw)
  To: Rusty Russell, Michael S. Tsirkin, Thomas Lendacky
  Cc: Sasha Levin, virtualization, linux-kernel, avi, kvm, netdev
In-Reply-To: <87vces2gxq.fsf@rustcorp.com.au>

Rusty Russell <rusty@rustcorp.com.au> writes:

> "Michael S. Tsirkin" <mst@redhat.com> writes:
>
>> Thinking about Sasha's patches, we can reduce ring usage
>> for virtio net small packets dramatically if we put
>> virtio net header inline with the data.
>> This can be done for free in case guest net stack allocated
>> extra head room for the packet, and I don't see
>> why would this have any downsides.
>
> I've been wanting to do this for the longest time... but...
>
>> Even though with my recent patches qemu
>> no longer requires header to be the first s/g element,
>> we need a new feature bit to detect this.
>> A trivial qemu patch will be sent separately.
>
> There's a reason I haven't done this.  I really, really dislike "my
> implemention isn't broken" feature bits.  We could have an infinite
> number of them, for each bug in each device.

This is a bug in the specification.

The QEMU implementation pre-dates the specification.  All of the actual
implementations of virtio relied on the semantics of s/g elements and
still do.

What's in the specification really doesn't matter when it doesn't agree
with all of the existing implementations.

Users use implementations, not specifications.  The specification really
ought to be changed here.

Regards,

Anthony Liguori

^ permalink raw reply

* Re: [PATCH 0/3] virtio-net: inline header support
From: Anthony Liguori @ 2012-10-04  1:35 UTC (permalink / raw)
  To: Rusty Russell, Michael S. Tsirkin, Thomas Lendacky
  Cc: Sasha Levin, virtualization, linux-kernel, avi, kvm, netdev
In-Reply-To: <87vces2gxq.fsf@rustcorp.com.au>

Rusty Russell <rusty@rustcorp.com.au> writes:

> "Michael S. Tsirkin" <mst@redhat.com> writes:
>
> There's a reason I haven't done this.  I really, really dislike "my
> implemention isn't broken" feature bits.  We could have an infinite
> number of them, for each bug in each device.
>
> So my plan was to tie this assumption to the new PCI layout.  And have a
> stress-testing patch like the one below in the kernel (see my virtio-wip
> branch for stuff like this).  Turn it on at boot with
> "virtio_ring.torture" on the kernel commandline.
>
> BTW, I've fixed lguest, but my kvm here (Ubuntu precise, kvm-qemu 1.0)
> is too old.  Building the latest git now...
>
> Cheers,
> Rusty.
>
> Subject: virtio: CONFIG_VIRTIO_DEVICE_TORTURE
>
> Virtio devices are not supposed to depend on the framing of the scatter-gather
> lists, but various implementations did.  Safeguard this in future by adding
> an option to deliberately create perverse descriptors.
>
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

Ignore framing is really a bad idea.  You want backends to enforce
reasonable framing because guest's shouldn't do silly things with framing.

For instance, with virtio-blk, if you want decent performance, you
absolutely want to avoid bouncing the data.  If you're using O_DIRECT in
the host to submit I/O requests, then it's critical that all of the s/g
elements are aligned to a sector boundary and sized to a sector
boundary.

Yes, QEMU can handle if that's not the case, but it would be insanely
stupid for a guest not to do this.  This is the sort of thing that ought
to be enforced in the specification because a guest cannot perform well
if it doesn't follow these rules.

A spec isn't terribly useful if the result is guest drivers that are
slow.  There's very little to gain by not enforcing rules around framing
and there's a lot to lose if a guest frames incorrectly.

In the rare case where we want to make a framing change, we should use
feature bits like Michael is proposing.

In this case, we should simply say that with the feature bit, the vnet
header can be in the same element as the data but not allow the header
to be spread across multiple elements.

Regards,

Anthony Liguori

>
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index 8d5bddb..930a4ea 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -5,6 +5,15 @@ config VIRTIO
>  	  bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
>  	  CONFIG_RPMSG or CONFIG_S390_GUEST.
>  
> +config VIRTIO_DEVICE_TORTURE
> +	bool "Virtio device torture tests"
> +	depends on VIRTIO && DEBUG_KERNEL
> +	help
> +	  This makes the virtio_ring implementation creatively change
> +	  the format of requests to make sure that devices are
> +	  properly implemented.  This will make your virtual machine
> +	  slow *and* unreliable!  Say N.
> +
>  menu "Virtio drivers"
>  
>  config VIRTIO_PCI
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index e639584..8893753 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -124,6 +124,149 @@ struct vring_virtqueue
>  
>  #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
>  
> +#ifdef CONFIG_VIRTIO_DEVICE_TORTURE
> +static bool torture;
> +module_param(torture, bool, 0644);
> +
> +struct torture {
> +	unsigned int orig_out, orig_in;
> +	void *orig_data;
> +	struct scatterlist sg[4];
> +	struct scatterlist orig_sg[];
> +};
> +
> +static size_t tot_len(struct scatterlist sg[], unsigned num)
> +{
> +	size_t len, i;
> +
> +	for (len = 0, i = 0; i < num; i++)
> +		len += sg[i].length;
> +
> +	return len;
> +}
> +
> +static void copy_sg_data(const struct scatterlist *dst, unsigned dnum,
> +			 const struct scatterlist *src, unsigned snum)
> +{
> +	unsigned len;
> +	struct scatterlist s, d;
> +
> +	s = *src;
> +	d = *dst;
> +
> +	while (snum && dnum) {
> +		len = min(s.length, d.length);
> +		memcpy(sg_virt(&d), sg_virt(&s), len);
> +		d.offset += len;
> +		d.length -= len;
> +		s.offset += len;
> +		s.length -= len;
> +		if (!s.length) {
> +			BUG_ON(snum == 0);
> +			src++;
> +			snum--;
> +			s = *src;
> +		}
> +		if (!d.length) {
> +			BUG_ON(dnum == 0);
> +			dst++;
> +			dnum--;
> +			d = *dst;
> +		}
> +	}
> +}
> +
> +static bool torture_replace(struct scatterlist **sg,
> +			     unsigned int *out,
> +			     unsigned int *in,
> +			     void **data,
> +			     gfp_t gfp)
> +{
> +	static size_t seed;
> +	struct torture *t;
> +	size_t outlen, inlen, ourseed, len1;
> +	void *buf;
> +
> +	if (!torture)
> +		return true;
> +
> +	outlen = tot_len(*sg, *out);
> +	inlen = tot_len(*sg + *out, *in);
> +
> +	/* This will break horribly on large block requests. */
> +	t = kmalloc(sizeof(*t) + (*out + *in) * sizeof(t->orig_sg[1])
> +		    + outlen + 1 + inlen + 1, gfp);
> +	if (!t)
> +		return false;
> +
> +	sg_init_table(t->sg, 4);
> +	buf = &t->orig_sg[*out + *in];
> +
> +	memcpy(t->orig_sg, *sg, sizeof(**sg) * (*out + *in));
> +	t->orig_out = *out;
> +	t->orig_in = *in;
> +	t->orig_data = *data;
> +	*data = t;
> +
> +	ourseed = ACCESS_ONCE(seed);
> +	seed++;
> +
> +	*sg = t->sg;
> +	if (outlen) {
> +		/* Split outbuf into two parts, one byte apart. */
> +		*out = 2;
> +		len1 = ourseed % (outlen + 1);
> +		sg_set_buf(&t->sg[0], buf, len1);
> +		buf += len1 + 1;
> +		sg_set_buf(&t->sg[1], buf, outlen - len1);
> +		buf += outlen - len1;
> +		copy_sg_data(t->sg, *out, t->orig_sg, t->orig_out);
> +	}
> +
> +	if (inlen) {
> +		/* Split inbuf into two parts, one byte apart. */
> +		*in = 2;
> +		len1 = ourseed % (inlen + 1);
> +		sg_set_buf(&t->sg[*out], buf, len1);
> +		buf += len1 + 1;
> +		sg_set_buf(&t->sg[*out + 1], buf, inlen - len1);
> +		buf += inlen - len1;
> +	}
> +	return true;
> +}
> +
> +static void *torture_done(struct torture *t)
> +{
> +	void *data;
> +
> +	if (!torture)
> +		return t;
> +
> +	if (t->orig_in)
> +		copy_sg_data(t->orig_sg + t->orig_out, t->orig_in,
> +			     t->sg + (t->orig_out ? 2 : 0), 2);
> +
> +	data = t->orig_data;
> +	kfree(t);
> +	return data;
> +}
> +
> +#else
> +static bool torture_replace(struct scatterlist **sg,
> +			     unsigned int *out,
> +			     unsigned int *in,
> +			     void **data,
> +			     gfp_t gfp)
> +{
> +	return true;
> +}
> +
> +static void *torture_done(void *data)
> +{
> +	return data;
> +}
> +#endif /* CONFIG_VIRTIO_DEVICE_TORTURE */
> +
>  /* Set up an indirect table of descriptors and add it to the queue. */
>  static int vring_add_indirect(struct vring_virtqueue *vq,
>  			      struct scatterlist sg[],
> @@ -213,6 +356,9 @@ int virtqueue_add_buf(struct virtqueue *_vq,
>  
>  	BUG_ON(data == NULL);
>  
> +	if (!torture_replace(&sg, &out, &in, &data, gfp))
> +		return -ENOMEM;
> +
>  #ifdef DEBUG
>  	{
>  		ktime_t now = ktime_get();
> @@ -246,6 +392,7 @@ int virtqueue_add_buf(struct virtqueue *_vq,
>  		if (out)
>  			vq->notify(&vq->vq);
>  		END_USE(vq);
> +		torture_done(data);
>  		return -ENOSPC;
>  	}
>  
> @@ -476,7 +623,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
>  #endif
>  
>  	END_USE(vq);
> -	return ret;
> +	return torture_done(ret);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_get_buf);
>  
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: linux-next: build failure after merge of the net tree
From: David Miller @ 2012-10-04  1:47 UTC (permalink / raw)
  To: sfr; +Cc: netdev, linux-next, linux-kernel, vipul, jay
In-Reply-To: <20121004110615.3fda3c25735284c776106d7d@canb.auug.org.au>

From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 4 Oct 2012 11:06:15 +1000

> Actually, for me it is.  I have a script that does the "use yesterday's
> version" for me.  To fix (even a one liner) means bringing up an editor,
> commiting, creating the patch and then recommiting it (an implementation
> detail) and recording that I need to keep (automatically) applying the
> patch in case the maintainer doesn't react quickly.

Ok, fair enough.

I'll try to look into making x86 barf when a vmalloc.h include is
missing.

^ permalink raw reply

* Re: [PATCH 3/3] vxlan: virtual extensible lan
From: Jesse Gross @ 2012-10-04  1:54 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: davem, netdev
In-Reply-To: <20121001223254.349753999@vyatta.com>

On Mon, Oct 1, 2012 at 3:32 PM, Stephen Hemminger <shemminger@vyatta.com> wrote:
> --- /dev/null   1970-01-01 00:00:00.000000000 +0000
> +++ b/drivers/net/vxlan.c       2012-10-01 15:08:38.024499080 -0700
> +/* Transmit local packets over Vxlan
> + *
> + * Outer IP header inherits ECN and DF from inner header.
> + * Outer UDP destination is the VXLAN assigned port.
> + *           source port is based on hash of flow if available
> + *                       otherwise use a random value
> + */
> +static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
[...]
> +       hash = skb_get_rxhash(skb);
[...]
> +       uh->dest = htons(vxlan_port);
> +       uh->source = hash ? :random32();

I think this may lead to packet reordering in some cases.  If the
protocol is unknown then it may still represent part of a flow and
random source ports could lead to taking different paths.

Also, does it make sense to restrict the range of ports to, say, the
upper half of the range?

^ permalink raw reply

* RE: Possible bug with r8169 driver
From: hayeswang @ 2012-10-04  2:09 UTC (permalink / raw)
  To: 'Francois Romieu'; +Cc: 'Nolwenn', netdev, 'nic_swsd'
In-Reply-To: <20121001185629.GA4922@electric-eye.fr.zoreil.com>

Francois Romieu [mailto:romieu@fr.zoreil.com] 
[...]
> diff --git a/drivers/net/ethernet/realtek/r8169.c 
> b/drivers/net/ethernet/realtek/r8169.c
> index eb81da4..e0f1b8d 100644
> --- a/drivers/net/ethernet/realtek/r8169.c
> +++ b/drivers/net/ethernet/realtek/r8169.c
> @@ -4271,8 +4271,8 @@ static void rtl_set_rx_mode(struct 
> net_device *dev)
>  		mc_filter[1] = swab32(data);
>  	}
>  
> -	RTL_W32(MAR0 + 4, mc_filter[1]);
> -	RTL_W32(MAR0 + 0, mc_filter[0]);
> +	RTL_W32(MAR0 + 4, 0xffffffff);
> +	RTL_W32(MAR0 + 0, 0xffffffff);
>  
>  	RTL_W32(RxConfig, tmp);
>  }

The RTL_GIGA_MAC_VER_35 is designed without multicast hardware filter, so you
should set IO 0x08 ~ 0x0f to 0xff for receiving all multicast packets without
filtering them.

Best Regards,
Hayes

^ permalink raw reply

* RE: linux-next: build failure after merge of the net tree
From: Jay Hernandez @ 2012-10-04  2:12 UTC (permalink / raw)
  To: David Miller, sfr
  Cc: netdev, linux-next, linux-kernel, Vipul Pandya, Jay Hernandez
In-Reply-To: <20121003.214750.2022020970468412510.davem@davemloft.net>

David and Stephen,

I apologize for not taking the necessary steps to verify our patches
before submission. Ignorance is not an excuse... Anyways David thank you
for fixing the issue for us.

Jay-

-----Original Message-----
From: David Miller [mailto:davem@davemloft.net] 
Sent: Wednesday, October 03, 2012 6:48 PM
To: sfr@canb.auug.org.au
Cc: netdev@vger.kernel.org; linux-next@vger.kernel.org;
linux-kernel@vger.kernel.org; Vipul Pandya; Jay Hernandez
Subject: Re: linux-next: build failure after merge of the net tree

From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 4 Oct 2012 11:06:15 +1000

> Actually, for me it is.  I have a script that does the "use 
> yesterday's version" for me.  To fix (even a one liner) means bringing

> up an editor, commiting, creating the patch and then recommiting it 
> (an implementation
> detail) and recording that I need to keep (automatically) applying the

> patch in case the maintainer doesn't react quickly.

Ok, fair enough.

I'll try to look into making x86 barf when a vmalloc.h include is
missing.

^ permalink raw reply

* Re: [PATCH] ipv6: release referenct of ip6_null_entry's dst entry in __ip6_del_rt
From: Gao feng @ 2012-10-04  2:55 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <505FE24E.4030204@cn.fujitsu.com>

于 2012年09月24日 12:32, Gao feng 写道:
> 于 2012年09月22日 01:16, David Miller 写道:
>> From: Gao feng <gaofeng@cn.fujitsu.com>
>> Date: Thu, 20 Sep 2012 13:25:34 +0800
>>
>>> as we hold dst_entry before we call __ip6_del_rt,
>>> so we should alse call dst_release not only return
>>> -ENOENT when the rt6_info is ip6_null_entry.
>>>
>>> and we already hold the dst entry, so I think it's
>>> safe to call dst_release out of the write-read lock.
>>>
>>> Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
>>
>> I cannot find a code path where we can actually end up
>> with an ip6_null_entry here and I'd much rather declare
>> this condition as a bug.
>>
>> Patrick McHardy added this check back in 2006:
>>
>> 	commit 6c813a7297e3af4cd7c3458e09e9ee3d161c6830
>> 	Author: Patrick McHardy <kaber@trash.net>
>> 	Date:   Sun Aug 6 22:22:47 2006 -0700
>>
>> 	    [IPV6]: Fix crash in ip6_del_rt
>>
>> But the ipv6 code has changed substantially since then.
> 
> ip -6 route del ::/0 will end up with an ip6_null_entry.
> so I think this checking and this patch is needed .
> 

Hi David

Can you apply this patch?
thanks!

^ permalink raw reply

* Re: [PATCH] ipv6: release referenct of ip6_null_entry's dst entry in __ip6_del_rt
From: David Miller @ 2012-10-04  3:02 UTC (permalink / raw)
  To: gaofeng; +Cc: netdev
In-Reply-To: <506CFA96.4050207@cn.fujitsu.com>

From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Thu, 04 Oct 2012 10:55:18 +0800

> 于 2012年09月24日 12:32, Gao feng 写道:
>> 于 2012年09月22日 01:16, David Miller 写道:
>>> From: Gao feng <gaofeng@cn.fujitsu.com>
>>> Date: Thu, 20 Sep 2012 13:25:34 +0800
>>>
>>>> as we hold dst_entry before we call __ip6_del_rt,
>>>> so we should alse call dst_release not only return
>>>> -ENOENT when the rt6_info is ip6_null_entry.
>>>>
>>>> and we already hold the dst entry, so I think it's
>>>> safe to call dst_release out of the write-read lock.
>>>>
>>>> Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
 ...
> Can you apply this patch?

Sorry, I've been really busy this week.  I will try to get to
this soon.

Thanks.

^ permalink raw reply

* Re: [PATCH 0/3] virtio-net: inline header support
From: Rusty Russell @ 2012-10-04  3:34 UTC (permalink / raw)
  To: Anthony Liguori, Michael S. Tsirkin, Thomas Lendacky
  Cc: Sasha Levin, virtualization, linux-kernel, avi, kvm, netdev
In-Reply-To: <87mx033u74.fsf@codemonkey.ws>

Anthony Liguori <anthony@codemonkey.ws> writes:
> Rusty Russell <rusty@rustcorp.com.au> writes:
>
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>>
>>> Thinking about Sasha's patches, we can reduce ring usage
>>> for virtio net small packets dramatically if we put
>>> virtio net header inline with the data.
>>> This can be done for free in case guest net stack allocated
>>> extra head room for the packet, and I don't see
>>> why would this have any downsides.
>>
>> I've been wanting to do this for the longest time... but...
>>
>>> Even though with my recent patches qemu
>>> no longer requires header to be the first s/g element,
>>> we need a new feature bit to detect this.
>>> A trivial qemu patch will be sent separately.
>>
>> There's a reason I haven't done this.  I really, really dislike "my
>> implemention isn't broken" feature bits.  We could have an infinite
>> number of them, for each bug in each device.
>
> This is a bug in the specification.
>
> The QEMU implementation pre-dates the specification.  All of the actual
> implementations of virtio relied on the semantics of s/g elements and
> still do.

lguest fix is pending in my queue.  lkvm and qemu are broken; lkvm isn't
ever going to be merged, so I'm not sure what its status is?  But I'm
determined to fix qemu, and hence my torture patch to make sure this
doesn't creep in again.

> What's in the specification really doesn't matter when it doesn't agree
> with all of the existing implementations.
>
> Users use implementations, not specifications.  The specification really
> ought to be changed here.

I'm sorely tempted, except that we're losing a real optimization because
of this :(

The specification has long contained the footnote:

        The current qemu device implementations mistakenly insist that
        the first descriptor cover the header in these cases exactly, so
        a cautious driver should arrange it so.

I'd like to tie this caveat to the PCI capability change, so this note
will move to the appendix with the old PCI layout.

Cheers,
Rusty.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox