Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v3] iproute2: add support for tcp_metrics
From: Julian Anastasov @ 2012-10-03 22:07 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev

	ip tcp_metrics/tcpmetrics

	We support get/del for single entry and dump for
show/flush.

v3:
 - fix rtt/rttvar shifts as suggested by Eric Dumazet
 - show rtt/rttvar usecs as suggested by David Laight

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---

	Stephen, I see correct values for rtt/rttvar,
tcp_metric_set_msecs keeps them into u32 as msecs, so there is
no problem with HZ value.

v2: put family in req.n.nlmsg_type

 include/linux/tcp_metrics.h |   54 ++++++
 ip/Makefile                 |    2 +-
 ip/ip.c                     |    4 +-
 ip/ip_common.h              |    1 +
 ip/tcp_metrics.c            |  429 +++++++++++++++++++++++++++++++++++++++++++
 man/man8/Makefile           |    3 +-
 man/man8/ip-tcp_metrics.8   |  143 ++++++++++++++
 man/man8/ip.8               |    7 +-
 8 files changed, 639 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/tcp_metrics.h
 create mode 100644 ip/tcp_metrics.c
 create mode 100644 man/man8/ip-tcp_metrics.8

diff --git a/include/linux/tcp_metrics.h b/include/linux/tcp_metrics.h
new file mode 100644
index 0000000..cb5157b
--- /dev/null
+++ b/include/linux/tcp_metrics.h
@@ -0,0 +1,54 @@
+/* tcp_metrics.h - TCP Metrics Interface */
+
+#ifndef _LINUX_TCP_METRICS_H
+#define _LINUX_TCP_METRICS_H
+
+#include <linux/types.h>
+
+/* NETLINK_GENERIC related info
+ */
+#define TCP_METRICS_GENL_NAME		"tcp_metrics"
+#define TCP_METRICS_GENL_VERSION	0x1
+
+enum tcp_metric_index {
+	TCP_METRIC_RTT,
+	TCP_METRIC_RTTVAR,
+	TCP_METRIC_SSTHRESH,
+	TCP_METRIC_CWND,
+	TCP_METRIC_REORDERING,
+
+	/* Always last.  */
+	__TCP_METRIC_MAX,
+};
+
+#define TCP_METRIC_MAX	(__TCP_METRIC_MAX - 1)
+
+enum {
+	TCP_METRICS_ATTR_UNSPEC,
+	TCP_METRICS_ATTR_ADDR_IPV4,		/* u32 */
+	TCP_METRICS_ATTR_ADDR_IPV6,		/* binary */
+	TCP_METRICS_ATTR_AGE,			/* msecs */
+	TCP_METRICS_ATTR_TW_TSVAL,		/* u32, raw, rcv tsval */
+	TCP_METRICS_ATTR_TW_TS_STAMP,		/* s32, sec age */
+	TCP_METRICS_ATTR_VALS,			/* nested +1, u32 */
+	TCP_METRICS_ATTR_FOPEN_MSS,		/* u16 */
+	TCP_METRICS_ATTR_FOPEN_SYN_DROPS,	/* u16, count of drops */
+	TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,	/* msecs age */
+	TCP_METRICS_ATTR_FOPEN_COOKIE,		/* binary */
+
+	__TCP_METRICS_ATTR_MAX,
+};
+
+#define TCP_METRICS_ATTR_MAX	(__TCP_METRICS_ATTR_MAX - 1)
+
+enum {
+	TCP_METRICS_CMD_UNSPEC,
+	TCP_METRICS_CMD_GET,
+	TCP_METRICS_CMD_DEL,
+
+	__TCP_METRICS_CMD_MAX,
+};
+
+#define TCP_METRICS_CMD_MAX	(__TCP_METRICS_CMD_MAX - 1)
+
+#endif /* _LINUX_TCP_METRICS_H */
diff --git a/ip/Makefile b/ip/Makefile
index 3bc1516..dfe2e71 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -4,7 +4,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o \
     iplink_vlan.o link_veth.o link_gre.o iplink_can.o \
     iplink_macvlan.o iplink_macvtap.o ipl2tp.o link_vti.o \
-    iplink_vxlan.o
+    iplink_vxlan.o tcp_metrics.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/ip.c b/ip/ip.c
index df06d3e..e0f7e60 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -45,7 +45,7 @@ static void usage(void)
 "       ip [ -force ] -batch filename\n"
 "where  OBJECT := { link | addr | addrlabel | route | rule | neigh | ntable |\n"
 "                   tunnel | tuntap | maddr | mroute | mrule | monitor | xfrm |\n"
-"                   netns | l2tp }\n"
+"                   netns | l2tp | tcp_metrics }\n"
 "       OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n"
 "                    -f[amily] { inet | inet6 | ipx | dnet | bridge | link } |\n"
 "                    -l[oops] { maximum-addr-flush-attempts } |\n"
@@ -78,6 +78,8 @@ static const struct cmd {
 	{ "tunl",	do_iptunnel },
 	{ "tuntap",	do_iptuntap },
 	{ "tap",	do_iptuntap },
+	{ "tcpmetrics",	do_tcp_metrics },
+	{ "tcp_metrics",do_tcp_metrics },
 	{ "monitor",	do_ipmonitor },
 	{ "xfrm",	do_xfrm },
 	{ "mroute",	do_multiroute },
diff --git a/ip/ip_common.h b/ip/ip_common.h
index 5fa2cc0..2fd66b7 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -42,6 +42,7 @@ extern int do_multirule(int argc, char **argv);
 extern int do_netns(int argc, char **argv);
 extern int do_xfrm(int argc, char **argv);
 extern int do_ipl2tp(int argc, char **argv);
+extern int do_tcp_metrics(int argc, char **argv);
 
 static inline int rtm_get_table(struct rtmsg *r, struct rtattr **tb)
 {
diff --git a/ip/tcp_metrics.c b/ip/tcp_metrics.c
new file mode 100644
index 0000000..34e1d8e
--- /dev/null
+++ b/ip/tcp_metrics.c
@@ -0,0 +1,429 @@
+/*
+ * tcp_metrics.c	"ip tcp_metrics/tcpmetrics"
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		version 2 as published by the Free Software Foundation;
+ *
+ * Authors:	Julian Anastasov <ja@ssi.bg>, August 2012
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <sys/ioctl.h>
+#include <linux/if.h>
+
+#include <linux/genetlink.h>
+#include <linux/tcp_metrics.h>
+
+#include "utils.h"
+#include "ip_common.h"
+#include "libgenl.h"
+
+static void usage(void)	/* print the synopsis on stderr, then exit(-1) */
+{
+	fputs("Usage: ip tcp_metrics/tcpmetrics { COMMAND | help }\n"
+	      "       ip tcp_metrics { show | flush } SELECTOR\n"
+	      "       ip tcp_metrics delete [ address ] ADDRESS\n"
+	      "SELECTOR := [ [ address ] PREFIX ]\n", stderr);
+	exit(-1);
+}
+
+/* netlink socket */
+static struct rtnl_handle grth = { .fd = -1 };
+static int genl_family = -1;
+
+#define TCPM_REQUEST(_req, _bufsiz, _cmd, _flags) \
+	GENL_REQUEST(_req, _bufsiz, genl_family, 0, \
+		     TCP_METRICS_GENL_VERSION, _cmd, _flags)
+
+#define CMD_LIST	0x0001	/* list, lst, show		*/
+#define CMD_DEL		0x0002	/* delete, remove		*/
+#define CMD_FLUSH	0x0004	/* flush			*/
+
+static struct {
+	char	*name;
+	int	code;
+} cmds[] = {
+	{	"list",		CMD_LIST	},
+	{	"lst",		CMD_LIST	},
+	{	"show",		CMD_LIST	},
+	{	"delete",	CMD_DEL		},
+	{	"remove",	CMD_DEL		},
+	{	"flush",	CMD_FLUSH	},
+};
+
+static char *metric_name[TCP_METRIC_MAX + 1] = {
+	[TCP_METRIC_RTT]		= "rtt",
+	[TCP_METRIC_RTTVAR]		= "rttvar",
+	[TCP_METRIC_SSTHRESH]		= "ssthresh",
+	[TCP_METRIC_CWND]		= "cwnd",
+	[TCP_METRIC_REORDERING]		= "reordering",
+};
+
+static struct	/* filter and flush state shared by tcpm_do_cmd() and process_msg() */
+{
+	int flushed;	/* entries queued for deletion in the current flush round */
+	char *flushb;	/* buffer collecting DEL requests; set only while flushing */
+	int flushp;	/* number of bytes of flushb currently in use */
+	int flushe;	/* total size of flushb in bytes */
+	int cmd;	/* CMD_* code being executed */
+	inet_prefix addr;	/* selector address/prefix; bitlen < 0 when none was given */
+} f;
+
+static int flush_update(void)	/* send queued DEL requests; 0 = ok, -1 = error */
+{
+	if (rtnl_send_check(&grth, f.flushb, f.flushp) < 0) {
+		perror("Failed to send flush request");	/* perror() adds ": <err>\n" */
+		return -1;
+	}
+	f.flushp = 0;	/* everything queued so far has been sent */
+	return 0;
+}
+
+static int process_msg(const struct sockaddr_nl *who, struct nlmsghdr *n,
+		       void *arg)
+{
+	FILE *fp = (FILE *) arg;	/* stream the entry is printed to */
+	struct genlmsghdr *ghdr;
+	struct rtattr *attrs[TCP_METRICS_ATTR_MAX + 1], *a;
+	int len = n->nlmsg_len;
+	char abuf[256];
+	inet_prefix addr;
+	int family, i, atype;
+	/* Filter one reply; queue its deletion when flushing and/or print it. */
+	if (n->nlmsg_type != genl_family)
+		return -1;
+
+	len -= NLMSG_LENGTH(GENL_HDRLEN);
+	if (len < 0)
+		return -1;
+
+	ghdr = NLMSG_DATA(n);
+	if (ghdr->cmd != TCP_METRICS_CMD_GET)
+		return 0;
+
+	parse_rtattr(attrs, TCP_METRICS_ATTR_MAX, (void *) ghdr + GENL_HDRLEN,
+		     len);
+	/* The address is IPv4 or IPv6; entries of a non-selected family are skipped. */
+	a = attrs[TCP_METRICS_ATTR_ADDR_IPV4];
+	if (a) {
+		if (f.addr.family && f.addr.family != AF_INET)
+			return 0;
+		memcpy(&addr.data, RTA_DATA(a), 4);
+		addr.bytelen = 4;
+		family = AF_INET;
+		atype = TCP_METRICS_ATTR_ADDR_IPV4;
+	} else {
+		a = attrs[TCP_METRICS_ATTR_ADDR_IPV6];
+		if (a) {
+			if (f.addr.family && f.addr.family != AF_INET6)
+				return 0;
+			memcpy(&addr.data, RTA_DATA(a), 16);
+			addr.bytelen = 16;
+			family = AF_INET6;
+			atype = TCP_METRICS_ATTR_ADDR_IPV6;
+		} else
+			return 0;	/* no address attribute: nothing to show */
+	}
+	/* Skip the entry when inet_addr_match() returns non-zero for the selector. */
+	if (f.addr.family && f.addr.bitlen >= 0 &&
+	    inet_addr_match(&addr, &f.addr, f.addr.bitlen))
+		return 0;
+	/* Flushing: append a DEL request for this address to the flush buffer. */
+	if (f.flushb) {
+		struct nlmsghdr *fn;
+		TCPM_REQUEST(req2, 128, TCP_METRICS_CMD_DEL, NLM_F_REQUEST);
+
+		addattr_l(&req2.n, sizeof(req2), atype, &addr.data,
+			  addr.bytelen);
+		/* Not enough room left in the buffer: send what is queued first. */
+		if (NLMSG_ALIGN(f.flushp) + req2.n.nlmsg_len > f.flushe) {
+			if (flush_update())
+				return -1;
+		}
+		fn = (struct nlmsghdr *) (f.flushb + NLMSG_ALIGN(f.flushp));
+		memcpy(fn, &req2.n, req2.n.nlmsg_len);
+		fn->nlmsg_seq = ++grth.seq;
+		f.flushp = (((char *) fn) + req2.n.nlmsg_len) - f.flushb;
+		f.flushed++;
+		if (show_stats < 2)	/* flushed entries are printed only if show_stats >= 2 */
+			return 0;
+	}
+
+	if (f.cmd & (CMD_DEL | CMD_FLUSH))
+		fprintf(fp, "Deleted ");
+
+	fprintf(fp, "%s",
+		format_host(family, RTA_PAYLOAD(a), &addr.data,
+			    abuf, sizeof(abuf)));
+
+	a = attrs[TCP_METRICS_ATTR_AGE];
+	if (a) {
+		__u64 val = rta_getattr_u64(a);
+
+		fprintf(fp, " age %llu.%03llusec",
+			val / 1000, val % 1000);
+	}
+
+	a = attrs[TCP_METRICS_ATTR_TW_TS_STAMP];
+	if (a) {
+		__s32 val = (__s32) rta_getattr_u32(a);
+		__u32 tsval;
+
+		a = attrs[TCP_METRICS_ATTR_TW_TSVAL];
+		tsval = a ? rta_getattr_u32(a) : 0;	/* 0 when no TSVAL was sent */
+		fprintf(fp, " tw_ts %u/%dsec ago", tsval, val);
+	}
+
+	a = attrs[TCP_METRICS_ATTR_VALS];
+	if (a) {
+		struct rtattr *m[TCP_METRIC_MAX + 1 + 1];	/* nested type = metric index + 1 */
+
+		parse_rtattr_nested(m, TCP_METRIC_MAX + 1, a);
+
+		for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
+			__u32 val;
+
+			a = m[i + 1];
+			if (!a)
+				continue;
+			if (metric_name[i])
+				fprintf(fp, " %s ", metric_name[i]);
+			else
+				fprintf(fp, " metric_%d ", i);
+			val = rta_getattr_u32(a);
+			switch (i) {
+			case TCP_METRIC_RTT:	/* printed as (val * 1000) >> 3 with "us" suffix */
+				fprintf(fp, "%lluus", (val * 1000ULL) >> 3);
+				break;
+			case TCP_METRIC_RTTVAR:	/* printed as (val * 1000) >> 2 with "us" suffix */
+				fprintf(fp, "%lluus", (val * 1000ULL) >> 2);
+				break;
+			case TCP_METRIC_SSTHRESH:
+			case TCP_METRIC_CWND:
+			case TCP_METRIC_REORDERING:
+			default:
+				fprintf(fp, "%u", val);
+				break;
+			}
+		}
+	}
+
+	a = attrs[TCP_METRICS_ATTR_FOPEN_MSS];
+	if (a)
+		fprintf(fp, " fo_mss %u", rta_getattr_u16(a));
+
+	a = attrs[TCP_METRICS_ATTR_FOPEN_SYN_DROPS];
+	if (a) {
+		__u16 syn_loss = rta_getattr_u16(a);
+		__u64 ts;
+
+		a = attrs[TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS];
+		ts = a ? rta_getattr_u64(a) : 0;	/* 0 when no drop timestamp was sent */
+
+		fprintf(fp, " fo_syn_drops %u/%llu.%03llusec ago",
+			syn_loss, ts / 1000, ts % 1000);
+	}
+
+	a = attrs[TCP_METRICS_ATTR_FOPEN_COOKIE];
+	if (a) {
+		char cookie[32 + 1];	/* 16 bytes as hex digits plus the terminator */
+		unsigned char *ptr = RTA_DATA(a);
+		int i, max = RTA_PAYLOAD(a);	/* NOTE: this i shadows the outer i */
+
+		if (max > 16)	/* never format more than cookie[] can hold */
+			max = 16;
+		cookie[0] = 0;	/* stays an empty string if the payload is empty */
+		for (i = 0; i < max; i++)
+			sprintf(cookie + i + i, "%02x", ptr[i]);
+		fprintf(fp, " fo_cookie %s", cookie);
+	}
+
+	fprintf(fp, "\n");
+
+	fflush(fp);
+	return 0;
+}
+
+static int tcpm_do_cmd(int cmd, int argc, char **argv)
+{
+	TCPM_REQUEST(req, 1024, TCP_METRICS_CMD_GET, NLM_F_REQUEST);
+	int atype = -1;	/* address attribute type; stays < 0 without an exact address */
+	int ack;
+	/* Parse the selector in argv, then run the CMD_* action; < 0 on error. */
+	memset(&f, 0, sizeof(f));
+	f.addr.bitlen = -1;
+	f.addr.family = preferred_family;
+
+	switch (preferred_family) {
+	case AF_UNSPEC:
+	case AF_INET:
+	case AF_INET6:
+		break;
+	default:
+		fprintf(stderr, "Unsupported family:%d\n", preferred_family);
+		return -1;
+	}
+	/* Arguments are [ addr | address ] PREFIX; the loop body steps past each one. */
+	while (argc > 0) {
+		char *who = "address";
+
+		if (strcmp(*argv, "addr") == 0 ||
+		    strcmp(*argv, "address") == 0) {
+			who = *argv;
+			NEXT_ARG();
+		}
+		if (matches(*argv, "help") == 0)
+			usage();
+		if (f.addr.bitlen >= 0)	/* a second address is a duplicate argument */
+			duparg2(who, *argv);
+
+		get_prefix(&f.addr, *argv, preferred_family);
+		if (f.addr.bytelen && f.addr.bytelen * 8 == f.addr.bitlen) {
+			if (f.addr.family == AF_INET)
+				atype = TCP_METRICS_ATTR_ADDR_IPV4;
+			else if (f.addr.family == AF_INET6)
+				atype = TCP_METRICS_ATTR_ADDR_IPV6;
+		}
+		if ((CMD_DEL & cmd) && atype < 0) {
+			fprintf(stderr, "Error: a specific IP address is expected rather than \"%s\"\n",
+				*argv);
+			return -1;
+		}
+
+		argc--; argv++;	/* step past the PREFIX that was just parsed */
+	}
+
+	if (cmd == CMD_DEL && atype < 0)
+		missarg("address");
+
+	/* flush for exact address ? Single del */
+	if (cmd == CMD_FLUSH && atype >= 0)
+		cmd = CMD_DEL;
+
+	/* flush for all addresses ? Single del without address */
+	if (cmd == CMD_FLUSH && f.addr.bitlen <= 0 &&
+	    preferred_family == AF_UNSPEC) {
+		cmd = CMD_DEL;
+		req.g.cmd = TCP_METRICS_CMD_DEL;
+		ack = 1;
+	} else if (cmd == CMD_DEL) {
+		req.g.cmd = TCP_METRICS_CMD_DEL;
+		ack = 1;
+	} else {	/* CMD_FLUSH, CMD_LIST */
+		ack = 0;
+	}
+
+	if (genl_family < 0) {
+		if (rtnl_open_byproto(&grth, 0, NETLINK_GENERIC) < 0) {
+			fprintf(stderr, "Cannot open generic netlink socket\n");
+			exit(1);
+		}
+		genl_family = genl_resolve_family(&grth,
+						  TCP_METRICS_GENL_NAME);
+		if (genl_family < 0)
+			exit(1);
+		req.n.nlmsg_type = genl_family;
+	}
+	/* Exact address or plain delete: one request; everything else is a dump. */
+	if (!(cmd & CMD_FLUSH) && (atype >= 0 || (cmd & CMD_DEL))) {
+		if (ack)
+			req.n.nlmsg_flags |= NLM_F_ACK;
+		if (atype >= 0)
+			addattr_l(&req.n, sizeof(req), atype, &f.addr.data,
+				  f.addr.bytelen);
+	} else {
+		req.n.nlmsg_flags |= NLM_F_DUMP;
+	}
+
+	f.cmd = cmd;
+	if (cmd & CMD_FLUSH) {
+		int round = 0;
+		char flushb[4096-512];
+
+		f.flushb = flushb;
+		f.flushp = 0;
+		f.flushe = sizeof(flushb);
+		/* Dump and delete in rounds until a dump queues no more deletions. */
+		for (;;) {
+			req.n.nlmsg_seq = grth.dump = ++grth.seq;
+			if (rtnl_send(&grth, &req, req.n.nlmsg_len) < 0) {
+				perror("Failed to send flush request");
+				exit(1);
+			}
+			f.flushed = 0;
+			if (rtnl_dump_filter(&grth, process_msg, stdout) < 0) {
+				fprintf(stderr, "Flush terminated\n");
+				exit(1);
+			}
+			if (f.flushed == 0) {
+				if (round == 0) {
+					fprintf(stderr, "Nothing to flush.\n");
+				} else if (show_stats)
+					printf("*** Flush is complete after %d round%s ***\n",
+					       round, round > 1 ? "s" : "");
+				fflush(stdout);
+				return 0;
+			}
+			round++;
+			if (flush_update() < 0)
+				exit(1);
+			if (show_stats) {
+				printf("\n*** Round %d, deleting %d entries ***\n",
+				       round, f.flushed);
+				fflush(stdout);
+			}
+		}
+		return 0;
+	}
+	/* Not flushing: delete with ACK, get a single entry, or dump all entries. */
+	if (ack) {
+		if (rtnl_talk(&grth, &req.n, 0, 0, NULL) < 0)
+			return -2;
+	} else if (atype >= 0) {
+		if (rtnl_talk(&grth, &req.n, 0, 0, &req.n) < 0)
+			return -2;
+		if (process_msg(NULL, &req.n, stdout) < 0) {
+			fprintf(stderr, "Dump terminated\n");
+			exit(1);
+		}
+	} else {
+		req.n.nlmsg_seq = grth.dump = ++grth.seq;
+		if (rtnl_send(&grth, &req, req.n.nlmsg_len) < 0) {
+			perror("Failed to send dump request");
+			exit(1);
+		}
+
+		if (rtnl_dump_filter(&grth, process_msg, stdout) < 0) {
+			fprintf(stderr, "Dump terminated\n");
+			exit(1);
+		}
+	}
+	return 0;
+}
+
+/* Entry point for "ip tcp_metrics": dispatch on the sub-command name. */
+int do_tcp_metrics(int argc, char **argv)
+{
+	int idx;
+
+	if (argc < 1)	/* no sub-command given: list the entries */
+		return tcpm_do_cmd(CMD_LIST, 0, NULL);
+	for (idx = 0; idx < ARRAY_SIZE(cmds); idx++)
+		if (!matches(*argv, cmds[idx].name))
+			return tcpm_do_cmd(cmds[idx].code, argc - 1, &argv[1]);
+	if (!matches(*argv, "help"))
+		usage();
+
+	fprintf(stderr, "Command \"%s\" is unknown, "
+			"try \"ip tcp_metrics help\".\n", argv[0]);
+	exit(-1);
+}
+
diff --git a/man/man8/Makefile b/man/man8/Makefile
index 4ed3eab..aaf1729 100644
--- a/man/man8/Makefile
+++ b/man/man8/Makefile
@@ -8,7 +8,8 @@ MAN8PAGES = $(TARGETS) ip.8 arpd.8 lnstat.8 routel.8 rtacct.8 rtmon.8 ss.8 \
 	bridge.8 rtstat.8 ctstat.8 nstat.8 routef.8 \
 	ip-address.8 ip-addrlabel.8 ip-l2tp.8 ip-link.8 \
 	ip-maddress.8 ip-monitor.8 ip-mroute.8 ip-neighbour.8 \
-	ip-netns.8 ip-ntable.8 ip-route.8 ip-rule.8 ip-tunnel.8 ip-xfrm.8
+	ip-netns.8 ip-ntable.8 ip-route.8 ip-rule.8 ip-tunnel.8 ip-xfrm.8 \
+	ip-tcp_metrics.8
 
 all: $(TARGETS)
 
diff --git a/man/man8/ip-tcp_metrics.8 b/man/man8/ip-tcp_metrics.8
new file mode 100644
index 0000000..5d2dac8
--- /dev/null
+++ b/man/man8/ip-tcp_metrics.8
@@ -0,0 +1,143 @@
+.TH "IP\-TCP_METRICS" 8 "23 Aug 2012" "iproute2" "Linux"
+.SH "NAME"
+ip-tcp_metrics \- management for TCP Metrics
+.SH "SYNOPSIS"
+.sp
+.ad l
+.in +8
+.ti -8
+.B ip
+.RI "[ " OPTIONS " ]"
+.B tcp_metrics
+.RI "{ " COMMAND " | "
+.BR help " }"
+.sp
+
+.ti -8
+.BR "ip tcp_metrics" " { " show " | " flush " }"
+.IR SELECTOR
+
+.ti -8
+.BR "ip tcp_metrics delete " [ " address " ]
+.IR ADDRESS
+
+.ti -8
+.IR SELECTOR " := "
+.RB "[ [ " address " ] "
+.IR PREFIX " ]"
+
+.SH "DESCRIPTION"
+.B ip tcp_metrics
+is used to manipulate entries in the kernel that keep TCP information
+for IPv4 and IPv6 destinations. The entries are created when
+TCP sockets want to share information for destinations and are
+stored in a cache keyed by the destination address. The saved
+information may include values for metrics (initially obtained from
+routes), recent TSVAL for TIME-WAIT recycling purposes, state for the
+Fast Open feature, etc.
+For performance reasons the cache cannot grow above the configured limit
+and the older entries are replaced with fresh information, sometimes
+reclaimed and used for new destinations. The kernel never removes
+entries; they can only be flushed with this tool.
+
+.SS ip tcp_metrics show - show cached entries
+
+.TP
+.BI address " PREFIX " (default)
+IPv4/IPv6 prefix or address. If no prefix is provided all entries are shown.
+
+.LP
+The output may contain the following information:
+
+.BI age " <S.MMM>" sec
+- time since the entry was created, reset or updated with metrics
+from sockets. The entry is reset and refreshed on use with metrics from
+the route if the metrics were not updated in the last hour. Not all cached
+values reset the age on update.
+
+.BI cwnd " <N>"
+- CWND metric value
+
+.BI fo_cookie " <HEX-STRING>"
+- Cookie value received in SYN-ACK to be used by Fast Open for next SYNs
+
+.BI fo_mss " <N>"
+- MSS value received in SYN-ACK to be used by Fast Open for next SYNs
+
+.BI fo_syn_drops " <N>/<S.MMM>" "sec ago"
+- Number of drops of initial outgoing Fast Open SYNs with data
+detected by monitoring the received SYN-ACK after SYN retransmission.
+The seconds show the time after last SYN drop and together with
+the drop count can be used to disable Fast Open for some time.
+
+.BI reordering " <N>"
+- Reordering metric value
+
+.BI rtt " <N>" us
+- RTT metric value
+
+.BI rttvar " <N>" us
+- RTTVAR metric value
+
+.BI ssthresh " <SSTHRESH>"
+- SSTHRESH metric value
+
+.BI tw_ts " <TSVAL>/<SEC>" "sec ago"
+- recent TSVAL and the seconds after saving it into TIME-WAIT socket
+
+.SS ip tcp_metrics delete - delete single entry
+
+.TP
+.BI address " ADDRESS " (default)
+IPv4/IPv6 address. The address is a required argument.
+
+.SS ip tcp_metrics flush - flush entries
+This command flushes the entries selected by some criteria.
+
+.PP
+This command has the same arguments as
+.B show.
+
+.SH "EXAMPLES"
+.PP
+ip tcp_metrics show address 192.168.0.0/24
+.RS 4
+Shows the entries for destinations from subnet
+.RE
+.PP
+ip tcp_metrics show 192.168.0.0/24
+.RS 4
+The same but address keyword is optional
+.RE
+.PP
+ip tcp_metrics
+.RS 4
+Show all is the default action
+.RE
+.PP
+ip tcp_metrics delete 192.168.0.1
+.RS 4
+Removes the entry for 192.168.0.1 from cache.
+.RE
+.PP
+ip tcp_metrics flush 192.168.0.0/24
+.RS 4
+Removes entries for destinations from subnet
+.RE
+.PP
+ip tcp_metrics flush all
+.RS 4
+Removes all entries from cache
+.RE
+.PP
+ip -6 tcp_metrics flush all
+.RS 4
+Removes all IPv6 entries from cache keeping the IPv4 entries.
+.RE
+
+.SH SEE ALSO
+.br
+.BR ip (8)
+
+.SH AUTHOR
+Original Manpage by Julian Anastasov <ja@ssi.bg>
diff --git a/man/man8/ip.8 b/man/man8/ip.8
index 4db8a67..9063049 100644
--- a/man/man8/ip.8
+++ b/man/man8/ip.8
@@ -15,7 +15,7 @@ ip \- show / manipulate routing, devices, policy routing and tunnels
 .IR OBJECT " := { "
 .BR link " | " addr " | " addrlabel " | " route " | " rule " | " neigh " | "\
  ntable " | " tunnel " | " tuntap " | " maddr " | "  mroute " | " mrule " | "\
- monitor " | " xfrm " | " netns " | "  l2tp " }"
+ monitor " | " xfrm " | " netns " | "  l2tp " | "  tcp_metrics " }"
 .sp
 
 .ti -8
@@ -161,6 +161,10 @@ host addresses.
 - rule in routing policy database.
 
 .TP
+.B tcp_metrics/tcpmetrics
+- manage TCP Metrics
+
+.TP
 .B tunnel
 - tunnel over IP.
 
@@ -220,6 +224,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2.
 .BR ip-ntable (8),
 .BR ip-route (8),
 .BR ip-rule (8),
+.BR ip-tcp_metrics (8),
 .BR ip-tunnel (8),
 .BR ip-xfrm (8)
 .br
-- 
1.7.3.4

^ permalink raw reply related

* Re: [PATCH 1/2] Fix build error caused by broken PCH_PTP module dependency.
From: Ben Hutchings @ 2012-10-03 21:45 UTC (permalink / raw)
  To: David Miller; +Cc: haicheng.li, netdev, tshimizu818, linux-kernel, haicheng.lee
In-Reply-To: <20121002.222238.1580734803370802133.davem@davemloft.net>

On Tue, 2012-10-02 at 22:22 -0400, David Miller wrote:
> From: Haicheng Li <haicheng.li@linux.intel.com>
> Date: Fri, 28 Sep 2012 14:57:38 +0800
> 
> > On 09/28/2012 02:46 PM, David Miller wrote:
> >> From: Haicheng Li<haicheng.li@linux.intel.com>
> >> Date: Fri, 28 Sep 2012 14:41:43 +0800
> >>
> >>> On 09/28/2012 06:09 AM, David Miller wrote:
> >>>> Look at how other people submit patches, do any other patch
> >>>> submissions
> >>>> look like your's having all of this metadata in the message body:
> >>> I'm sorry for it.
> >>>
> >>>> As for this specific patch:
> >>>>
> >>>>> -	depends on PTP_1588_CLOCK_PCH
> >>>>> +	depends on PTP_1588_CLOCK_PCH = PCH_GBE
> >>>>
> >>>> This is not the correct way to ensure that the module'ness of one
> >>>> config option meets the module'ness requirements of another.
> >>>> The correct way is to say something like "&&   (PCH_GBE || PCH_GBE=n)"
> >>>
> >>> This case is a little bit tricky than usual, with PCH_PTP selected,
> >>> the valid config would be either "PTP_1588_CLOCK_PCH=PCH_GBE=m" or
> >>> "PTP_1588_CLOCK_PCH=PCH_GBE=y", and PTP_1588_CLOCK_PCH depends on
> >>> PCH_GBE.
> >>
> >> And a simple "&&  PCH_GBE" should accomplish this, no?
> > No sir. it's actually same with the original Kconfig (by a if
> > PCH_GBE"), it just failed with this config:
> > 
> >         CONFIG_PCH_GBE=y
> >         CONFIG_PCH_PTP=y
> >         CONFIG_PTP_1588_CLOCK=m
> 
> The correct fix is to make the Kconfig entry for PCH_PTP use
> a "select PTP_1588_CLOCK" instead of "depends PTP_1588_CLOCK"
> 
> I'll apply this fix.
> 
> There is another, extremely convoluted, way to do this, which is
> what the SFC driver does which is:
> 
> depends on SFC && PTP_1588_CLOCK && !(SFC=y && PTP_1588_CLOCK=m)
> 
> but that looks horrible to me.

I thought of it as being a peripheral feature (which most Solarflare
hardware doesn't implement) so it made sense for SFC_PTP to be optional
like SFC_MTD and so on.  But I'm quite happy to use a select instead, if
you want that to be the convention for all drivers implementing PHC.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: You have to fix this
From: Randy Dunlap @ 2012-10-03 21:17 UTC (permalink / raw)
  To: Vipul Pandya; +Cc: David Miller, netdev@vger.kernel.org
In-Reply-To: <506C0F75.4080309@chelsio.com>

On 10/03/2012 03:12 AM, Vipul Pandya wrote:

> 
> 
> On 28-09-2012 23:13, David Miller wrote:
>> From: Vipul Pandya <vipul@chelsio.com>
>> Date: Fri, 28 Sep 2012 17:29:22 +0530
>>
>>> Please let me know how else would I get above warning message?
>>
>> Maybe your compiler is too old.  Does -Wframe-larger-than= show up in your
>> build logs with "make V=1"?
>>
> 
> I am using gcc version 4.4.4 20100726. Yes -Wframe-larger-than=2048 does
> show up in my build logs as shown below:
> ===
> gcc -Wp,-MD,arch/x86/kernel/.hw_breakpoint.o.d  -nostdinc -isystem
> /usr/lib/gcc/x86_64-redhat-linux/4.4.4/include
> -I/root/git_kernel_tree/net-next/arch/x86/include
> -Iarch/x86/include/generated -Iinclude  -include
> /root/git_kernel_tree/net-next/include/linux/kconfig.h -D__KERNEL__
> -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing
> -fno-common -Werror-implicit-function-declaration -Wno-format-security
> -fno-delete-null-pointer-checks -O2 -m64 -mtune=generic -mno-red-zone
> -mcmodel=kernel -funit-at-a-time -maccumulate-outgoing-args
> -DCONFIG_AS_CFI=1 -DCONFIG_AS_CFI_SIGNAL_FRAME=1
> -DCONFIG_AS_CFI_SECTIONS=1 -DCONFIG_AS_FXSAVEQ=1 -DCONFIG_AS_AVX=1 -pipe
> -Wno-sign-compare -fno-asynchronous-unwind-tables -mno-sse -mno-mmx
> -mno-sse2 -mno-3dnow -mno-avx -Wframe-larger-than=2048
> -fno-stack-protector -Wno-unused-but-set-variable -fomit-frame-pointer
> -Wdeclaration-after-statement -Wno-pointer-sign -fno-strict-overflow
> -fconserve-stack -DCC_HAVE_ASM_GOTO    -D"KBUILD_STR(s)=#s"
> -D"KBUILD_BASENAME=KBUILD_STR(hw_breakpoint)"
> -D"KBUILD_MODNAME=KBUILD_STR(hw_breakpoint)" -c -o
> arch/x86/kernel/hw_breakpoint.o arch/x86/kernel/hw_breakpoint.c
> ===


Regardless, you could just go ahead and fix it and use
scripts/checkstack.pl (or make checkstack) to check it for stack
usage as well as test the binary code to make sure that it works OK.

-- 
~Randy

^ permalink raw reply

* Re: RED tc qdisc not dropping?
From: Eric Dumazet @ 2012-10-03 21:12 UTC (permalink / raw)
  To: Ken Savage; +Cc: netdev
In-Reply-To: <986427880.11879793.1349295290592.JavaMail.root@cds046>

On Wed, 2012-10-03 at 14:14 -0600, Ken Savage wrote:
> Hi there,
> 
> I'm running openSUSE 12.2, using the machine as a router/WANsim device.
> 
> Previously, I was running an older CentOS installation with a 2.6 kernel,
> and my tc-red commands ran just fine, and imposed some bandwidth constraint
> to the packets upon egress.  In 3.4.6, this doesn't seem to be the case
> any longer.
> 
> Without any restrictions, there would be 25-30Mbps of traffic flowing out
> the interface -- this is to give you a sense of the data rate.
> 
> 
> Now, this said, I did notice that the latest RED code has the 'harddrop'
> option that I didn't have under CentOS with kernel 2.6.  So in my attempt
> to see ANYTHING happening with 3.4.6, I entered:
> 
> tc qdisc add dev eth0 root red limit 40000 min 3000 max 9000 avpkt 1000 burst 5 harddrop probability 1
> 
> 
> Issuing 'tc -s -d qdisc show dev eth0', I obtain:
> 
> qdisc red 8006: root refcnt 2 limit 40000b min 3000b max 9000b harddrop ewma 2 probability 0.73242 Scell_log 12
>  Sent 254028472 bytes 207494 pkt (dropped 0, overlimits 0 requeues 0)
>  backlog 0b 0p requeues 0
>   marked 0 early 0 pdrop 0 other 0
> 
> 
> All those zeroes seem a little amiss to me  ;)
> 

Not sure I understand...

Why RED should drop a packet if there is no backlog ?

if you NIC has Gigabit speed, RED will allow Gigabit speed as well.

^ permalink raw reply

* Re: Possible networking regression in 3.6.0
From: Julian Anastasov @ 2012-10-03 20:57 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, chris2553, netdev, gpiez, davej
In-Reply-To: <20121002.231037.581571797430134988.davem@davemloft.net>


	Hello,

On Tue, 2 Oct 2012, David Miller wrote:

> From: Julian Anastasov <ja@ssi.bg>
> Date: Wed, 3 Oct 2012 02:24:53 +0300 (EEST)
> 
> > 	Can it be a problem related to fib_info reuse
> > from different routes. For example, when local IP address
> > is created for subnet we have:
> > 
> > broadcast 192.168.0.255 dev DEV  proto kernel  scope link  src 192.168.0.1
> > 192.168.0.0/24 dev DEV  proto kernel  scope link  src 192.168.0.1
> > local 192.168.0.1 dev DEV  proto kernel  scope host  src 192.168.0.1
> > 
> > 	The "dev DEV  proto kernel  scope link  src 192.168.0.1" is
> > a reused fib_info structure where we put cached routes.
> > The result can be same fib_info for 192.168.0.255 and
> > 192.168.0.0/24. RTN_BROADCAST is cached only for input
> > routes. Incoming broadcast to 192.168.0.255 can be cached
> > and can cause problems for traffic forwarded to 192.168.0.0/24.
> > So, this patch should solve the problem because it
> > separates the broadcast from unicast traffic.
> 
> Now I understand the problem.
> 
> I think the way to fix this is to add cfg->fc_type as another
> thing that fib_info objects are key'd by.
> 
> I think it also would fix your obscure output multicast case too.

	Agreed. I don't see problem with this idea.
It will avoid confusions with rt_type.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply

* Re: [PATCH 5/5] ixgbe: add driver set_max_vfs support
From: Yinghai Lu @ 2012-10-03 20:37 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Bjorn Helgaas, Greg Kroah-Hartman, linux-pci, linux-kernel,
	Don Dutile, yuvalmin, bhutchings, gregory.v.rose, davem,
	Jeff Kirsher, Jesse Brandeburg, John Fastabend, e1000-devel,
	netdev
In-Reply-To: <506C8837.5070902@intel.com>

On Wed, Oct 3, 2012 at 11:47 AM, Alexander Duyck
<alexander.h.duyck@intel.com> wrote:
> The ixgbe_set_max_vfs function has several issues.  The two big ones are
> that this function assumes it can just enable/disable SR-IOV without any
> other changes being necessary which is not the case.  I would recommend
> looking at ixgbe_setup_tc for how to do this properly.  Secondly is the
> fact that this code will change the PF network device and as such
> sections of the code should be called with the RTNL lock held.  In
> addition I believe you have to disable SR-IOV before enabling it again
> with a different number of VFs.

yes, agreed.

>
> Below is a link to one of the early patches for igb when we were first
> introducing SR-IOV, and the in-driver sysfs value had been rejected.  I
> figure it might be useful as it was also using sysfs to enable/disable
> VFs.  It however doesn't have the correct locking on changing the queues
> and as such will likely throw an error if you were to implement it the
> same way now:
> http://lists.openwall.net/netdev/2009/04/08/34

yes, that is almost there if put that in-driver value to per device
value and ops.

Thanks

Yinghai

^ permalink raw reply

* Re: [PATCH net 1/1] bnx2x: fix ring size for 10G functions
From: David Miller @ 2012-10-03 20:36 UTC (permalink / raw)
  To: yuvalmin; +Cc: netdev, ariele, eilong
In-Reply-To: <1349274179-18827-1-git-send-email-yuvalmin@broadcom.com>

From: "Yuval Mintz" <yuvalmin@broadcom.com>
Date: Wed, 3 Oct 2012 16:22:59 +0200

> Commit d760fc37b0f74502b3f748951f22c6683b079a8e caused 
> 1G functions to allocate rx rings which were 1/10 of the 
> size of 10G functions' rx rings.
> 
> However, it also caused 10G functions on 5771x boards to 
> allocate small rings, which limits their possible (default) 
> rx throughput. This patch causes all 10G functions to use 
> rings of intended length by default.
> 
> Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
> Signed-off-by: Ariel Elior <ariele@broadcom.com>
> Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

Applied.

^ permalink raw reply

* Re: [PATCH] cxgb4: Dynamically allocate memory in t4_memory_rw() and get_vpd_params()
From: David Miller @ 2012-10-03 20:35 UTC (permalink / raw)
  To: vipul; +Cc: netdev, divy, dm, leedom, felix, jay
In-Reply-To: <1349270552-6678-1-git-send-email-vipul@chelsio.com>

From: Vipul Pandya <vipul@chelsio.com>
Date: Wed,  3 Oct 2012 18:52:32 +0530

> This patch changes memory allocation to reduce stack footprint
> 
> Signed-off-by: Jay Hernandez <jay@chelsio.com>
> Signed-off-by: Vipul Pandya <vipul@chelsio.com>

Applied.

^ permalink raw reply

* Re: [RFC PATCH 1/2] sctp: fix a typo in prototype of __sctp_rcv_lookup()
From: David Miller @ 2012-10-03 20:28 UTC (permalink / raw)
  To: nicolas.dichtel; +Cc: linux-sctp, vyasevich, netdev
In-Reply-To: <1349279002-4008-1-git-send-email-nicolas.dichtel@6wind.com>

From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Wed,  3 Oct 2012 17:43:21 +0200

> Just to avoid confusion when people only reads this prototype.
> 
> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>

I think we should apply this one, regardless of what happens
to patch #2 in this series.

^ permalink raw reply

* Re: drivers/net/cris/eth_v10.c:1715:2: error: too many arguments to function 'e100rxtx_interrupt'
From: Jesper Nilsson @ 2012-10-03 20:26 UTC (permalink / raw)
  To: David Miller
  Cc: fengguang.wu@intel.com, kernel-janitors@vger.kernel.org,
	netdev@vger.kernel.org
In-Reply-To: <20121003.144040.1719867946304137902.davem@davemloft.net>

On Wed, Oct 03, 2012 at 08:40:40PM +0200, David Miller wrote:
> From: Jesper Nilsson <jesper.nilsson@axis.com>
> Date: Wed, 3 Oct 2012 12:46:52 +0200
> 
> > On Fri, Sep 28, 2012 at 03:06:08PM +0200, Fengguang Wu wrote:
> >> Hi Jesper,
> > 
> > Hi!
> > 
> >> FYI, a rather old build bug that's introduced by
> >> 
> >> bafef0a cris build fixes: update eth_v10.c ethernet driver
> >> 
> >> All error/warnings:
> >> 
> >> drivers/net/cris/eth_v10.c: In function 'e100_netpoll':
> >> drivers/net/cris/eth_v10.c:1715:2: error: too many arguments to function 'e100rxtx_interrupt'
> >> drivers/net/cris/eth_v10.c:1131:1: note: declared here
> > 
> > Yep, I can't figure out why the followup patches never reached mainline,
> > but we have fixes for exactly that in our in-house tree.
> > I'll push some more patches after this merge window.
> 
> It's a bug fix, even worse a build fix, why wait until after the merge
> window?

Aye, true, I'll just have to make sure I don't get any other change from
the inhouse tree.

/^JN - Jesper Nilsson
-- 
               Jesper Nilsson -- jesper.nilsson@axis.com

^ permalink raw reply

* Re: [PATCH 1/20] drivers/net/ethernet/dec/tulip/dmfe.c: fix error return code
From: Grant Grundler @ 2012-10-03 20:16 UTC (permalink / raw)
  To: Peter Senna Tschudin
  Cc: Grant Grundler, kernel-janitors, netdev, linux-kernel
In-Reply-To: <1349281090-10013-1-git-send-email-peter.senna@gmail.com>

On Wed, Oct 3, 2012 at 9:17 AM, Peter Senna Tschudin
<peter.senna@gmail.com> wrote:
> From: Peter Senna Tschudin <peter.senna@gmail.com>
>
> Convert a nonnegative error return code to a negative one, as returned
> elsewhere in the function.
>
> A simplified version of the semantic match that finds this problem is as
> follows: (http://coccinelle.lip6.fr/)
>
> // <smpl>
> (
> if@p1 (\(ret < 0\|ret != 0\))
>  { ... return ret; }
> |
> ret@p1 = 0
> )
> ... when != ret = e1
>     when != &ret
> *if(...)
> {
>   ... when != ret = e2
>       when forall
>  return ret;
> }
> // </smpl>
>
> Signed-off-by: Peter Senna Tschudin <peter.senna@gmail.com>

Thanks! Looks good to me.

Acked-by: Grant Grundler <grundler@parisc-linux.org>

cheers,
grant

>
> ---
>  drivers/net/ethernet/dec/tulip/dmfe.c |   12 +++++++++---
>  1 file changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/dec/tulip/dmfe.c b/drivers/net/ethernet/dec/tulip/dmfe.c
> index 4d6fe60..d23755e 100644
> --- a/drivers/net/ethernet/dec/tulip/dmfe.c
> +++ b/drivers/net/ethernet/dec/tulip/dmfe.c
> @@ -446,13 +446,17 @@ static int __devinit dmfe_init_one (struct pci_dev *pdev,
>         /* Allocate Tx/Rx descriptor memory */
>         db->desc_pool_ptr = pci_alloc_consistent(pdev, sizeof(struct tx_desc) *
>                         DESC_ALL_CNT + 0x20, &db->desc_pool_dma_ptr);
> -       if (!db->desc_pool_ptr)
> +       if (!db->desc_pool_ptr) {
> +               err = -ENOMEM;
>                 goto err_out_res;
> +       }
>
>         db->buf_pool_ptr = pci_alloc_consistent(pdev, TX_BUF_ALLOC *
>                         TX_DESC_CNT + 4, &db->buf_pool_dma_ptr);
> -       if (!db->buf_pool_ptr)
> +       if (!db->buf_pool_ptr) {
> +               err = -ENOMEM;
>                 goto err_out_free_desc;
> +       }
>
>         db->first_tx_desc = (struct tx_desc *) db->desc_pool_ptr;
>         db->first_tx_desc_dma = db->desc_pool_dma_ptr;
> @@ -462,8 +466,10 @@ static int __devinit dmfe_init_one (struct pci_dev *pdev,
>         db->chip_id = ent->driver_data;
>         /* IO type range. */
>         db->ioaddr = pci_iomap(pdev, 0, 0);
> -       if (!db->ioaddr)
> +       if (!db->ioaddr) {
> +               err = -ENOMEM;
>                 goto err_out_free_buf;
> +       }
>
>         db->chip_revision = pdev->revision;
>         db->wol_mode = 0;
>

^ permalink raw reply

* RED tc qdisc not dropping?
From: Ken Savage @ 2012-10-03 20:14 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1643425136.11868012.1349294650298.JavaMail.root@cds046>

Hi there,

I'm running openSUSE 12.2, using the machine as a router/WANsim device.

Previously, I was running an older CentOS installation with a 2.6 kernel,
and my tc-red commands ran just fine, and imposed some bandwidth constraint
to the packets upon egress.  In 3.4.6, this doesn't seem to be the case
any longer.

Without any restrictions, there would be 25-30Mbps of traffic flowing out
the interface -- this is to give you a sense of the data rate.

Now, this said, I did notice that the latest RED code has the 'harddrop'
option that I didn't have under CentOS with kernel 2.6.  So in my attempt
to see ANYTHING happening with 3.4.6, I entered:

tc qdisc add dev eth0 root red limit 40000 min 3000 max 9000 avpkt 1000 burst 5 harddrop probability 1

Issuing 'tc -s -d qdisc show dev eth0', I obtain:

qdisc red 8006: root refcnt 2 limit 40000b min 3000b max 9000b harddrop ewma 2 probability 0.73242 Scell_log 12
 Sent 254028472 bytes 207494 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
  marked 0 early 0 pdrop 0 other 0

All those zeroes seem a little amiss to me  ;)

Any ideas?

Thanks,

Ken Savage
kens1835@shaw.ca

^ permalink raw reply

* Re: [PATCH 4/5] ucc_geth: Increase RX ring buffer from 32 to 64
From: Joakim Tjernlund @ 2012-10-03 20:13 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20121003.153855.325707079634367524.davem@davemloft.net>

David Miller <davem@davemloft.net> wrote on 2012/10/03 21:38:55:
>
> From: Joakim Tjernlund <joakim.tjernlund@transmode.se>
> Date: Wed, 3 Oct 2012 15:17:58 +0200
>
> > Ping? Got no comments and I can't see it in net or net-next trees either.
>
> No patch will be applied to the tree when one of the individual patches
> get feedback and request changes from you.
>
> When an individual patch must be redone, you must resend the entire
> series not just the individual patch which changed.

oh, I see what I did wrong. I sent them as series which may have internal dependencies.
However, each patch can be applied on its own.

>
> You also never need to ask the kind of question you are asking here,
> you simply need to look into patch work to see what the state of
> your patches is:
>
> I maintain this state exactly so people don't need to waste precious
> developer time asking "what is the state of my patch" like you are
> making me do right here.
>
> See:
>
> http://patchwork.ozlabs.org/project/netdev/list/?state=*&q=ucc_geth

Ahh, better bookmark this page for future use.

>
> And as you can see this entire series is marked as either "RFC"
> or "Changes Requested"

Yes, I see that now.
I guess I should resend each patch separately then?
(Possibly wait until I have figured out the lockless TX stuff)

>
> Even worse, you did this for not one but several of the patches you
> posted.

Sorry about that.

 Jocke

^ permalink raw reply

* Re: [PATCH 4/5] ucc_geth: Increase RX ring buffer from 32 to 64
From: David Miller @ 2012-10-03 19:38 UTC (permalink / raw)
  To: joakim.tjernlund; +Cc: netdev
In-Reply-To: <OF4BE93D25.4AE817F3-ONC1257A8C.004909F7-C1257A8C.00490EAC@transmode.se>

From: Joakim Tjernlund <joakim.tjernlund@transmode.se>
Date: Wed, 3 Oct 2012 15:17:58 +0200

> Ping? Got no comments and I can't see it in net or net-next trees either.

No patch will be applied to the tree when one of the individual patches
get feedback and request changes from you.

When an individual patch must be redone, you must resend the entire
series not just the individual patch which changed.

You also never need to ask the kind of question you are asking here,
you simply need to look into patch work to see what the state of
your patches is:

I maintain this state exactly so people don't need to waste precious
developer time asking "what is the state of my patch" like you are
making me do right here.

See:

http://patchwork.ozlabs.org/project/netdev/list/?state=*&q=ucc_geth

And as you can see this entire series is marked as either "RFC"
or "Changes Requested"

Even worse, you did this for not one but several of the patches you
posted.

^ permalink raw reply

* Re: [PATCH] udp: increment UDP_MIB_NOPORTS in mcast receive
From: David Miller @ 2012-10-03 19:30 UTC (permalink / raw)
  To: dlstevens; +Cc: eric.dumazet, chris2553, davej, gpiez, ja, netdev, netdev-owner
In-Reply-To: <OFFFC99DDA.1BD04FA3-ON85257A8C.005A8E9B-85257A8C.00604444@us.ibm.com>

From: David Stevens <dlstevens@us.ibm.com>
Date: Wed, 3 Oct 2012 13:31:30 -0400

> Eric Dumazet <eric.dumazet@gmail.com> wrote on 10/03/2012 11:29:13 AM:
> 
>> >         Of course. I think our difference is on the definition of 
>> > "receives".
>> 
>> A receive is a packet delivered to this host.
>> Interface being promiscuous or not doesnt really matter.
> 
>         A receive is a packet *addressed* to this host.

Although I'm largely ambivalent, this one sentence tipped me over
towards David's side on this issue.

But this is easy to resolve Eric, just simply make a new custom
counter that counts these new cases you care about and document it
properly.

Thanks.

^ permalink raw reply

* Kernel recieves DNS reply, but doesn't deliver it to a waiting application
From: Andrew Savchenko @ 2012-10-03 19:25 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 2262 bytes --]

Hello,

I encountered a very weird bug: after a while of uptime kernel stops delivering
DNS reply to applications. Tcpdump shows that correct reply is received, but
strace shows inquiring application never receives it and ends with timeout,
epoll_wait() always returns 0:
a slice from: $ host kernel.org 8.8.8.8:

sendmsg(20, {msg_name(16)={sa_family=AF_INET, sin_port=htons(53),
sin_addr=inet_addr("8.8.8.8")}, msg_iov(1)=[{"\266\344\1\0\0\1\0\0\0\0\0\0\6k
ernel\3org\0\0\1\0\1", 28}], msg_controllen=0, msg_flags=0}, 0) = 28            
epoll_wait(3, {}, 64, 0)                = 0                                     
epoll_wait(3, {}, 64, 4999)             = 0

Though tcpdump shows a normal reply:

20:28:44.162897 IP 10.7.74.7.43167 > 8.8.8.8.domain: 46820+ A? kernel.org. (28) 
20:28:44.221308 IP 8.8.8.8.domain > 10.7.74.7.43167: 46820 1/0/0 A 149.20.4.69
(44)

After this bug has occurred, it is no longer possible to perform DNS request on
the crippled system. I tried to stop/restart all network-related daemons, to
recreate network interfaces whenever possible (e.g. pppX devices), but with no
help. I use iptables and ebtables on this host, but resetting them (flushing all
chains, removing user chains, setting all policies to ACCEPT) doesn't help. The
only working solution is to reboot the system.

This bug happens rarely and randomly (about once in 7-12 days on 24x7 available
production system), but I had it 5 times already. Due to rare and random nature
of the bug I can't bisect it.

This problem occurred after I updated vanilla kernel from 2.6.39.4 to 3.4.6.
Afterward I updated kernel to 3.4.10 in the hope that this will fix the
problem, but with no result. (I updated kernel due to commit
2ce42ec4ef551b08d2e5d26775d838ac640f82ad, which describes somewhat similar
issue, though I don't use I/OAT engine due to lack of hardware support.)

More details, attached trace files and kernel configs are available at bugzilla:
https://bugzilla.kernel.org/show_bug.cgi?id=48081

In a few days I'll try 3.4.12 (I need to rebuild kernel anyway due to unrelated
issue) and will report if this bug will occur again. But please note it may
take several weeks to check this.

Best regards,
Andrew Savchenko

[-- Attachment #2: Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* RE: [PATCH 5/5] ixgbe: add driver set_max_vfs support
From: Rose, Gregory V @ 2012-10-03 19:16 UTC (permalink / raw)
  To: Don Dutile, Duyck, Alexander H
  Cc: Yinghai Lu, Bjorn Helgaas, Greg Kroah-Hartman,
	linux-pci@vger.kernel.org, linux-kernel@vger.kernel.org,
	yuvalmin@broadcom.com, bhutchings@solarflare.com,
	davem@davemloft.net--no-chain-reply-to, Kirsher, Jeffrey T,
	Brandeburg, Jesse, David S. Miller, Fastabend, John R,
	e1000-devel@lists.sourceforge.net, netdev@vger.kernel.org
In-Reply-To: <506C8BCB.4080402@redhat.com>

> -----Original Message-----
> From: Don Dutile [mailto:ddutile@redhat.com]
> Sent: Wednesday, October 03, 2012 12:03 PM
> To: Duyck, Alexander H
> Cc: Yinghai Lu; Bjorn Helgaas; Greg Kroah-Hartman; linux-
> pci@vger.kernel.org; linux-kernel@vger.kernel.org; yuvalmin@broadcom.com;
> bhutchings@solarflare.com; Rose, Gregory V; davem@davemloft.net--no-chain-
> reply-to; Kirsher, Jeffrey T; Brandeburg, Jesse; David S. Miller;
> Fastabend, John R; e1000-devel@lists.sourceforge.net;
> netdev@vger.kernel.org
> Subject: Re: [PATCH 5/5] ixgbe: add driver set_max_vfs support
> 
> On 10/03/2012 02:47 PM, Alexander Duyck wrote:

[snip]

> >
> > The ixgbe_set_max_vfs function has several issues.  The two big ones
> > are that this function assumes it can just enable/disable SR-IOV
> > without any other changes being necessary which is not the case.  I
> > would recommend looking at ixgbe_setup_tc for how to do this properly.
> > Secondly is the fact that this code will change the PF network device
> > and as such sections of the code should be called with the RTNL lock
> > held.  In addition I believe you have to disable SR-IOV before
> > enabling it again with a different number of VFs.
> >
> > Below is a link to one of the early patches for igb when we were first
> > introducing SR-IOV, and the in-driver sysfs value had been rejected.
> > I figure it might be useful as it was also using sysfs to
> > enable/disable VFs.  It however doesn't have the correct locking on
> > changing the queues and as such will likely throw an error if you were
> > to implement it the same way now:
> > http://lists.openwall.net/netdev/2009/04/08/34
> >
> > Thanks,
> >
> > Alex
> 
> Alex,
> Thanks for patch set pointer.
> When I started to work on the ixgbe example use based on the RFC set I
> posted, I ran into the problem you outlined -- the PF uses/consumes all
> the queues & MSI intrs when sriov not enabled at driver load time, which
> required more network shutdown logic that I'm not familiar with... So, I
> was going to defer to Greg to work that magic. :)
> Greg: assume the 2 callback function interface in the RFC patch set I
> sent,
>        (primarily, just the include/linux/pci.h changes), and you can make
> the
>        necessary drivers mods from there.  In the meantime, I'll make the
> changes
>        to my original/v1 RFC to reflect the changes that GKH & Yinghai
> recommended/implemented
>        for sysfs attribute creation & removal in a v2 posting.
>        The end result is that the current module parameter setting for
> max_vfs should
>        continue to work, and the sysfs interface will work when those
> pieces are provided.

OK, I'll start work on it.

Thanks Don,

- Greg

> 
> -Don

^ permalink raw reply

* Re: [PATCH 5/5] ixgbe: add driver set_max_vfs support
From: Don Dutile @ 2012-10-03 19:02 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Yinghai Lu, Bjorn Helgaas, Greg Kroah-Hartman, linux-pci,
	linux-kernel, yuvalmin, bhutchings, gregory.v.rose, davem,
	Jeff Kirsher, Jesse Brandeburg, David S. Miller, John Fastabend,
	e1000-devel, netdev
In-Reply-To: <506C8837.5070902@intel.com>

On 10/03/2012 02:47 PM, Alexander Duyck wrote:
> On 10/03/2012 10:51 AM, Yinghai Lu wrote:
>> Need ixgbe guys to close the loop to use set_max_vfs instead
>> kernel parameters.
>>
>> Signed-off-by: Yinghai Lu<yinghai@kernel.org>
>> Cc: Jeff Kirsher<jeffrey.t.kirsher@intel.com>
>> Cc: Jesse Brandeburg<jesse.brandeburg@intel.com>
>> Cc: Greg Rose<gregory.v.rose@intel.com>
>> Cc: "David S. Miller"<davem@davemloft.net>
>> Cc: John Fastabend<john.r.fastabend@intel.com>
>> Cc: e1000-devel@lists.sourceforge.net
>> Cc: netdev@vger.kernel.org
>> ---
>>   drivers/net/ethernet/intel/ixgbe/ixgbe.h      |    2 +
>>   drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   44 +++++++++++++++++++-----
>>   2 files changed, 37 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
>> index b9623e9..d39d975 100644
>> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
>> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
>> @@ -558,6 +558,8 @@ struct ixgbe_adapter {
>>   	u32 interrupt_event;
>>   	u32 led_reg;
>>
>> +	struct ixgbe_info *ixgbe_info;
>> +
>>   #ifdef CONFIG_IXGBE_PTP
>>   	struct ptp_clock *ptp_clock;
>>   	struct ptp_clock_info ptp_caps;
>> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
>> index ee61819..1c097c7 100644
>> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
>> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
>> @@ -129,13 +129,6 @@ static struct notifier_block dca_notifier = {
>>   };
>>   #endif
>>
>> -#ifdef CONFIG_PCI_IOV
>> -static unsigned int max_vfs;
>> -module_param(max_vfs, uint, 0);
>> -MODULE_PARM_DESC(max_vfs,
>> -		 "Maximum number of virtual functions to allocate per physical function - default is zero and maximum value is 63");
>> -#endif /* CONFIG_PCI_IOV */
>> -
>>   static unsigned int allow_unsupported_sfp;
>>   module_param(allow_unsupported_sfp, uint, 0);
>>   MODULE_PARM_DESC(allow_unsupported_sfp,
>> @@ -4496,7 +4489,7 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
>>   #ifdef CONFIG_PCI_IOV
>>   	/* assign number of SR-IOV VFs */
>>   	if (hw->mac.type != ixgbe_mac_82598EB)
>> -		adapter->num_vfs = (max_vfs>  63) ? 0 : max_vfs;
>> +		adapter->num_vfs = min_t(int, pdev->max_vfs, 63);
>>
>>   #endif
>>   	/* enable itr by default in dynamic mode */
>> @@ -7220,8 +7213,9 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
>>
>>   #ifdef CONFIG_PCI_IOV
>>   	ixgbe_enable_sriov(adapter, ii);
>> -
>>   #endif
>> +	adapter->ixgbe_info = ii;
>> +
>>   	netdev->features = NETIF_F_SG |
>>   			   NETIF_F_IP_CSUM |
>>   			   NETIF_F_IPV6_CSUM |
>> @@ -7683,11 +7677,43 @@ static const struct pci_error_handlers ixgbe_err_handler = {
>>   	.resume = ixgbe_io_resume,
>>   };
>>
>> +static void ixgbe_set_max_vfs(struct pci_dev *pdev)
>> +{
>> +#ifdef CONFIG_PCI_IOV
>> +	struct ixgbe_adapter *adapter = pci_get_drvdata(pdev);
>> +	struct ixgbe_hw *hw =&adapter->hw;
>> +	int num_vfs = 0;
>> +
>> +	/* assign number of SR-IOV VFs */
>> +	if (hw->mac.type != ixgbe_mac_82598EB)
>> +		num_vfs = min_t(int, pdev->max_vfs, 63);
>> +
>> +	/* no change */
>> +	if (adapter->num_vfs == num_vfs)
>> +		return;
>> +
>> +	if (!num_vfs) {
>> +		/* disable sriov */
>> +		ixgbe_disable_sriov(adapter);
>> +		adapter->num_vfs = 0;
>> +	} else if (!adapter->num_vfs&&  num_vfs) {
>> +		/* enable sriov */
>> +		adapter->num_vfs = num_vfs;
>> +		ixgbe_enable_sriov(adapter, adapter->ixgbe_info);
>> +	} else {
>> +		/* increase or decrease */
>> +	}
>> +
>> +	pdev->max_vfs = adapter->num_vfs;
>> +#endif
>> +}
>> +
>>   static struct pci_driver ixgbe_driver = {
>>   	.name     = ixgbe_driver_name,
>>   	.id_table = ixgbe_pci_tbl,
>>   	.probe    = ixgbe_probe,
>>   	.remove   = __devexit_p(ixgbe_remove),
>> +	.set_max_vfs = ixgbe_set_max_vfs,
>>   #ifdef CONFIG_PM
>>   	.suspend  = ixgbe_suspend,
>>   	.resume   = ixgbe_resume,
>
> The ixgbe_set_max_vfs function has several issues.  The two big ones are
> that this function assumes it can just enable/disable SR-IOV without any
> other changes being necessary which is not the case.  I would recommend
> looking at ixgbe_setup_tc for how to do this properly.  Secondly is the
> fact that this code will change the PF network device and as such
> sections of the code should be called with the RTNL lock held.  In
> addition I believe you have to disable SR-IOV before enabling it again
> with a different number of VFs.
>
> Below is a link to one of the early patches for igb when we were first
> introducing SR-IOV, and the in-driver sysfs value had been rejected.  I
> figure it might be useful as it was also using sysfs to enable/disable
> VFs.  It however doesn't have the correct locking on changing the queues
> and as such will likely throw an error if you were to implement it the
> same way now:
> http://lists.openwall.net/netdev/2009/04/08/34
>
> Thanks,
>
> Alex

Alex,
Thanks for patch set pointer.
When I started to work on the ixgbe example use based on the RFC set I posted,
I ran into the problem you outlined -- the PF uses/consumes all the queues &
MSI intrs when sriov not enabled at driver load time, which required
more network shutdown logic that I'm not familiar with... So, I was going
to defer to Greg to work that magic. :)
Greg: assume the 2 callback function interface in the RFC patch set I sent,
       (primarily, just the include/linux/pci.h changes), and you can make the
       necessary drivers mods from there.  In the meantime, I'll make the changes
       to my original/v1 RFC to reflect the changes that GKH & Yinghai recommended/implemented
       for sysfs attribute creation & removal in a v2 posting.
       The end result is that the current module parameter setting for max_vfs should
       continue to work, and the sysfs interface will work when those pieces are provided.

-Don

^ permalink raw reply

* Re: [PATCH 19/20] drivers/net/ethernet/marvell/skge.c: fix error return code
From: David Miller @ 2012-10-03 18:48 UTC (permalink / raw)
  To: shemminger; +Cc: peter.senna, mlindner, kernel-janitors, netdev, linux-kernel
In-Reply-To: <20121003092508.6a7da662@nehalam.linuxnetplumber.net>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Wed, 3 Oct 2012 09:25:08 -0700

> On Wed,  3 Oct 2012 18:18:10 +0200
> Peter Senna Tschudin <peter.senna@gmail.com> wrote:
> 
>> From: Peter Senna Tschudin <peter.senna@gmail.com>
>> 
>> Convert a nonnegative error return code to a negative one, as returned
>> elsewhere in the function.
>> 
>> A simplified version of the semantic match that finds this problem is as
>> follows: (http://coccinelle.lip6.fr/)
>> 
>> // <smpl>
>> (
>> if@p1 (\(ret < 0\|ret != 0\))
>>  { ... return ret; }
>> |
>> ret@p1 = 0
>> )
>> ... when != ret = e1
>>     when != &ret
>> *if(...)
>> {
>>   ... when != ret = e2
>>       when forall
>>  return ret;
>> }
>> // </smpl>
>> 
>> Signed-off-by: Peter Senna Tschudin <peter.senna@gmail.com>
>> 
> 
> Thanks for looking into these kind of problems. The contents
> of the patch are correct, but the automated commit message is useless.
> You shouldn't just blindly say what the automated
> script was looking for, you should describe what the bug is so that evaluators
> can decide what the impact is and if it should be backported to stable
> and vendor kernels.

Agreed, I like seeing the checker script but I hate that the entire
commit message is automated and has no human analysis or comments.

^ permalink raw reply

* Re: [PATCH 5/5] ixgbe: add driver set_max_vfs support
From: Alexander Duyck @ 2012-10-03 18:47 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: e1000-devel, Greg Kroah-Hartman, linux-kernel, Jesse Brandeburg,
	John Fastabend, yuvalmin, netdev, Don Dutile, linux-pci,
	Bjorn Helgaas, bhutchings, David S. Miller, davem
In-Reply-To: <1349286695-26713-6-git-send-email-yinghai@kernel.org>

On 10/03/2012 10:51 AM, Yinghai Lu wrote:
> Need ixgbe guys to close the loop to use set_max_vfs instead
> kernel parameters.
>
> Signed-off-by: Yinghai Lu <yinghai@kernel.org>
> Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
> Cc: Greg Rose <gregory.v.rose@intel.com>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: John Fastabend <john.r.fastabend@intel.com>
> Cc: e1000-devel@lists.sourceforge.net
> Cc: netdev@vger.kernel.org
> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe.h      |    2 +
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   44 +++++++++++++++++++-----
>  2 files changed, 37 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
> index b9623e9..d39d975 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
> @@ -558,6 +558,8 @@ struct ixgbe_adapter {
>  	u32 interrupt_event;
>  	u32 led_reg;
>  
> +	struct ixgbe_info *ixgbe_info;
> +
>  #ifdef CONFIG_IXGBE_PTP
>  	struct ptp_clock *ptp_clock;
>  	struct ptp_clock_info ptp_caps;
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index ee61819..1c097c7 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -129,13 +129,6 @@ static struct notifier_block dca_notifier = {
>  };
>  #endif
>  
> -#ifdef CONFIG_PCI_IOV
> -static unsigned int max_vfs;
> -module_param(max_vfs, uint, 0);
> -MODULE_PARM_DESC(max_vfs,
> -		 "Maximum number of virtual functions to allocate per physical function - default is zero and maximum value is 63");
> -#endif /* CONFIG_PCI_IOV */
> -
>  static unsigned int allow_unsupported_sfp;
>  module_param(allow_unsupported_sfp, uint, 0);
>  MODULE_PARM_DESC(allow_unsupported_sfp,
> @@ -4496,7 +4489,7 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
>  #ifdef CONFIG_PCI_IOV
>  	/* assign number of SR-IOV VFs */
>  	if (hw->mac.type != ixgbe_mac_82598EB)
> -		adapter->num_vfs = (max_vfs > 63) ? 0 : max_vfs;
> +		adapter->num_vfs = min_t(int, pdev->max_vfs, 63);
>  
>  #endif
>  	/* enable itr by default in dynamic mode */
> @@ -7220,8 +7213,9 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
>  
>  #ifdef CONFIG_PCI_IOV
>  	ixgbe_enable_sriov(adapter, ii);
> -
>  #endif
> +	adapter->ixgbe_info = ii;
> +
>  	netdev->features = NETIF_F_SG |
>  			   NETIF_F_IP_CSUM |
>  			   NETIF_F_IPV6_CSUM |
> @@ -7683,11 +7677,43 @@ static const struct pci_error_handlers ixgbe_err_handler = {
>  	.resume = ixgbe_io_resume,
>  };
>  
> +static void ixgbe_set_max_vfs(struct pci_dev *pdev)
> +{
> +#ifdef CONFIG_PCI_IOV
> +	struct ixgbe_adapter *adapter = pci_get_drvdata(pdev);
> +	struct ixgbe_hw *hw = &adapter->hw;
> +	int num_vfs = 0;
> +
> +	/* assign number of SR-IOV VFs */
> +	if (hw->mac.type != ixgbe_mac_82598EB)
> +		num_vfs = min_t(int, pdev->max_vfs, 63);
> +
> +	/* no change */
> +	if (adapter->num_vfs == num_vfs)
> +		return;
> +
> +	if (!num_vfs) {
> +		/* disable sriov */
> +		ixgbe_disable_sriov(adapter);
> +		adapter->num_vfs = 0;
> +	} else if (!adapter->num_vfs && num_vfs) {
> +		/* enable sriov */
> +		adapter->num_vfs = num_vfs;
> +		ixgbe_enable_sriov(adapter, adapter->ixgbe_info);
> +	} else {
> +		/* increase or decrease */
> +	}
> +
> +	pdev->max_vfs = adapter->num_vfs;
> +#endif
> +}
> +
>  static struct pci_driver ixgbe_driver = {
>  	.name     = ixgbe_driver_name,
>  	.id_table = ixgbe_pci_tbl,
>  	.probe    = ixgbe_probe,
>  	.remove   = __devexit_p(ixgbe_remove),
> +	.set_max_vfs = ixgbe_set_max_vfs,
>  #ifdef CONFIG_PM
>  	.suspend  = ixgbe_suspend,
>  	.resume   = ixgbe_resume,

The ixgbe_set_max_vfs function has several issues.  The two big ones are
that this function assumes it can just enable/disable SR-IOV without any
other changes being necessary which is not the case.  I would recommend
looking at ixgbe_setup_tc for how to do this properly.  Secondly is the
fact that this code will change the PF network device and as such
sections of the code should be called with the RTNL lock held.  In
addition I believe you have to disable SR-IOV before enabling it again
with a different number of VFs.

Below is a link to one of the early patches for igb when we were first
introducing SR-IOV, and the in-driver sysfs value had been rejected.  I
figure it might be useful as it was also using sysfs to enable/disable
VFs.  It however doesn't have the correct locking on changing the queues
and as such will likely throw an error if you were to implement it the
same way now:
http://lists.openwall.net/netdev/2009/04/08/34

Thanks,

Alex

------------------------------------------------------------------------------
Don't let slow site performance ruin your business. Deploy New Relic APM
Deploy New Relic app performance management and know exactly
what is happening inside your Ruby, Python, PHP, Java, and .NET app
Try New Relic at no cost today and get our sweet Data Nerd shirt too!
http://p.sf.net/sfu/newrelic-dev2dev
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel® Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [PATCH 5/5] ixgbe: add driver set_max_vfs support
From: Dan Carpenter @ 2012-10-03 18:45 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: e1000-devel, Greg Kroah-Hartman, linux-kernel, Jesse Brandeburg,
	John Fastabend, yuvalmin, netdev, Don Dutile, linux-pci,
	Bjorn Helgaas, bhutchings, David S. Miller, davem
In-Reply-To: <1349286695-26713-6-git-send-email-yinghai@kernel.org>

On Wed, Oct 03, 2012 at 10:51:35AM -0700, Yinghai Lu wrote:
> Need ixgbe guys to close the loop to use set_max_vfs instead
> kernel parameters.
> 
> Signed-off-by: Yinghai Lu <yinghai@kernel.org>
> Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
> Cc: Greg Rose <gregory.v.rose@intel.com>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: John Fastabend <john.r.fastabend@intel.com>
> Cc: e1000-devel@lists.sourceforge.net
> Cc: netdev@vger.kernel.org
> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe.h      |    2 +
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   44 +++++++++++++++++++-----
>  2 files changed, 37 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
> index b9623e9..d39d975 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
> @@ -558,6 +558,8 @@ struct ixgbe_adapter {
>  	u32 interrupt_event;
>  	u32 led_reg;
>  
> +	struct ixgbe_info *ixgbe_info;
> +
>  #ifdef CONFIG_IXGBE_PTP
>  	struct ptp_clock *ptp_clock;
>  	struct ptp_clock_info ptp_caps;
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index ee61819..1c097c7 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -129,13 +129,6 @@ static struct notifier_block dca_notifier = {
>  };
>  #endif
>  
> -#ifdef CONFIG_PCI_IOV
> -static unsigned int max_vfs;
> -module_param(max_vfs, uint, 0);
> -MODULE_PARM_DESC(max_vfs,
> -		 "Maximum number of virtual functions to allocate per physical function - default is zero and maximum value is 63");
> -#endif /* CONFIG_PCI_IOV */
> -
>  static unsigned int allow_unsupported_sfp;
>  module_param(allow_unsupported_sfp, uint, 0);
>  MODULE_PARM_DESC(allow_unsupported_sfp,
> @@ -4496,7 +4489,7 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
>  #ifdef CONFIG_PCI_IOV
>  	/* assign number of SR-IOV VFs */
>  	if (hw->mac.type != ixgbe_mac_82598EB)
> -		adapter->num_vfs = (max_vfs > 63) ? 0 : max_vfs;
> +		adapter->num_vfs = min_t(int, pdev->max_vfs, 63);

Could we make this min_t(uint, ...);

->max_vfs is type unsigned int. We take an unsigned long from sysfs.
We silently truncate it to an unsigned int.  Then we cast it to a
negative number and compare against 63 and take the minimum...

It's root only so it's not a problem but it's a hassle to audit.

regards,
dan carpenter


------------------------------------------------------------------------------
Don't let slow site performance ruin your business. Deploy New Relic APM
Deploy New Relic app performance management and know exactly
what is happening inside your Ruby, Python, PHP, Java, and .NET app
Try New Relic at no cost today and get our sweet Data Nerd shirt too!
http://p.sf.net/sfu/newrelic-dev2dev
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel® Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: drivers/net/cris/eth_v10.c:1715:2: error: too many arguments to function 'e100rxtx_interrupt'
From: David Miller @ 2012-10-03 18:40 UTC (permalink / raw)
  To: jesper.nilsson; +Cc: fengguang.wu, kernel-janitors, netdev
In-Reply-To: <20121003104652.GK1390@axis.com>

From: Jesper Nilsson <jesper.nilsson@axis.com>
Date: Wed, 3 Oct 2012 12:46:52 +0200

> On Fri, Sep 28, 2012 at 03:06:08PM +0200, Fengguang Wu wrote:
>> Hi Jesper,
> 
> Hi!
> 
>> FYI, a rather old build bug that's introduced by
>> 
>> bafef0a cris build fixes: update eth_v10.c ethernet driver
>> 
>> All error/warnings:
>> 
>> drivers/net/cris/eth_v10.c: In function 'e100_netpoll':
>> drivers/net/cris/eth_v10.c:1715:2: error: too many arguments to function 'e100rxtx_interrupt'
>> drivers/net/cris/eth_v10.c:1131:1: note: declared here
> 
> Yep, I can't figure out why the followup patches never reached mainline,
> but we have fixes for exactly that in our in-house tree.
> I'll push some more patches after this merge window.

It's a bug fix, even worse a build fix, why wait until after the merge
window?

^ permalink raw reply

* Re: [Patch net-next] netpoll: call ->ndo_select_queue() in tx path
From: David Miller @ 2012-10-03 18:39 UTC (permalink / raw)
  To: s.munaut; +Cc: amwang, netdev, edumazet
In-Reply-To: <CAF6-1L5Gp9kpV+Koru6uAmYxQg_WQav9RVD4MVBDa2UGSoPOCA@mail.gmail.com>

From: Sylvain Munaut <s.munaut@whatever-company.com>
Date: Wed, 3 Oct 2012 11:33:22 +0200

> Hi,
> 
>>> In netpoll tx path, we miss the chance of calling ->ndo_select_queue(),
>>> thus could cause problems when bonding is involved.
>>>
>>> This patch makes dev_pick_tx() extern (and rename it to netdev_pick_tx())
>>> to let netpoll call it in netpoll_send_skb_on_dev().
>>>
>>> Reported-by: Sylvain Munaut <s.munaut@whatever-company.com>
>>> Cc: "David S. Miller" <davem@davemloft.net>
>>> Cc: Eric Dumazet <edumazet@google.com>
>>> Signed-off-by: Cong Wang <amwang@redhat.com>
>>> Tested-by: Sylvain Munaut <s.munaut@whatever-company.com>
>>
>> Applied, thanks.
> 
> Huh, I don't see it in the final 3.6 ?
> That's rather inconvenient :(

What part of "net-next" in the subject line do you not understand?

^ permalink raw reply

* Re: [PATCH net-next v2] netxen: write IP address to firmware when using bonding
From: David Miller @ 2012-10-03 18:38 UTC (permalink / raw)
  To: nikolay; +Cc: sony.chacko, agospoda, rajesh.borundia, netdev
In-Reply-To: <506C0346.8010708@redhat.com>

From: Nikolay Aleksandrov <nikolay@redhat.com>
Date: Wed, 03 Oct 2012 11:20:06 +0200

> I just synced with upstream, I've missed a few patches and it seems
> that it doesn't apply cleanly because my previous patch was
> changed before it was applied. There is one character missing from
> a comment - "/* root bus? */", in upstream it was changed to
> /* root bus */.
> ("netxen: check for root bus in netxen_mask_aer_correctable")
> 
> About the rest, after QLogic tests the functionality I'll clean-up the
> empty lines and re-send it.

Can you please not quote an entire patch just to make a small series
of comments?

Just quote the exact relevant portions of the patch if you want to
specifically make comments about something.

Quoting the entire patch is extremely bad netiquette, wastes
bandwidth, and everyone on the list has to receive a whole new copy of
the entire patch again for no good reason.

^ permalink raw reply

* Re: [PATCH 5/5] ixgbe: add driver set_max_vfs support
From: Yinghai Lu @ 2012-10-03 17:57 UTC (permalink / raw)
  To: Bjorn Helgaas, Greg Kroah-Hartman
  Cc: linux-pci, linux-kernel, Don Dutile, yuvalmin, bhutchings,
	gregory.v.rose, Yinghai Lu, Jeff Kirsher, Jesse Brandeburg,
	David S. Miller, John Fastabend, e1000-devel, netdev
In-Reply-To: <1349286695-26713-6-git-send-email-yinghai@kernel.org>

On Wed, Oct 3, 2012 at 10:51 AM, Yinghai Lu <yinghai@kernel.org> wrote:
> Need ixgbe guys to close the loop to use set_max_vfs instead of
> kernel parameters.

Sorry, I should have put RFC in the subject line for this one.

Thanks

Yinghai

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox