Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH ethtool] ethtool: merge ETHTOOL_[GS]FEATURES support to -k/-K modes
From: Michał Mirosław @ 2011-05-17 20:33 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, David Miller
In-Reply-To: <20110517084543.GB18423@rere.qmqm.pl>

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---

This depends on the previous patch introducing -w/-W for [GS]FEATURES.

BTW, I noticed an old bug in ethtool (present currently and in debian-lenny's
version 6+20080913-1): "ethtool -k" -- i.e. with no other parameters -- runs
and tries to check device named '-k'.

Example:

icybox:~# ./ethtool -k ge0
Offload parameters for ge0:
rx-checksumming: on
tx-checksumming: off
scatter-gather: off
tcp-segmentation-offload: off
udp-fragmentation-offload: off
generic-segmentation-offload: off
generic-receive-offload: on
large-receive-offload: off
rx-vlan-offload: off
tx-vlan-offload: off
ntuple-filters: off
receive-hashing: off

Full offload state:  (feature-name: active,wanted,changable)
     tx-scatter-gather                 no,yes,yes
     tx-checksum-ipv4                  no, no,yes
     tx-checksum-unneeded              no,---, no
     tx-checksum-ip-generic            no,---, no
     tx_checksum-ipv6                  no, no,yes
     highdma                           no,---, no
     tx-scatter-gather-fraglist        no,---, no
     tx-vlan-hw-insert                 no,---, no
     rx-vlan-hw-parse                  no,---, no
     rx-vlan-filter                    no,---, no
     vlan-challenged                   no,---,---
     tx-generic-segmentation           no,yes,yes
     tx-lockless                       no,---,---
     netns-local                       no,---,---
     rx-gro                           yes,yes,yes
     rx-lro                            no,---, no
     tx-tcp-segmentation               no, no,yes
     tx-udp-fragmentation              no,---, no
     tx-gso-robust                     no,---, no
     tx-tcp-ecn-segmentation           no, no,yes
     tx-tcp6-segmentation              no, no,yes
     tx-fcoe-segmentation              no,---, no
     tx-checksum-fcoe-crc              no,---, no
     tx-checksum-sctp                  no,---, no
     fcoe-mtu                          no,---, no
     rx-ntuple-filter                  no,---, no
     rx-hashing                        no,---, no
     rx-checksum                      yes,yes,yes
     tx-nocache-copy                   no, no,yes
     loopback                          no,---, no

icybox:~# ./ethtool -K ge0 tx_checksum-ipv6 on
feature tx-scatter-gather is enabled (expected: disabled, saved: enabled)
feature tx-generic-segmentation is enabled (expected: disabled, saved: enabled)

icybox:~# ./ethtool -K ge0 sg off
[turns off SG and GSO; like before]

icybox:~# ./ethtool -K ge0 tx_checksum-ipv6 off
[no other feature changed state]

icybox:~# ./ethtool -K ge0 sg on
[SG was remembered this time, but is inactive (no checksum offloads)]

icybox:~# ./ethtool -K ge0 tx_checksum-ipv6 on
feature tx-scatter-gather is enabled (expected: disabled, saved: enabled)
feature tx-generic-segmentation is enabled (expected: disabled, saved: enabled)

icybox:~# ./ethtool -K ge0 tx_checksum-ipv6 off
feature tx-scatter-gather is disabled (expected: enabled, saved: enabled)
feature tx-generic-segmentation is disabled (expected: enabled, saved: enabled)

---
 ethtool.c |  155 ++++++++++++++++++++++++++++--------------------------------
 1 files changed, 72 insertions(+), 83 deletions(-)

diff --git a/ethtool.c b/ethtool.c
index 86a5a8b..a541007 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -123,8 +123,6 @@ static enum {
 	MODE_SRING,
 	MODE_GOFFLOAD,
 	MODE_SOFFLOAD,
-	MODE_GFEATURES,
-	MODE_SFEATURES,
 	MODE_GSTATS,
 	MODE_GNFC,
 	MODE_SNFC,
@@ -202,9 +200,6 @@ static struct option {
 		"		[ txvlan on|off ]\n"
 		"		[ ntuple on|off ]\n"
 		"		[ rxhash on|off ]\n"
-    },
-    { "-w", "--show-features", MODE_GFEATURES, "Get offload status" },
-    { "-W", "--request-features", MODE_SFEATURES, "Set requested offload",
 		"		[ feature-name on|off [...] ]\n"
 		"		see --show-features output for feature-name strings\n" },
     { "-i", "--driver", MODE_GDRV, "Show driver information" },
@@ -306,7 +301,6 @@ static void show_usage(void)
 
 static char *devname = NULL;
 
-static int goffload_changed = 0;
 static int off_csum_rx_wanted = -1;
 static int off_csum_tx_wanted = -1;
 static int off_sg_wanted = -1;
@@ -316,6 +310,7 @@ static int off_gso_wanted = -1;
 static u32 off_flags_wanted = 0;
 static u32 off_flags_mask = 0;
 static int off_gro_wanted = -1;
+static struct ethtool_sfeatures *features_req;
 
 static struct ethtool_pauseparam epause;
 static int gpause_changed = 0;
@@ -778,8 +773,6 @@ static void parse_cmdline(int argc, char **argp)
 			    (mode == MODE_SRING) ||
 			    (mode == MODE_GOFFLOAD) ||
 			    (mode == MODE_SOFFLOAD) ||
-			    (mode == MODE_GFEATURES) ||
-			    (mode == MODE_SFEATURES) ||
 			    (mode == MODE_GSTATS) ||
 			    (mode == MODE_GNFC) ||
 			    (mode == MODE_SNFC) ||
@@ -863,14 +856,6 @@ static void parse_cmdline(int argc, char **argp)
 				break;
 			}
 			if (mode == MODE_SOFFLOAD) {
-				parse_generic_cmdline(argc, argp, i,
-					&goffload_changed,
-			      		cmdline_offload,
-			      		ARRAY_SIZE(cmdline_offload));
-				i = argc;
-				break;
-			}
-			if (mode == MODE_SFEATURES) {
 				parse_sfeatures_args(argc, argp, i);
 				i = argc;
 				break;
@@ -1944,10 +1929,6 @@ static int doit(void)
 		return do_goffload(fd, &ifr);
 	} else if (mode == MODE_SOFFLOAD) {
 		return do_soffload(fd, &ifr);
-	} else if (mode == MODE_GFEATURES) {
-		return do_gfeatures(fd, &ifr);
-	} else if (mode == MODE_SFEATURES) {
-		return do_sfeatures(fd, &ifr);
 	} else if (mode == MODE_GSTATS) {
 		return do_gstats(fd, &ifr);
 	} else if (mode == MODE_GNFC) {
@@ -2262,13 +2243,20 @@ static int do_goffload(int fd, struct ifreq *ifr)
 		allfail = 0;
 	}
 
+	if (!allfail)
+		dump_offload(rx, tx, sg, tso, ufo, gso, gro, lro, rxvlan, txvlan,
+			    ntuple, rxhash);
+
+	err = do_gfeatures(fd, ifr);
+	if (!err)
+		allfail = 0;
+
 	if (allfail) {
 		fprintf(stdout, "no offload info available\n");
 		return 83;
 	}
 
-	return dump_offload(rx, tx, sg, tso, ufo, gso, gro, lro, rxvlan, txvlan,
-			    ntuple, rxhash);
+	return 0;
 }
 
 static int do_soffload(int fd, struct ifreq *ifr)
@@ -2277,116 +2265,114 @@ static int do_soffload(int fd, struct ifreq *ifr)
 	int err, changed = 0;
 
 	if (off_csum_rx_wanted >= 0) {
-		changed = 1;
 		eval.cmd = ETHTOOL_SRXCSUM;
 		eval.data = (off_csum_rx_wanted == 1);
 		ifr->ifr_data = (caddr_t)&eval;
 		err = send_ioctl(fd, ifr);
-		if (err) {
+		if (err)
 			perror("Cannot set device rx csum settings");
-			return 84;
-		}
+		else
+			changed = 1;
 	}
 
 	if (off_csum_tx_wanted >= 0) {
-		changed = 1;
 		eval.cmd = ETHTOOL_STXCSUM;
 		eval.data = (off_csum_tx_wanted == 1);
 		ifr->ifr_data = (caddr_t)&eval;
 		err = send_ioctl(fd, ifr);
-		if (err) {
+		if (err)
 			perror("Cannot set device tx csum settings");
-			return 85;
-		}
+		else
+			changed = 1;
 	}
 
 	if (off_sg_wanted >= 0) {
-		changed = 1;
 		eval.cmd = ETHTOOL_SSG;
 		eval.data = (off_sg_wanted == 1);
 		ifr->ifr_data = (caddr_t)&eval;
 		err = send_ioctl(fd, ifr);
-		if (err) {
+		if (err)
 			perror("Cannot set device scatter-gather settings");
-			return 86;
-		}
+		else
+			changed = 1;
 	}
 
 	if (off_tso_wanted >= 0) {
-		changed = 1;
 		eval.cmd = ETHTOOL_STSO;
 		eval.data = (off_tso_wanted == 1);
 		ifr->ifr_data = (caddr_t)&eval;
 		err = send_ioctl(fd, ifr);
-		if (err) {
+		if (err)
 			perror("Cannot set device tcp segmentation offload settings");
-			return 88;
-		}
+		else
+			changed = 1;
 	}
 	if (off_ufo_wanted >= 0) {
-		changed = 1;
 		eval.cmd = ETHTOOL_SUFO;
 		eval.data = (off_ufo_wanted == 1);
 		ifr->ifr_data = (caddr_t)&eval;
 		err = ioctl(fd, SIOCETHTOOL, ifr);
-		if (err) {
+		if (err)
 			perror("Cannot set device udp large send offload settings");
-			return 89;
-		}
+		else
+			changed = 1;
 	}
 	if (off_gso_wanted >= 0) {
-		changed = 1;
 		eval.cmd = ETHTOOL_SGSO;
 		eval.data = (off_gso_wanted == 1);
 		ifr->ifr_data = (caddr_t)&eval;
 		err = ioctl(fd, SIOCETHTOOL, ifr);
-		if (err) {
+		if (err)
 			perror("Cannot set device generic segmentation offload settings");
-			return 90;
-		}
+		else
+			changed = 1;
 	}
 	if (off_flags_mask) {
-		changed = 1;
 		eval.cmd = ETHTOOL_GFLAGS;
 		eval.data = 0;
 		ifr->ifr_data = (caddr_t)&eval;
 		err = ioctl(fd, SIOCETHTOOL, ifr);
 		if (err) {
 			perror("Cannot get device flag settings");
-			return 91;
-		}
+		} else {
+			eval.cmd = ETHTOOL_SFLAGS;
+			eval.data = ((eval.data & ~off_flags_mask) |
+				     off_flags_wanted);
 
-		eval.cmd = ETHTOOL_SFLAGS;
-		eval.data = ((eval.data & ~off_flags_mask) |
-			     off_flags_wanted);
-
-		err = ioctl(fd, SIOCETHTOOL, ifr);
-		if (err) {
-			perror("Cannot set device flag settings");
-			return 92;
+			err = ioctl(fd, SIOCETHTOOL, ifr);
+			if (err)
+				perror("Cannot set device flag settings");
+			else
+				changed = 1;
 		}
 	}
 	if (off_gro_wanted >= 0) {
-		changed = 1;
 		eval.cmd = ETHTOOL_SGRO;
 		eval.data = (off_gro_wanted == 1);
 		ifr->ifr_data = (caddr_t)&eval;
 		err = ioctl(fd, SIOCETHTOOL, ifr);
-		if (err) {
+		if (err)
 			perror("Cannot set device GRO settings");
-			return 93;
-		}
+		else
+			changed = 1;
+	}
+
+	if (features_req) {
+		err = do_sfeatures(fd, ifr);
+		if (!err)
+			changed = 1;
 	}
 
 	if (!changed) {
 		fprintf(stdout, "no offload settings changed\n");
+		return err;
 	}
 
 	return 0;
 }
 
 static int get_feature_strings(int fd, struct ifreq *ifr,
-	struct ethtool_gstrings **strs)
+	struct ethtool_gstrings **strs, int quiet_nx)
 {
 	struct ethtool_sset_info *sset_info;
 	struct ethtool_gstrings *strings;
@@ -2398,16 +2384,18 @@ static int get_feature_strings(int fd, struct ifreq *ifr,
 	ifr->ifr_data = (caddr_t)sset_info;
 	err = send_ioctl(fd, ifr);
 
-	if ((err < 0) ||
-	    (!(sset_info->sset_mask & (1ULL << ETH_SS_FEATURES)))) {
-		perror("Cannot get driver strings info");
-		return -100;
-	}
-
 	n_strings = sset_info->data[0];
 	free(sset_info);
+
+	if ((err < 0) ||
+	    (!(sset_info->sset_mask & (1ULL << ETH_SS_FEATURES))) ||
+	    (n_strings == 0)) {
+		if (!quiet_nx)
+			perror("Cannot get driver strings info");
+		return -100;
+	}
+
 	sz_str = n_strings * ETH_GSTRING_LEN;
-
 	strings = calloc(1, sz_str + sizeof(struct ethtool_gstrings));
 	if (!strings) {
 		fprintf(stderr, "no memory available\n");
@@ -2429,8 +2417,6 @@ static int get_feature_strings(int fd, struct ifreq *ifr,
 	return n_strings;
 }
 
-struct ethtool_sfeatures *features_req;
-
 static void parse_sfeatures_args(int argc, char **argp, int argi)
 {
 	struct cmdline_info *cmdline_desc, *cp;
@@ -2443,13 +2429,21 @@ static void parse_sfeatures_args(int argc, char **argp, int argi)
 	if (fd < 0)
 		exit(100);
 
-	n_strings = get_feature_strings(fd, &ifr, &strings);
-	if (n_strings < 0)
-		exit(-n_strings);
+	n_strings = get_feature_strings(fd, &ifr, &strings, 1);
+	if (n_strings < 0) {
+		/* ETHTOOL_GFEATURES unavailable */
+		parse_generic_cmdline(argc, argp, argi, &changed,
+			cmdline_offload, ARRAY_SIZE(cmdline_offload));
+		return;
+	}
 
 	sz_features = sizeof(*features_req->features) * ((n_strings + 31) / 32);
 
-	cp = cmdline_desc = calloc(n_strings, sizeof(*cmdline_desc));
+	cp = cmdline_desc = calloc(n_strings + ARRAY_SIZE(cmdline_offload),
+		sizeof(*cmdline_desc));
+	memcpy(cp, cmdline_offload, sizeof(cmdline_offload));
+	cp += ARRAY_SIZE(cmdline_offload);
+
 	features_req = calloc(1, sizeof(*features_req) + sz_features);
 	if (!cmdline_desc || !features_req) {
 		fprintf(stderr, "no memory available\n");
@@ -2518,7 +2512,7 @@ static int do_gfeatures(int fd, struct ifreq *ifr)
 	struct ethtool_gfeatures *features;
 	int n_strings, err, i;
 
-	n_strings = get_feature_strings(fd, ifr, &strings);
+	n_strings = get_feature_strings(fd, ifr, &strings, 1);
 	if (n_strings < 0)
 		return -n_strings;
 
@@ -2528,7 +2522,7 @@ static int do_gfeatures(int fd, struct ifreq *ifr)
 		return err;
 	}
 
-	fprintf(stdout, "Offload state:  (name: enabled,wanted,changable)\n");
+	fprintf(stdout, "\nFull offload state:  (feature-name: active,wanted,changable)\n");
 	for (i = 0; i < n_strings; i++) {
 		if (!strings->data[i * ETH_GSTRING_LEN])
 			continue;	/* empty */
@@ -2585,12 +2579,7 @@ static int do_sfeatures(int fd, struct ifreq *ifr)
 	struct ethtool_gfeatures *features0, *features1;
 	int n_strings, err, i;
 
-	if (!features_req) {
-		fprintf(stderr, "no features changed\n");
-		return 97;
-	}
-
-	n_strings = get_feature_strings(fd, ifr, &strings);
+	n_strings = get_feature_strings(fd, ifr, &strings, 0);
 	if (n_strings < 0) {
 		free(features_req);
 		return -n_strings;
-- 
1.7.2.5


^ permalink raw reply related

* [PATCH] net: ethtool: fix IPV6 checksum feature name string
From: Michał Mirosław @ 2011-05-17 20:40 UTC (permalink / raw)
  To: netdev; +Cc: Ben Hutchings, David Miller

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
This is against net-next but shoud go also to net.

 net/core/ethtool.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index ed6c0d4..59255c1 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -355,7 +355,7 @@ static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GS
 	/* NETIF_F_IP_CSUM */         "tx-checksum-ipv4",
 	/* NETIF_F_NO_CSUM */         "tx-checksum-unneeded",
 	/* NETIF_F_HW_CSUM */         "tx-checksum-ip-generic",
-	/* NETIF_F_IPV6_CSUM */       "tx_checksum-ipv6",
+	/* NETIF_F_IPV6_CSUM */       "tx-checksum-ipv6",
 	/* NETIF_F_HIGHDMA */         "highdma",
 	/* NETIF_F_FRAGLIST */        "tx-scatter-gather-fraglist",
 	/* NETIF_F_HW_VLAN_TX */      "tx-vlan-hw-insert",
-- 
1.7.2.5


^ permalink raw reply related

* [TEST PATCH net-next] vhost: accumulate multiple used and sigal in vhost TX test
From: Shirley Ma @ 2011-05-17 20:46 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel
In-Reply-To: <1305646444.10756.16.camel@localhost.localdomain>

Hello Michael,

Here is the patch I used to test out of order before: add used in a pend
array, and swap the last two ids. 

I used to hit an issue, but now it seems working well.

This won't impact zero-copy patch since we need to maintain the pend
used ids anyway.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---
 drivers/vhost/net.c   |   24 +++++++++++++++++++++++-
 drivers/vhost/vhost.c |   11 +++++++++++
 drivers/vhost/vhost.h |    1 +
 3 files changed, 35 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2f7c76a..19e1baa 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,8 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
+#define VHOST_MAX_PEND	128
+
 enum {
 	VHOST_NET_VQ_RX = 0,
 	VHOST_NET_VQ_TX = 1,
@@ -198,13 +200,33 @@ static void handle_tx(struct vhost_net *net)
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		vq->heads[vq->pend_idx].id = head;
+		vq->heads[vq->pend_idx].len = 0;
+		++vq->pend_idx;
+		if (vq->pend_idx >= VHOST_MAX_PEND) {
+			int id;
+			id = vq->heads[vq->pend_idx-1].id;
+			vq->heads[vq->pend_idx-1].id = vq->heads[vq->pend_idx-2].id; 
+			vq->heads[vq->pend_idx-2].id = id;
+			vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+						    vq->pend_idx);
+			vq->pend_idx = 0;
+		}
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
 			break;
 		}
 	}
+	if (vq->pend_idx >= VHOST_MAX_PEND) {
+		int id;
+		id = vq->heads[vq->pend_idx-1].id;
+		vq->heads[vq->pend_idx-1].id = vq->heads[vq->pend_idx-2].id;
+		vq->heads[vq->pend_idx-2].id = id;
+		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+					    vq->pend_idx);
+		vq->pend_idx = 0;
+	}
 
 	mutex_unlock(&vq->mutex);
 }
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ab2912..7eea6b3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -174,6 +174,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	vq->pend_idx = 0;
 }
 
 static int vhost_worker(void *data)
@@ -395,6 +396,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 			vhost_poll_stop(&dev->vqs[i].poll);
 			vhost_poll_flush(&dev->vqs[i].poll);
 		}
+		if (dev->vqs[i].pend_idx != 0) {
+			vhost_add_used_and_signal_n(dev, &dev->vqs[i],
+				dev->vqs[i].heads, dev->vqs[i].pend_idx);
+			dev->vqs[i].pend_idx = 0;
+		}
 		if (dev->vqs[i].error_ctx)
 			eventfd_ctx_put(dev->vqs[i].error_ctx);
 		if (dev->vqs[i].error)
@@ -603,6 +609,11 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 
 	mutex_lock(&vq->mutex);
 
+	if (vq->pend_idx != 0) {
+		vhost_add_used_and_signal_n(d, vq, vq->heads, vq->pend_idx);
+		vq->pend_idx = 0;
+	}
+
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		/* Resizing ring with an active backend?
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b3363ae..44a412d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -108,6 +108,7 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	int pend_idx;
 };
 
 struct vhost_dev {

^ permalink raw reply related

* Re: small RPS cache for fragments?
From: David Miller @ 2011-05-17 20:47 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1305663288.2691.2.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 17 May 2011 22:14:48 +0200

> OK but do we have workloads actually needing this optimization at all ?

Yes, I've seen performance graphs where RPS/RFS falls off the cliff
when datagram sizes go from 1024 to 2048 bytes.

Wrt. defrag queue overhead, it still is minor compared to the cost of
processing 1/2 of all packets on one cpu on a 24 core system.

BTW, if we can steer reliably, we could make per-cpu defrag queue if
you worry about it so much :-)

^ permalink raw reply

* Re: small RPS cache for fragments?
From: Rick Jones @ 2011-05-17 20:41 UTC (permalink / raw)
  To: Tom Herbert; +Cc: David Miller, netdev
In-Reply-To: <1305663434.8149.936.camel@tardy>

On Tue, 2011-05-17 at 13:17 -0700, Rick Jones wrote:
> On Tue, 2011-05-17 at 13:02 -0700, Tom Herbert wrote:
> > I like it!  And this sounds like the sort of algorithm that NICs might
> > be able to implement to solve the UDP/RSS unpleasantness, so even
> > better.
> 
> Do (m)any devices take "shortcuts" with UDP datagrams these days?  By
> that I mean that back in the day, the HP-PB and "Slider" FDDI
> cards/drivers did checksum offload for fragmented UDP datagrams by
> sending the first fragment, the one with the UDP header and thus
> checksum, last.  It did that to save space on the card and make use of
> the checksum accumulator.

Even if no devices (mis)behave like that today, ordering of fragments
sent via a mode-rr bond is far from a sure thing.

rick

> 
> rick jones
> 
> > 
> > Tom
> > 
> > On Tue, May 17, 2011 at 11:33 AM, David Miller <davem@davemloft.net> wrote:
> > >
> > > It seems to me that we can solve the UDP fragmentation problem for
> > > flow steering very simply by creating a (saddr/daddr/IPID) entry in a
> > > table that maps to the corresponding RPS flow entry.
> > >
> > > When we see the initial frag with the UDP header, we create the
> > > saddr/daddr/IPID mapping, and we tear it down when we hit the
> > > saddr/daddr/IPID mapping and the packet has the IP_MF bit clear.
> > >
> > > We only inspect the saddr/daddr/IPID cache when iph->frag_off is
> > > non-zero.
> > >
> > > It's best effort and should work quite well.
> > >
> > > Even a one-behind cache, per-NAPI instance, would do a lot better than
> > > what happens at the moment.  Especially since the IP fragments mostly
> > > arrive as one packet train.



^ permalink raw reply

* Re: small RPS cache for fragments?
From: David Miller @ 2011-05-17 20:49 UTC (permalink / raw)
  To: therbert; +Cc: netdev
In-Reply-To: <BANLkTikyH=q_6uOvFh3_Z_xwPST3zVijZw@mail.gmail.com>

From: Tom Herbert <therbert@google.com>
Date: Tue, 17 May 2011 13:02:25 -0700

> I like it!  And this sounds like the sort of algorithm that NICs might
> be able to implement to solve the UDP/RSS unpleasantness, so even
> better.

Actually, I think it won't work.  Even Linux emits fragments last to
first, so we won't see the UDP header until the last packet where it's
no longer useful.

Back to the drawing board. :-/

^ permalink raw reply

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
From: Shirley Ma @ 2011-05-17 20:50 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel
In-Reply-To: <1305646444.10756.16.camel@localhost.localdomain>

Resubmit the patch with most update. This patch passed some
live-migration test against RHEL6.2. I will run more stress test w/i
live migration.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 drivers/vhost/net.c   |   37 +++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.c |   55 ++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.h |   12 ++++++++++
 3 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2f7c76a..6bd6e28 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,9 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
+/* MAX number of TX used buffers for outstanding zerocopy */
+#define VHOST_MAX_ZEROCOPY_PEND 128 
+
 enum {
 	VHOST_NET_VQ_RX = 0,
 	VHOST_NET_VQ_TX = 1,
@@ -129,6 +132,7 @@ static void handle_tx(struct vhost_net *net)
 	int err, wmem;
 	size_t hdr_size;
 	struct socket *sock;
+	struct skb_ubuf_info pend;
 
 	/* TODO: check that we are running from vhost_worker? */
 	sock = rcu_dereference_check(vq->private_data, 1);
@@ -151,6 +155,10 @@ static void handle_tx(struct vhost_net *net)
 	hdr_size = vq->vhost_hlen;
 
 	for (;;) {
+		/* Release DMAs done buffers first */
+		if (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)
+			vhost_zerocopy_signal_used(vq, false);
+
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
 					 &out, &in,
@@ -166,6 +174,13 @@ static void handle_tx(struct vhost_net *net)
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
 			}
+			/* If more outstanding DMAs, queue the work */
+			if (sock_flag(sock->sk, SOCK_ZEROCOPY) &&
+			    (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)) {
+				tx_poll_start(net, sock);
+				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+				break;
+			}
 			if (unlikely(vhost_enable_notify(vq))) {
 				vhost_disable_notify(vq);
 				continue;
@@ -188,17 +203,35 @@ static void handle_tx(struct vhost_net *net)
 			       iov_length(vq->hdr, s), hdr_size);
 			break;
 		}
+		/* use msg_control to pass vhost zerocopy ubuf info to skb */
+		if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
+			vq->heads[vq->upend_idx].id = head;
+			if (len <= 128)
+				vq->heads[vq->upend_idx].len = VHOST_DMA_DONE_LEN;
+			else {
+				vq->heads[vq->upend_idx].len = len;
+				pend.callback = vhost_zerocopy_callback;
+				pend.arg = vq;
+				pend.desc = vq->upend_idx;
+				msg.msg_control = &pend;
+				msg.msg_controllen = sizeof(pend);
+			}
+			atomic_inc(&vq->refcnt);
+			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq, 1);
+			if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+				vhost_discard_vq_desc(vq, 1);
 			tx_poll_start(net, sock);
 			break;
 		}
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+			vhost_add_used_and_signal(&net->dev, vq, head, 0);
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ab2912..ce799d6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	vq->upend_idx = 0;
+	vq->done_idx = 0;
+	atomic_set(&vq->refcnt, 0);
 }
 
 static int vhost_worker(void *data)
@@ -230,7 +233,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 					       UIO_MAXIOV, GFP_KERNEL);
 		dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
 					  GFP_KERNEL);
-		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
+		dev->vqs[i].heads = kzalloc(sizeof *dev->vqs[i].heads *
 					    UIO_MAXIOV, GFP_KERNEL);
 
 		if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
@@ -385,6 +388,38 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
 	return 0;
 }
 
+/* 
+	comments
+*/
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown)
+{
+	int i, j = 0;
+
+	i = vq->done_idx;
+	while (i != vq->upend_idx) {
+		if ((vq->heads[i].len == VHOST_DMA_DONE_LEN) || shutdown) {
+			/* reset len = 0 */
+			vq->heads[i].len = 0;
+			i = (i + 1) % UIO_MAXIOV;
+			++j;
+		} else
+			break;
+	}
+	if (j) {
+		/* comments */
+		if (i > vq->done_idx)
+			vhost_add_used_n(vq, &vq->heads[vq->done_idx], j);
+		else {
+			vhost_add_used_n(vq, &vq->heads[vq->done_idx],
+					 UIO_MAXIOV - vq->done_idx);
+			vhost_add_used_n(vq, vq->heads, i);
+		}
+		vq->done_idx = i;
+		vhost_signal(vq->dev, vq);
+		atomic_sub(j, &vq->refcnt);
+	}
+}
+
 /* Caller should have device mutex */
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
@@ -395,6 +430,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 			vhost_poll_stop(&dev->vqs[i].poll);
 			vhost_poll_flush(&dev->vqs[i].poll);
 		}
+		/* wait for all lower device DMAs done, then notify guest */
+		if (atomic_read(&dev->vqs[i].refcnt)) {
+			msleep(1000);
+			vhost_zerocopy_signal_used(&dev->vqs[i], true);
+		}
 		if (dev->vqs[i].error_ctx)
 			eventfd_ctx_put(dev->vqs[i].error_ctx);
 		if (dev->vqs[i].error)
@@ -603,6 +643,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 
 	mutex_lock(&vq->mutex);
 
+	/* force all lower device DMAs done */
+	if (atomic_read(&vq->refcnt)) 
+		vhost_zerocopy_signal_used(vq, true);
+
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		/* Resizing ring with an active backend?
@@ -1416,3 +1460,12 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
 		vq_err(vq, "Failed to enable notification at %p: %d\n",
 		       &vq->used->flags, r);
 }
+
+void vhost_zerocopy_callback(struct sk_buff *skb)
+{
+	int idx = skb_shinfo(skb)->ubuf.desc;
+	struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
+
+	/* set len = 1 to mark this desc buffers done DMA */
+	vq->heads[idx].len = VHOST_DMA_DONE_LEN;
+}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b3363ae..8e3ecc7 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -13,6 +13,10 @@
 #include <linux/virtio_ring.h>
 #include <asm/atomic.h>
 
+/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
+ * done */
+#define VHOST_DMA_DONE_LEN	1
+
 struct vhost_device;
 
 struct vhost_work;
@@ -108,6 +112,12 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	/* vhost zerocopy support */
+	atomic_t refcnt; /* num of outstanding zerocopy DMAs */
+	/* copy of avail idx to monitor outstanding DMA zerocopy buffers */
+	int upend_idx;
+	/* copy of used idx to monintor DMA done zerocopy buffers */
+	int done_idx;
 };
 
 struct vhost_dev {
@@ -154,6 +164,8 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
+void vhost_zerocopy_callback(struct sk_buff *skb);
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \

^ permalink raw reply related

* Re: [PATCH] net: ethtool: fix IPV6 checksum feature name string
From: David Miller @ 2011-05-17 20:50 UTC (permalink / raw)
  To: mirq-linux; +Cc: netdev, bhutchings
In-Reply-To: <20110517204015.2F4C013A6A@rere.qmqm.pl>

From: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Date: Tue, 17 May 2011 22:40:15 +0200 (CEST)

> Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>

Applied, thanks.

^ permalink raw reply

* Re: [TEST PATCH net-next] vhost: accumulate multiple used and sigal in vhost TX test
From: Michael S. Tsirkin @ 2011-05-17 20:52 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel
In-Reply-To: <1305665181.10756.29.camel@localhost.localdomain>

On Tue, May 17, 2011 at 01:46:21PM -0700, Shirley Ma wrote:
> Hello Michael,
> 
> Here is the patch I used to test out of order before: add used in a pend
> array, and swap the last two ids. 
> 
> I used to hit an issue, but now it seems working well.

Aha, so if I apply this guest will *not* crash? :)

> This won't impact zero-copy patch since we need to maintain the pend
> used ids anyway.

Yes, but now we can mark them used immediately as they are
completed - and I am guessing this will relieve the
pressure on tx ring that you see?

> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
>  drivers/vhost/net.c   |   24 +++++++++++++++++++++++-
>  drivers/vhost/vhost.c |   11 +++++++++++
>  drivers/vhost/vhost.h |    1 +
>  3 files changed, 35 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 2f7c76a..19e1baa 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -32,6 +32,8 @@
>   * Using this limit prevents one virtqueue from starving others. */
>  #define VHOST_NET_WEIGHT 0x80000
>  
> +#define VHOST_MAX_PEND	128
> +
>  enum {
>  	VHOST_NET_VQ_RX = 0,
>  	VHOST_NET_VQ_TX = 1,
> @@ -198,13 +200,33 @@ static void handle_tx(struct vhost_net *net)
>  		if (err != len)
>  			pr_debug("Truncated TX packet: "
>  				 " len %d != %zd\n", err, len);
> -		vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +		vq->heads[vq->pend_idx].id = head;
> +		vq->heads[vq->pend_idx].len = 0;
> +		++vq->pend_idx;
> +		if (vq->pend_idx >= VHOST_MAX_PEND) {
> +			int id;
> +			id = vq->heads[vq->pend_idx-1].id;
> +			vq->heads[vq->pend_idx-1].id = vq->heads[vq->pend_idx-2].id; 
> +			vq->heads[vq->pend_idx-2].id = id;
> +			vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> +						    vq->pend_idx);
> +			vq->pend_idx = 0;
> +		}
>  		total_len += len;
>  		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  			vhost_poll_queue(&vq->poll);
>  			break;
>  		}
>  	}
> +	if (vq->pend_idx >= VHOST_MAX_PEND) {
> +		int id;
> +		id = vq->heads[vq->pend_idx-1].id;
> +		vq->heads[vq->pend_idx-1].id = vq->heads[vq->pend_idx-2].id;
> +		vq->heads[vq->pend_idx-2].id = id;
> +		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> +					    vq->pend_idx);
> +		vq->pend_idx = 0;
> +	}
>  
>  	mutex_unlock(&vq->mutex);
>  }
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 2ab2912..7eea6b3 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -174,6 +174,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->call_ctx = NULL;
>  	vq->call = NULL;
>  	vq->log_ctx = NULL;
> +	vq->pend_idx = 0;
>  }
>  
>  static int vhost_worker(void *data)
> @@ -395,6 +396,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  			vhost_poll_stop(&dev->vqs[i].poll);
>  			vhost_poll_flush(&dev->vqs[i].poll);
>  		}
> +		if (dev->vqs[i].pend_idx != 0) {
> +			vhost_add_used_and_signal_n(dev, &dev->vqs[i],
> +				dev->vqs[i].heads, dev->vqs[i].pend_idx);
> +			dev->vqs[i].pend_idx = 0;
> +		}
>  		if (dev->vqs[i].error_ctx)
>  			eventfd_ctx_put(dev->vqs[i].error_ctx);
>  		if (dev->vqs[i].error)
> @@ -603,6 +609,11 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
>  
>  	mutex_lock(&vq->mutex);
>  
> +	if (vq->pend_idx != 0) {
> +		vhost_add_used_and_signal_n(d, vq, vq->heads, vq->pend_idx);
> +		vq->pend_idx = 0;
> +	}
> +
>  	switch (ioctl) {
>  	case VHOST_SET_VRING_NUM:
>  		/* Resizing ring with an active backend?
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index b3363ae..44a412d 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -108,6 +108,7 @@ struct vhost_virtqueue {
>  	/* Log write descriptors */
>  	void __user *log_base;
>  	struct vhost_log *log;
> +	int pend_idx;
>  };
>  
>  struct vhost_dev {
> 

^ permalink raw reply

* Re: [PATCH V5 2/6 net-next] netdevice.h: Add zero-copy flag in netdevice
From: Shirley Ma @ 2011-05-17 20:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Ben Hutchings, David Miller, Eric Dumazet, Avi Kivity,
	Arnd Bergmann, netdev, kvm, linux-kernel
In-Reply-To: <20110517062111.GD26989@redhat.com>

On Tue, 2011-05-17 at 09:21 +0300, Michael S. Tsirkin wrote:
> Problem is, in your patch there are a set of restrictions on what the
> device can do with the skb that we need to enforce somehow.
> Also, how do we know it's a 'real NIC' and not a software device?

I checked macvtap newlink, it doesn't seem to block software device.

So we need to work on the wider feature bit first.

Shirley

^ permalink raw reply

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
From: Michael S. Tsirkin @ 2011-05-17 20:58 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel
In-Reply-To: <1305665419.10756.33.camel@localhost.localdomain>

On Tue, May 17, 2011 at 01:50:19PM -0700, Shirley Ma wrote:
> Resubmit the patch with most update. This patch passed some
> live-migration test against RHEL6.2. I will run more stress test w/i
> live migration.

What changed from the last version?

> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
> 
>  drivers/vhost/net.c   |   37 +++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.c |   55 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.h |   12 ++++++++++
>  3 files changed, 101 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 2f7c76a..6bd6e28 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -32,6 +32,9 @@
>   * Using this limit prevents one virtqueue from starving others. */
>  #define VHOST_NET_WEIGHT 0x80000
>  
> +/* MAX number of TX used buffers for outstanding zerocopy */
> +#define VHOST_MAX_ZEROCOPY_PEND 128 
> +
>  enum {
>  	VHOST_NET_VQ_RX = 0,
>  	VHOST_NET_VQ_TX = 1,
> @@ -129,6 +132,7 @@ static void handle_tx(struct vhost_net *net)
>  	int err, wmem;
>  	size_t hdr_size;
>  	struct socket *sock;
> +	struct skb_ubuf_info pend;
>  
>  	/* TODO: check that we are running from vhost_worker? */
>  	sock = rcu_dereference_check(vq->private_data, 1);
> @@ -151,6 +155,10 @@ static void handle_tx(struct vhost_net *net)
>  	hdr_size = vq->vhost_hlen;
>  
>  	for (;;) {
> +		/* Release DMAs done buffers first */
> +		if (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)
> +			vhost_zerocopy_signal_used(vq, false);
> +
>  		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>  					 ARRAY_SIZE(vq->iov),
>  					 &out, &in,
> @@ -166,6 +174,13 @@ static void handle_tx(struct vhost_net *net)
>  				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
>  				break;
>  			}
> +			/* If more outstanding DMAs, queue the work */
> +			if (sock_flag(sock->sk, SOCK_ZEROCOPY) &&
> +			    (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)) {
> +				tx_poll_start(net, sock);
> +				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
> +				break;
> +			}
>  			if (unlikely(vhost_enable_notify(vq))) {
>  				vhost_disable_notify(vq);
>  				continue;
> @@ -188,17 +203,35 @@ static void handle_tx(struct vhost_net *net)
>  			       iov_length(vq->hdr, s), hdr_size);
>  			break;
>  		}
> +		/* use msg_control to pass vhost zerocopy ubuf info to skb */
> +		if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
> +			vq->heads[vq->upend_idx].id = head;
> +			if (len <= 128)
> +				vq->heads[vq->upend_idx].len = VHOST_DMA_DONE_LEN;
> +			else {
> +				vq->heads[vq->upend_idx].len = len;
> +				pend.callback = vhost_zerocopy_callback;
> +				pend.arg = vq;
> +				pend.desc = vq->upend_idx;
> +				msg.msg_control = &pend;
> +				msg.msg_controllen = sizeof(pend);
> +			}
> +			atomic_inc(&vq->refcnt);
> +			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
> +		}
>  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
>  		err = sock->ops->sendmsg(NULL, sock, &msg, len);
>  		if (unlikely(err < 0)) {
> -			vhost_discard_vq_desc(vq, 1);
> +			if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
> +				vhost_discard_vq_desc(vq, 1);
>  			tx_poll_start(net, sock);
>  			break;
>  		}
>  		if (err != len)
>  			pr_debug("Truncated TX packet: "
>  				 " len %d != %zd\n", err, len);
> -		vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +		if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
> +			vhost_add_used_and_signal(&net->dev, vq, head, 0);
>  		total_len += len;
>  		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  			vhost_poll_queue(&vq->poll);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 2ab2912..ce799d6 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->call_ctx = NULL;
>  	vq->call = NULL;
>  	vq->log_ctx = NULL;
> +	vq->upend_idx = 0;
> +	vq->done_idx = 0;
> +	atomic_set(&vq->refcnt, 0);
>  }
>  
>  static int vhost_worker(void *data)
> @@ -230,7 +233,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
>  					       UIO_MAXIOV, GFP_KERNEL);
>  		dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
>  					  GFP_KERNEL);
> -		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
> +		dev->vqs[i].heads = kzalloc(sizeof *dev->vqs[i].heads *
>  					    UIO_MAXIOV, GFP_KERNEL);
>  
>  		if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
> @@ -385,6 +388,38 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
>  	return 0;
>  }
>  
> +/* 
> +	comments
> +*/
> +void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown)
> +{
> +	int i, j = 0;
> +
> +	i = vq->done_idx;
> +	while (i != vq->upend_idx) {
> +		if ((vq->heads[i].len == VHOST_DMA_DONE_LEN) || shutdown) {
> +			/* reset len = 0 */
> +			vq->heads[i].len = 0;
> +			i = (i + 1) % UIO_MAXIOV;
> +			++j;
> +		} else
> +			break;
> +	}
> +	if (j) {
> +		/* comments */
> +		if (i > vq->done_idx)
> +			vhost_add_used_n(vq, &vq->heads[vq->done_idx], j);
> +		else {
> +			vhost_add_used_n(vq, &vq->heads[vq->done_idx],
> +					 UIO_MAXIOV - vq->done_idx);
> +			vhost_add_used_n(vq, vq->heads, i);
> +		}
> +		vq->done_idx = i;
> +		vhost_signal(vq->dev, vq);
> +		atomic_sub(j, &vq->refcnt);
> +	}
> +}
> +
>  /* Caller should have device mutex */
>  void vhost_dev_cleanup(struct vhost_dev *dev)
>  {
> @@ -395,6 +430,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  			vhost_poll_stop(&dev->vqs[i].poll);
>  			vhost_poll_flush(&dev->vqs[i].poll);
>  		}
> +		/* wait for all lower device DMAs done, then notify guest */
> +		if (atomic_read(&dev->vqs[i].refcnt)) {
> +			msleep(1000);
> +			vhost_zerocopy_signal_used(&dev->vqs[i], true);
> +		}
>  		if (dev->vqs[i].error_ctx)
>  			eventfd_ctx_put(dev->vqs[i].error_ctx);
>  		if (dev->vqs[i].error)
> @@ -603,6 +643,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
>  
>  	mutex_lock(&vq->mutex);
>  
> +	/* force all lower device DMAs done */
> +	if (atomic_read(&vq->refcnt)) 
> +		vhost_zerocopy_signal_used(vq, true);
> +
>  	switch (ioctl) {
>  	case VHOST_SET_VRING_NUM:
>  		/* Resizing ring with an active backend?
> @@ -1416,3 +1460,12 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
>  		vq_err(vq, "Failed to enable notification at %p: %d\n",
>  		       &vq->used->flags, r);
>  }
> +
> +void vhost_zerocopy_callback(struct sk_buff *skb)
> +{
> +	int idx = skb_shinfo(skb)->ubuf.desc;
> +	struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
> +
> +	/* set len = 1 to mark this desc buffers done DMA */
> +	vq->heads[idx].len = VHOST_DMA_DONE_LEN;
> +}
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index b3363ae..8e3ecc7 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -13,6 +13,10 @@
>  #include <linux/virtio_ring.h>
>  #include <asm/atomic.h>
>  
> +/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
> + * done */
> +#define VHOST_DMA_DONE_LEN	1
> +
>  struct vhost_device;
>  
>  struct vhost_work;
> @@ -108,6 +112,12 @@ struct vhost_virtqueue {
>  	/* Log write descriptors */
>  	void __user *log_base;
>  	struct vhost_log *log;
> +	/* vhost zerocopy support */
> +	atomic_t refcnt; /* num of outstanding zerocopy DMAs */
> +	/* copy of avail idx to monitor outstanding DMA zerocopy buffers */
> +	int upend_idx;
> +	/* copy of used idx to monintor DMA done zerocopy buffers */
> +	int done_idx;
>  };
>  
>  struct vhost_dev {
> @@ -154,6 +164,8 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
>  
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>  		    unsigned int log_num, u64 len);
> +void vhost_zerocopy_callback(struct sk_buff *skb);
> +void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown);
>  
>  #define vq_err(vq, fmt, ...) do {                                  \
>  		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> 

^ permalink raw reply

* Re: small RPS cache for fragments?
From: Eric Dumazet @ 2011-05-17 21:00 UTC (permalink / raw)
  To: David Miller; +Cc: therbert, netdev
In-Reply-To: <20110517.164929.1737248436066795381.davem@davemloft.net>

Le mardi 17 mai 2011 à 16:49 -0400, David Miller a écrit :
> From: Tom Herbert <therbert@google.com>
> Date: Tue, 17 May 2011 13:02:25 -0700
> 
> > I like it!  And this sounds like the sort of algorithm that NICs might
> > be able to implement to solve the UDP/RSS unpleasantness, so even
> > better.
> 
> Actually, I think it won't work.  Even Linux emits fragments last to
> first, so we won't see the UDP header until the last packet where it's
> no longer useful.
> 
> Back to the drawing board. :-/

Well, we could just use the iph->id in the rxhash computation for frags.

At least all frags of a given datagram should be reassembled on same
cpu, so we get RPS (but not RFS)




^ permalink raw reply

* Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
From: Shirley Ma @ 2011-05-17 21:01 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: David Miller, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev,
	kvm, linux-kernel
In-Reply-To: <20110517205827.GB7589@redhat.com>

On Tue, 2011-05-17 at 23:58 +0300, Michael S. Tsirkin wrote:
> On Tue, May 17, 2011 at 01:50:19PM -0700, Shirley Ma wrote:
> > Resubmit the patch with most update. This patch passed some
> > live-migration test against RHEL6.2. I will run more stress test w/i
> > live migration.
> 
> What changed from the last version?

Sorry, I forgot to mention: add clean up pending before set_vring.

Thanks
Shirley

^ permalink raw reply

* [PATCH net-next-2.6] sfc: Replace enum efx_fc_type with a 'bitwise' type
From: Ben Hutchings @ 2011-05-17 21:06 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110517.152651.1785846817320651943.davem@davemloft.net>

gcc 4.5 doesn't like enumerators as flags, and neither does davem.
Replace the enumerated type with a 'bitwise' type alias which makes it
clear where these flags are being used and allows sparse to validate
their use.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
You wrote:
> What part of "get rid of the enum" is so hard to understand?
>
> I'll say it again: Please get rid of the enum efx_fc_type

OK, done.

> You can define bit positions if you like, because those will
> be used as a proper enum, as unique bit positions within
> a set of bits.  Then you can define macros as (1 << XXX)

But that seems to result in discarding all type information for the
flags. I would prefer to improve type checking.

Ben.

 drivers/net/sfc/efx.c        |    2 +-
 drivers/net/sfc/efx.h        |    2 +-
 drivers/net/sfc/ethtool.c    |    2 +-
 drivers/net/sfc/mdio_10g.c   |   13 ++++++++-----
 drivers/net/sfc/mdio_10g.h   |    2 +-
 drivers/net/sfc/net_driver.h |   15 +++++++--------
 6 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index 05502b3..f840879 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -833,7 +833,7 @@ void efx_link_set_advertising(struct efx_nic *efx, u32 advertising)
 	}
 }
 
-void efx_link_set_wanted_fc(struct efx_nic *efx, enum efx_fc_type wanted_fc)
+void efx_link_set_wanted_fc(struct efx_nic *efx, efx_fc_mode wanted_fc)
 {
 	efx->wanted_fc = wanted_fc;
 	if (efx->link_advertising) {
diff --git a/drivers/net/sfc/efx.h b/drivers/net/sfc/efx.h
index 3d83a1f..242d68b 100644
--- a/drivers/net/sfc/efx.h
+++ b/drivers/net/sfc/efx.h
@@ -142,6 +142,6 @@ static inline void efx_schedule_channel(struct efx_channel *channel)
 
 extern void efx_link_status_changed(struct efx_nic *efx);
 extern void efx_link_set_advertising(struct efx_nic *efx, u32);
-extern void efx_link_set_wanted_fc(struct efx_nic *efx, enum efx_fc_type);
+extern void efx_link_set_wanted_fc(struct efx_nic *efx, efx_fc_mode);
 
 #endif /* EFX_EFX_H */
diff --git a/drivers/net/sfc/ethtool.c b/drivers/net/sfc/ethtool.c
index 348437a..114829d 100644
--- a/drivers/net/sfc/ethtool.c
+++ b/drivers/net/sfc/ethtool.c
@@ -698,7 +698,7 @@ static int efx_ethtool_set_pauseparam(struct net_device *net_dev,
 				      struct ethtool_pauseparam *pause)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
-	enum efx_fc_type wanted_fc, old_fc;
+	efx_fc_mode wanted_fc, old_fc;
 	u32 old_adv;
 	bool reset;
 	int rc = 0;
diff --git a/drivers/net/sfc/mdio_10g.c b/drivers/net/sfc/mdio_10g.c
index 7115914..9be5da6 100644
--- a/drivers/net/sfc/mdio_10g.c
+++ b/drivers/net/sfc/mdio_10g.c
@@ -284,18 +284,21 @@ void efx_mdio_an_reconfigure(struct efx_nic *efx)
 	efx_mdio_write(efx, MDIO_MMD_AN, MDIO_CTRL1, reg);
 }
 
-enum efx_fc_type efx_mdio_get_pause(struct efx_nic *efx)
+efx_fc_mode efx_mdio_get_pause(struct efx_nic *efx)
 {
-	BUILD_BUG_ON(EFX_FC_AUTO & (EFX_FC_RX | EFX_FC_TX));
+	BUILD_BUG_ON((__force u8)EFX_FC_TX != FLOW_CTRL_TX);
+	BUILD_BUG_ON((__force u8)EFX_FC_RX != FLOW_CTRL_RX);
 
 	if (!(efx->wanted_fc & EFX_FC_AUTO))
 		return efx->wanted_fc;
 
 	WARN_ON(!(efx->mdio.mmds & MDIO_DEVS_AN));
 
-	return mii_resolve_flowctrl_fdx(
-		mii_advertise_flowctrl(efx->wanted_fc),
-		efx_mdio_read(efx, MDIO_MMD_AN, MDIO_AN_LPA));
+	return (__force efx_fc_mode)
+		mii_resolve_flowctrl_fdx(
+			mii_advertise_flowctrl((__force int)
+					       (efx->wanted_fc & ~EFX_FC_AUTO)),
+			efx_mdio_read(efx, MDIO_MMD_AN, MDIO_AN_LPA));
 }
 
 int efx_mdio_test_alive(struct efx_nic *efx)
diff --git a/drivers/net/sfc/mdio_10g.h b/drivers/net/sfc/mdio_10g.h
index df07039..1616b13 100644
--- a/drivers/net/sfc/mdio_10g.h
+++ b/drivers/net/sfc/mdio_10g.h
@@ -92,7 +92,7 @@ extern void efx_mdio_an_reconfigure(struct efx_nic *efx);
 /* Get pause parameters from AN if available (otherwise return
  * requested pause parameters)
  */
-enum efx_fc_type efx_mdio_get_pause(struct efx_nic *efx);
+extern efx_fc_mode efx_mdio_get_pause(struct efx_nic *efx);
 
 /* Wait for specified MMDs to exit reset within a timeout */
 extern int efx_mdio_wait_reset_mmds(struct efx_nic *efx,
diff --git a/drivers/net/sfc/net_driver.h b/drivers/net/sfc/net_driver.h
index ce9697b..1c29775 100644
--- a/drivers/net/sfc/net_driver.h
+++ b/drivers/net/sfc/net_driver.h
@@ -448,12 +448,11 @@ enum nic_state {
 /* Forward declaration */
 struct efx_nic;
 
-/* Pseudo bit-mask flow control field */
-enum efx_fc_type {
-	EFX_FC_RX = FLOW_CTRL_RX,
-	EFX_FC_TX = FLOW_CTRL_TX,
-	EFX_FC_AUTO = 4,
-};
+/* Flow control flags */
+typedef unsigned int __bitwise efx_fc_mode;
+#define EFX_FC_TX	((__force efx_fc_mode) 1)
+#define EFX_FC_RX	((__force efx_fc_mode) 2)
+#define EFX_FC_AUTO	((__force efx_fc_mode) 4)
 
 /**
  * struct efx_link_state - Current state of the link
@@ -465,7 +464,7 @@ enum efx_fc_type {
 struct efx_link_state {
 	bool up;
 	bool fd;
-	enum efx_fc_type fc;
+	efx_fc_mode fc;
 	unsigned int speed;
 };
 
@@ -784,7 +783,7 @@ struct efx_nic {
 
 	bool promiscuous;
 	union efx_multicast_hash multicast_hash;
-	enum efx_fc_type wanted_fc;
+	efx_fc_mode wanted_fc;
 
 	atomic_t rx_reset;
 	enum efx_loopback_mode loopback_mode;
-- 
1.7.4


-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* Re: small RPS cache for fragments?
From: David Miller @ 2011-05-17 21:10 UTC (permalink / raw)
  To: eric.dumazet; +Cc: therbert, netdev
In-Reply-To: <1305666050.2691.4.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 17 May 2011 23:00:50 +0200

> Le mardi 17 mai 2011 à 16:49 -0400, David Miller a écrit :
>> From: Tom Herbert <therbert@google.com>
>> Date: Tue, 17 May 2011 13:02:25 -0700
>> 
>> > I like it!  And this sounds like the sort of algorithm that NICs might
>> > be able to implement to solve the UDP/RSS unpleasantness, so even
>> > better.
>> 
>> Actually, I think it won't work.  Even Linux emits fragments last to
>> first, so we won't see the UDP header until the last packet where it's
>> no longer useful.
>> 
>> Back to the drawing board. :-/
> 
> Well, we could just use the iph->id in the rxhash computation for frags.
> 
> At least all frags of a given datagram should be reassembled on same
> cpu, so we get RPS (but not RFS)

That's true, but one could also argue that in the existing code at least
one of the packets (the one with the UDP header) would make it to the
proper flow cpu.

That could be as much as half of the packets.

So I don't yet see it as a clear win.

^ permalink raw reply

* Re: small RPS cache for fragments?
From: Rick Jones @ 2011-05-17 21:11 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, therbert, netdev
In-Reply-To: <1305666050.2691.4.camel@edumazet-laptop>

On Tue, 2011-05-17 at 23:00 +0200, Eric Dumazet wrote:
> Le mardi 17 mai 2011 à 16:49 -0400, David Miller a écrit :
> > From: Tom Herbert <therbert@google.com>
> > Date: Tue, 17 May 2011 13:02:25 -0700
> > 
> > > I like it!  And this sounds like the sort of algorithm that NICs might
> > > be able to implement to solve the UDP/RSS unpleasantness, so even
> > > better.
> > 
> > Actually, I think it won't work.  Even Linux emits fragments last to
> > first, so we won't see the UDP header until the last packet where it's
> > no longer useful.
> > 
> > Back to the drawing board. :-/
> 
> Well, we could just use the iph->id in the rxhash computation for frags.
> 
> At least all frags of a given datagram should be reassembled on same
> cpu, so we get RPS (but not RFS)

Won't that just scatter the fragments of a given flow across processors?
Instead of then going back and forth between two caches - where
reassembly happens and then where the app is running, it will go back
and forth between the app's cache and pretty much nearly every other
cache in the system (or at least configured to take RPS traffic).

rick


^ permalink raw reply

* Re: small RPS cache for fragments?
From: Ben Hutchings @ 2011-05-17 21:11 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, therbert, netdev
In-Reply-To: <1305666050.2691.4.camel@edumazet-laptop>

On Tue, 2011-05-17 at 23:00 +0200, Eric Dumazet wrote:
> Le mardi 17 mai 2011 à 16:49 -0400, David Miller a écrit :
> > From: Tom Herbert <therbert@google.com>
> > Date: Tue, 17 May 2011 13:02:25 -0700
> > 
> > > I like it!  And this sounds like the sort of algorithm that NICs might
> > > be able to implement to solve the UDP/RSS unpleasantness, so even
> > > better.
> > 
> > Actually, I think it won't work.  Even Linux emits fragments last to
> > first, so we won't see the UDP header until the last packet where it's
> > no longer useful.
> > 
> > Back to the drawing board. :-/
> 
> Well, we could just use the iph->id in the rxhash computation for frags.

But then each datagram lands on a different CPU, and reordering is
liable to happen far more often than it does now.

> At least all frags of a given datagram should be reassembled on same
> cpu, so we get RPS (but not RFS)

You could still do RPS with just IP addresses (same as RSS using
Toeplitz hashes).

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: small RPS cache for fragments?
From: Rick Jones @ 2011-05-17 21:13 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, therbert, netdev
In-Reply-To: <20110517.171000.1166144155994185790.davem@davemloft.net>

On Tue, 2011-05-17 at 17:10 -0400, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 17 May 2011 23:00:50 +0200
> 
> > Le mardi 17 mai 2011 à 16:49 -0400, David Miller a écrit :
> >> From: Tom Herbert <therbert@google.com>
> >> Date: Tue, 17 May 2011 13:02:25 -0700
> >> 
> >> > I like it!  And this sounds like the sort of algorithm that NICs might
> >> > be able to implement to solve the UDP/RSS unpleasantness, so even
> >> > better.
> >> 
> >> Actually, I think it won't work.  Even Linux emits fragments last to
> >> first, so we won't see the UDP header until the last packet where it's
> >> no longer useful.
> >> 
> >> Back to the drawing board. :-/
> > 
> > Well, we could just use the iph->id in the rxhash computation for frags.
> > 
> > At least all frags of a given datagram should be reassembled on same
> > cpu, so we get RPS (but not RFS)
> 
> That's true, but one could also argue that in the existing code at least
> one of the packets (the one with the UDP header) would make it to the
> proper flow cpu.
> 
> That could be as much as half of the packets.
> 
> So I don't yet see it as a clear win.

How heinous would it be to do post-reassembly RFS?

rick 


^ permalink raw reply

* Re: small RPS cache for fragments?
From: Ben Hutchings @ 2011-05-17 21:13 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, therbert, netdev
In-Reply-To: <20110517.171000.1166144155994185790.davem@davemloft.net>

On Tue, 2011-05-17 at 17:10 -0400, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 17 May 2011 23:00:50 +0200
> 
> > Le mardi 17 mai 2011 à 16:49 -0400, David Miller a écrit :
> >> From: Tom Herbert <therbert@google.com>
> >> Date: Tue, 17 May 2011 13:02:25 -0700
> >> 
> >> > I like it!  And this sounds like the sort of algorithm that NICs might
> >> > be able to implement to solve the UDP/RSS unpleasantness, so even
> >> > better.
> >> 
> >> Actually, I think it won't work.  Even Linux emits fragments last to
> >> first, so we won't see the UDP header until the last packet where it's
> >> no longer useful.
> >> 
> >> Back to the drawing board. :-/
> > 
> > Well, we could just use the iph->id in the rxhash computation for frags.
> > 
> > At least all frags of a given datagram should be reassembled on same
> > cpu, so we get RPS (but not RFS)
> 
> That's true, but one could also argue that in the existing code at least
> one of the packets (the one with the UDP header) would make it to the
> proper flow cpu.

No, we ignore the layer-4 header when either MF or OFFSET is non-zero.

Ben.

> That could be as much as half of the packets.
> 
> So I don't yet see it as a clear win.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* RFC: Add WARN_RATELIMIT to bug.h (was: Re: [PATCH] sk-filter: Rate-limit WARNing, print dbg info.)
From: Joe Perches @ 2011-05-17 21:13 UTC (permalink / raw)
  To: Ben Greear, Arnd Bergmann; +Cc: netdev, LKML, linux-arch
In-Reply-To: <4DD2DB9D.6050306@candelatech.com>

On Tue, 2011-05-17 at 13:33 -0700, Ben Greear wrote:
> On 05/17/2011 12:53 PM, Joe Perches wrote:
> > Another option is to add a WARN_RATELIMIT
> > (there is already a WARN_ON_RATELIMIT) to avoid
> > missing other messages.
> > Something like:
> >   include/asm-generic/bug.h |   16 ++++++++++++++++
> >   net/core/filter.c         |    4 +++-
> >   2 files changed, 19 insertions(+), 1 deletions(-)
> > diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
> > index e5a3f58..12b250c 100644
> > --- a/include/asm-generic/bug.h
> > +++ b/include/asm-generic/bug.h
> > @@ -165,6 +165,22 @@ extern void warn_slowpath_null(const char *file, const int line);
> >   #define WARN_ON_RATELIMIT(condition, state)			\
> >   		WARN_ON((condition)&&  __ratelimit(state))
> >
> > +#define __WARN_RATELIMIT(condition, state, format...)		\
> > +({								\
> > +	int rtn = 0;						\
> > +	if (unlikely(__ratelimit(state)))			\
> > +		rtn = WARN(condition, format);			\
> > +	rtn;							\
> > +})
> > +
> > +#define WARN_RATELIMIT(condition, format...)			\
> > +({								\
> > +	static DEFINE_RATELIMIT_STATE(_rs,			\
> > +				      DEFAULT_RATELIMIT_INTERVAL,	\
> > +				      DEFAULT_RATELIMIT_BURST);	\
> > +	__WARN_RATELIMIT(condition,&_rs, format);		\
> > +})
> > +
> >   /*
> >    * WARN_ON_SMP() is for cases that the warning is either
> >    * meaningless for !SMP or may even cause failures.
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 0eb8c44..0e3622f 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -350,7 +350,9 @@ load_b:
> >   			continue;
> >   		}
> >   		default:
> > -			WARN_ON(1);
> > +			WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
> > +				       fentry->code, fentry->jt,
> > +				       fentry->jf, fentry->k);
> >   			return 0;
> >   		}
> >   	}
> >

> Sounds fine to me.  You want to just send an official
> patch with all this?

I added a some cc's for wider exposure.
original post: http://www.spinics.net/lists/netdev/msg164521.html

Let's wait to see if David has anything to say.

The biggest negative I see is adding RATELIMIT_STATE
to asm-generic/bug.h, though it already has a use of
__ratelimit in WARN_ON_RATELIMIT there.

WARN_ON_RATELIMIT is unused today.

The only user seems to have been RCU_PREEMPT
which was deleted in:

commit 6b3ef48adf847f7adf11c870e3ffacac150f1564
Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Date:   Sat Aug 22 13:56:53 2009 -0700

    rcu: Remove CONFIG_PREEMPT_RCU

Maybe the old definition should be removed instead.

If there are no comments after a day or two, I'll
sign and send this as 2 patches with the filter
one marked as:

Original-patch-by: Ben Greear <greearb@candelatech.com>

cheers, Joe


^ permalink raw reply

* Re: [PATCH net-next-2.6] sfc: Replace enum efx_fc_type with a 'bitwise' type
From: David Miller @ 2011-05-17 21:14 UTC (permalink / raw)
  To: bhutchings; +Cc: netdev
In-Reply-To: <1305666379.2848.45.camel@bwh-desktop>

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 17 May 2011 22:06:19 +0100

> But that seems to result in discarding all type information for the
> flags. I would prefer to improve type checking.

That's why I don't want a solution like your cast patch, as that
takes the typing information away.

Accept that the compiler currently doesn't want to allow enums to be
used as bit-masks, don't paper around it.

I'm not applying this patch either, you think all of this "__force"
casting stuff is better?  No way.


^ permalink raw reply

* Re: [PATCH net-next-2.6] sfc: Replace enum efx_fc_type with a 'bitwise' type
From: Joe Perches @ 2011-05-17 21:22 UTC (permalink / raw)
  To: David Miller; +Cc: bhutchings, netdev
In-Reply-To: <20110517.171412.1017451005914294196.davem@davemloft.net>

On Tue, 2011-05-17 at 17:14 -0400, David Miller wrote:
> Accept that the compiler currently doesn't want to allow enums to be
> used as bit-masks, don't paper around it.

A patch applied yesterday using enums as bitmasks:
http://patchwork.ozlabs.org/patch/95802/



^ permalink raw reply

* Re: small RPS cache for fragments?
From: David Miller @ 2011-05-17 21:26 UTC (permalink / raw)
  To: bhutchings; +Cc: eric.dumazet, therbert, netdev
In-Reply-To: <1305666822.2848.51.camel@bwh-desktop>

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 17 May 2011 22:13:42 +0100

> On Tue, 2011-05-17 at 17:10 -0400, David Miller wrote:
>> That's true, but one could also argue that in the existing code at least
>> one of the packets (the one with the UDP header) would make it to the
>> proper flow cpu.
> 
> No, we ignore the layer-4 header when either MF or OFFSET is non-zero.

That's right and I now remember we had quite a discussion about this
in the past.

So IP/saddr/daddr keying is out of the question due to reordering
concerns.

The idea to do RFS post fragmentation is interesting, it's sort of
another form of GRO.  We would need to re-fragment (like GRO does)
in the forwarding case.

But it would be nice since it would reduce the number of calls into
the stack (and thus route lookups, etc.) per fragmented frame.

There is of course the issue of fragmentation queue timeouts, and
what semantics of that means when we are not the final destination
and those fragments would have been forwarded rather than consumed
by us.


^ permalink raw reply

* Re: small RPS cache for fragments?
From: Tom Herbert @ 2011-05-17 21:27 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110517.164929.1737248436066795381.davem@davemloft.net>

> Actually, I think it won't work.  Even Linux emits fragments last to
> first, so we won't see the UDP header until the last packet where it's
> no longer useful.
>
I remember observing this a while back, what's the rationale for it?

> Back to the drawing board. :-/
>

^ permalink raw reply

* Re: small RPS cache for fragments?
From: Eric Dumazet @ 2011-05-17 21:27 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: David Miller, therbert, netdev
In-Reply-To: <1305666822.2848.51.camel@bwh-desktop>

Le mardi 17 mai 2011 à 22:13 +0100, Ben Hutchings a écrit :
> On Tue, 2011-05-17 at 17:10 -0400, David Miller wrote:
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Tue, 17 May 2011 23:00:50 +0200
> > 
> > > Le mardi 17 mai 2011 à 16:49 -0400, David Miller a écrit :
> > >> From: Tom Herbert <therbert@google.com>
> > >> Date: Tue, 17 May 2011 13:02:25 -0700
> > >> 
> > >> > I like it!  And this sounds like the sort of algorithm that NICs might
> > >> > be able to implement to solve the UDP/RSS unpleasantness, so even
> > >> > better.
> > >> 
> > >> Actually, I think it won't work.  Even Linux emits fragments last to
> > >> first, so we won't see the UDP header until the last packet where it's
> > >> no longer useful.
> > >> 
> > >> Back to the drawing board. :-/
> > > 
> > > Well, we could just use the iph->id in the rxhash computation for frags.
> > > 
> > > At least all frags of a given datagram should be reassembled on same
> > > cpu, so we get RPS (but not RFS)
> > 
> > That's true, but one could also argue that in the existing code at least
> > one of the packets (the one with the UDP header) would make it to the
> > proper flow cpu.
> 
> No, we ignore the layer-4 header when either MF or OFFSET is non-zero.

Exactly

As is, RPS (based on our software rxhash computation) should be working
fine with frags, unless we receive different flows with same
(src_addr,dst_addr) pair.

This is why I asked David if real workloads could hit one cpu instead of
many ones.




^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox