Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH 1/2] r8169: failure to enable mwi should not be fatal
From: David Miller @ 2010-04-26 22:37 UTC (permalink / raw)
  To: romieu; +Cc: netdev, conikost, dave
In-Reply-To: <20100426214206.GA7624@electric-eye.fr.zoreil.com>

From: François Romieu <romieu@fr.zoreil.com>
Date: Mon, 26 Apr 2010 23:42:06 +0200

> Few (6) network drivers enable mwi explicitly. Fewer worry about a
> failure.
> 
> It is not a fix but it should avoid some annoyance like
> http://bugzilla.kernel.org/show_bug.cgi?id=15454
> 
> Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>
> Cc: Conrad Kostecki <conikost@gmx.de>

Applied.

^ permalink raw reply

* Re: [PATCH 1/4] e1000e: save skb counts in TX to avoid cache misses
From: Jeff Kirsher @ 2010-04-26 22:01 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev
In-Reply-To: <alpine.DEB.1.00.1004231719050.16806@pokey.mtv.corp.google.com>

On Fri, Apr 23, 2010 at 17:25, Tom Herbert <therbert@google.com> wrote:
> In e1000_tx_map, precompute number of segements and bytecounts which
> are derived from fields in skb; these are stored in buffer_info.  When
> cleaning tx in e1000_clean_tx_irq use the values in the associated
> buffer_info for statistics counting, this eliminates cache misses
> on skb fields.
>
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
> diff --git a/drivers/net/e1000e/e1000.h b/drivers/net/e1000e/e1000.h
> index 12648a1..d6da75b 100644
> --- a/drivers/net/e1000e/e1000.h
> +++ b/drivers/net/e1000e/e1000.h
> @@ -188,6 +188,8 @@ struct e1000_buffer {
>                        unsigned long time_stamp;
>                        u16 length;
>                        u16 next_to_watch;
> +                       unsigned int segs;
> +                       unsigned int bytecount;
>                        u16 mapped_as_page;
>                };
>                /* Rx */
> diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
> index 5f70c43..4f5034a 100644
> --- a/drivers/net/e1000e/netdev.c
> +++ b/drivers/net/e1000e/netdev.c
> @@ -646,14 +635,8 @@ static bool e1000_clean_tx_irq(struct e1000_adapter *adapter)
>                        cleaned = (i == eop);
>
>                        if (cleaned) {
> -                               struct sk_buff *skb = buffer_info->skb;
> -                               unsigned int segs, bytecount;
> -                               segs = skb_shinfo(skb)->gso_segs ?: 1;
> -                               /* multiply data chunks by size of headers */
> -                               bytecount = ((segs - 1) * skb_headlen(skb)) +
> -                                           skb->len;
> -                               total_tx_packets += segs;
> -                               total_tx_bytes += bytecount;
> +                               total_tx_packets += buffer_info->segs;
> +                               total_tx_bytes += buffer_info->bytecount;
>                        }
>
>                        e1000_put_txbuf(adapter, buffer_info);
> @@ -3906,7 +3889,7 @@ static int e1000_tx_map(struct e1000_adapter *adapter,
>        struct e1000_buffer *buffer_info;
>        unsigned int len = skb_headlen(skb);
>        unsigned int offset = 0, size, count = 0, i;
> -       unsigned int f;
> +       unsigned int f, bytecount, segs;
>
>        i = tx_ring->next_to_use;
>
> @@ -3965,7 +3948,13 @@ static int e1000_tx_map(struct e1000_adapter *adapter,
>                }
>        }
>
> +       segs = skb_shinfo(skb)->gso_segs ?: 1;
> +       /* multiply data chunks by size of headers */
> +       bytecount = ((segs - 1) * skb_headlen(skb)) + skb->len;
> +
>        tx_ring->buffer_info[i].skb = skb;
> +       tx_ring->buffer_info[i].segs = segs;
> +       tx_ring->buffer_info[i].bytecount = bytecount;
>        tx_ring->buffer_info[first].next_to_watch = i;
>
>        return count;
> --
>

I have added your patches to my queue of e1000e patches, thanks.

-- 
Cheers,
Jeff

^ permalink raw reply

* [PATCH 1/2] r8169: failure to enable mwi should not be fatal
From: François Romieu @ 2010-04-26 21:42 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Conrad Kostecki, David Dillow

Few (6) network drivers enable mwi explicitly. Fewer worry about a
failure.

It is not a fix but it should avoid some annoyance like
http://bugzilla.kernel.org/show_bug.cgi?id=15454

Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>
Cc: Conrad Kostecki <conikost@gmx.de>
---
 drivers/net/r8169.c |   27 +++++++++++++--------------
 1 files changed, 13 insertions(+), 14 deletions(-)


diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index dbb1f5a..2b54389 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -2759,6 +2759,7 @@ static void rtl8169_release_board(struct pci_dev *pdev, struct net_device *dev,
 {
 	iounmap(ioaddr);
 	pci_release_regions(pdev);
+	pci_clear_mwi(pdev);
 	pci_disable_device(pdev);
 	free_netdev(dev);
 }
@@ -3014,9 +3015,8 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_out_free_dev_1;
 	}
 
-	rc = pci_set_mwi(pdev);
-	if (rc < 0)
-		goto err_out_disable_2;
+	if (pci_set_mwi(pdev) < 0)
+		netif_info(tp, probe, dev, "Mem-Wr-Inval unavailable\n");
 
 	/* make sure PCI base addr 1 is MMIO */
 	if (!(pci_resource_flags(pdev, region) & IORESOURCE_MEM)) {
@@ -3024,7 +3024,7 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 			  "region #%d not an MMIO resource, aborting\n",
 			  region);
 		rc = -ENODEV;
-		goto err_out_mwi_3;
+		goto err_out_mwi_2;
 	}
 
 	/* check for weird/broken PCI region reporting */
@@ -3032,13 +3032,13 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		netif_err(tp, probe, dev,
 			  "Invalid PCI region size(s), aborting\n");
 		rc = -ENODEV;
-		goto err_out_mwi_3;
+		goto err_out_mwi_2;
 	}
 
 	rc = pci_request_regions(pdev, MODULENAME);
 	if (rc < 0) {
 		netif_err(tp, probe, dev, "could not request regions\n");
-		goto err_out_mwi_3;
+		goto err_out_mwi_2;
 	}
 
 	tp->cp_cmd = PCIMulRW | RxChkSum;
@@ -3051,7 +3051,7 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
 		if (rc < 0) {
 			netif_err(tp, probe, dev, "DMA configuration failed\n");
-			goto err_out_free_res_4;
+			goto err_out_free_res_3;
 		}
 	}
 
@@ -3060,7 +3060,7 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (!ioaddr) {
 		netif_err(tp, probe, dev, "cannot remap MMIO, aborting\n");
 		rc = -EIO;
-		goto err_out_free_res_4;
+		goto err_out_free_res_3;
 	}
 
 	tp->pcie_cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
@@ -3102,7 +3102,7 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (i == ARRAY_SIZE(rtl_chip_info)) {
 		dev_err(&pdev->dev,
 			"driver bug, MAC version not found in rtl_chip_info\n");
-		goto err_out_msi_5;
+		goto err_out_msi_4;
 	}
 	tp->chipset = i;
 
@@ -3167,7 +3167,7 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	rc = register_netdev(dev);
 	if (rc < 0)
-		goto err_out_msi_5;
+		goto err_out_msi_4;
 
 	pci_set_drvdata(pdev, dev);
 
@@ -3190,14 +3190,13 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 out:
 	return rc;
 
-err_out_msi_5:
+err_out_msi_4:
 	rtl_disable_msi(pdev, tp);
 	iounmap(ioaddr);
-err_out_free_res_4:
+err_out_free_res_3:
 	pci_release_regions(pdev);
-err_out_mwi_3:
+err_out_mwi_2:
 	pci_clear_mwi(pdev);
-err_out_disable_2:
 	pci_disable_device(pdev);
 err_out_free_dev_1:
 	free_netdev(dev);
-- 
1.6.6.1


^ permalink raw reply related

* [PATCH 2/2] r8169: more broken register writes workaround
From: François Romieu @ 2010-04-26 21:42 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Al Viro

78f1cd02457252e1ffbc6caa44a17424a45286b8 ("fix broken register writes")
does not work for Al Viro's r8169 (XID 18000000).

Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>
---
 drivers/net/r8169.c |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 2b54389..4748c21 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -2826,8 +2826,13 @@ static void rtl_rar_set(struct rtl8169_private *tp, u8 *addr)
 	spin_lock_irq(&tp->lock);
 
 	RTL_W8(Cfg9346, Cfg9346_Unlock);
+
 	RTL_W32(MAC4, high);
+	RTL_R32(MAC4);
+
 	RTL_W32(MAC0, low);
+	RTL_R32(MAC0);
+
 	RTL_W8(Cfg9346, Cfg9346_Lock);
 
 	spin_unlock_irq(&tp->lock);
-- 
1.6.6.1


^ permalink raw reply related

* Re: [PATCH v6] Add mergeable rx buffer support to vhost_net
From: Michael S. Tsirkin @ 2010-04-26 21:42 UTC (permalink / raw)
  To: David L Stevens; +Cc: netdev, virtualization, kvm, rusty
In-Reply-To: <1272316852.18945.4.camel@w-dls.beaverton.ibm.com>

On Mon, Apr 26, 2010 at 02:20:52PM -0700, David L Stevens wrote:
> This patch adds mergeable receive buffer support to vhost_net.
> 
> 					+-DLS
> 
> Signed-off-by: David L Stevens <dlstevens@us.ibm.com>

OK, looks good. I still think iovec handling is a bit off,
as commented below.

> diff -ruNp net-next-v0/drivers/vhost/net.c net-next-v6/drivers/vhost/net.c
> --- net-next-v0/drivers/vhost/net.c	2010-04-24 21:36:54.000000000 -0700
> +++ net-next-v6/drivers/vhost/net.c	2010-04-26 01:13:04.000000000 -0700
> @@ -109,7 +109,7 @@ static void handle_tx(struct vhost_net *
>  	};
>  	size_t len, total_len = 0;
>  	int err, wmem;
> -	size_t hdr_size;
> +	size_t vhost_hlen;
>  	struct socket *sock = rcu_dereference(vq->private_data);
>  	if (!sock)
>  		return;
> @@ -128,13 +128,13 @@ static void handle_tx(struct vhost_net *
>  
>  	if (wmem < sock->sk->sk_sndbuf / 2)
>  		tx_poll_stop(net);
> -	hdr_size = vq->hdr_size;
> +	vhost_hlen = vq->vhost_hlen;
>  
>  	for (;;) {
> -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> -					 ARRAY_SIZE(vq->iov),
> -					 &out, &in,
> -					 NULL, NULL);
> +		head = vhost_get_desc(&net->dev, vq, vq->iov,
> +				      ARRAY_SIZE(vq->iov),
> +				      &out, &in,
> +				      NULL, NULL);
>  		/* Nothing new?  Wait for eventfd to tell us they refilled. */
>  		if (head == vq->num) {
>  			wmem = atomic_read(&sock->sk->sk_wmem_alloc);
> @@ -155,20 +155,20 @@ static void handle_tx(struct vhost_net *
>  			break;
>  		}
>  		/* Skip header. TODO: support TSO. */
> -		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
> +		s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, out);
>  		msg.msg_iovlen = out;
>  		len = iov_length(vq->iov, out);
>  		/* Sanity check */
>  		if (!len) {
>  			vq_err(vq, "Unexpected header len for TX: "
>  			       "%zd expected %zd\n",
> -			       iov_length(vq->hdr, s), hdr_size);
> +			       iov_length(vq->hdr, s), vhost_hlen);
>  			break;
>  		}
>  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
>  		err = sock->ops->sendmsg(NULL, sock, &msg, len);
>  		if (unlikely(err < 0)) {
> -			vhost_discard_vq_desc(vq);
> +			vhost_discard_desc(vq, 1);
>  			tx_poll_start(net, sock);
>  			break;
>  		}
> @@ -187,12 +187,25 @@ static void handle_tx(struct vhost_net *
>  	unuse_mm(net->dev.mm);
>  }
>  
> +static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk)
> +{
> +	struct sk_buff *head;
> +	int len = 0;
> +
> +	lock_sock(sk);
> +	head = skb_peek(&sk->sk_receive_queue);
> +	if (head)
> +		len = head->len + vq->sock_hlen;
> +	release_sock(sk);
> +	return len;
> +}
> +
>  /* Expects to be always run from workqueue - which acts as
>   * read-size critical section for our kind of RCU. */
>  static void handle_rx(struct vhost_net *net)
>  {
>  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> -	unsigned head, out, in, log, s;
> +	unsigned in, log, s;
>  	struct vhost_log *vq_log;
>  	struct msghdr msg = {
>  		.msg_name = NULL,
> @@ -203,14 +216,14 @@ static void handle_rx(struct vhost_net *
>  		.msg_flags = MSG_DONTWAIT,
>  	};
>  
> -	struct virtio_net_hdr hdr = {
> -		.flags = 0,
> -		.gso_type = VIRTIO_NET_HDR_GSO_NONE
> +	struct virtio_net_hdr_mrg_rxbuf hdr = {
> +		.hdr.flags = 0,
> +		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
>  	};
>  
>  	size_t len, total_len = 0;
> -	int err;
> -	size_t hdr_size;
> +	int err, headcount, datalen;
> +	size_t vhost_hlen;
>  	struct socket *sock = rcu_dereference(vq->private_data);
>  	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
>  		return;
> @@ -218,18 +231,19 @@ static void handle_rx(struct vhost_net *
>  	use_mm(net->dev.mm);
>  	mutex_lock(&vq->mutex);
>  	vhost_disable_notify(vq);
> -	hdr_size = vq->hdr_size;
> +	vhost_hlen = vq->vhost_hlen;
>  
>  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
>  		vq->log : NULL;
>  
> -	for (;;) {
> -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> -					 ARRAY_SIZE(vq->iov),
> -					 &out, &in,
> -					 vq_log, &log);
> +	while ((datalen = vhost_head_len(vq, sock->sk))) {
> +		headcount = vhost_get_desc_n(vq, vq->heads,
> +					     datalen + vhost_hlen,
> +					     &in, vq_log, &log);
> +		if (headcount < 0)
> +			break;
>  		/* OK, now we need to know about added descriptors. */
> -		if (head == vq->num) {
> +		if (!headcount) {
>  			if (unlikely(vhost_enable_notify(vq))) {
>  				/* They have slipped one in as we were
>  				 * doing that: check again. */
> @@ -241,46 +255,53 @@ static void handle_rx(struct vhost_net *
>  			break;
>  		}
>  		/* We don't need to be notified again. */
> -		if (out) {
> -			vq_err(vq, "Unexpected descriptor format for RX: "
> -			       "out %d, int %d\n",
> -			       out, in);
> -			break;
> -		}
> -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
> -		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> +		/* Skip header. TODO: support TSO. */
> +		s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
>  		msg.msg_iovlen = in;
>  		len = iov_length(vq->iov, in);
>  		/* Sanity check */
>  		if (!len) {
>  			vq_err(vq, "Unexpected header len for RX: "
>  			       "%zd expected %zd\n",
> -			       iov_length(vq->hdr, s), hdr_size);
> +			       iov_length(vq->hdr, s), vhost_hlen);
>  			break;
>  		}
>  		err = sock->ops->recvmsg(NULL, sock, &msg,
>  					 len, MSG_DONTWAIT | MSG_TRUNC);
>  		/* TODO: Check specific error and bomb out unless EAGAIN? */
>  		if (err < 0) {
> -			vhost_discard_vq_desc(vq);
> +			vhost_discard_desc(vq, headcount);
>  			break;
>  		}
> -		/* TODO: Should check and handle checksum. */
> -		if (err > len) {
> -			pr_err("Discarded truncated rx packet: "
> -			       " len %d > %zd\n", err, len);
> -			vhost_discard_vq_desc(vq);
> +		if (err != datalen) {
> +			pr_err("Discarded rx packet: "
> +			       " len %d, expected %zd\n", err, datalen);
> +			vhost_discard_desc(vq, headcount);
>  			continue;
>  		}
>  		len = err;
> -		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
> +		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr,
> +				     vhost_hlen);
>  		if (err) {
>  			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
>  			       vq->iov->iov_base, err);
>  			break;
>  		}
> -		len += hdr_size;
> -		vhost_add_used_and_signal(&net->dev, vq, head, len);
> +		/* TODO: Should check and handle checksum. */
> +		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
> +			struct iovec *iov = vhost_hlen ? vq->hdr : vq->iov;
> +

Hmm, this looks a bit wrong:
	- memcpy_toiovec above can modify vq->hdr
	- recvmsg can modify vq->iov
it works for you because in practice the address isn't modified
(only length) and in practice the header fits in the first iovec
entry.

It is probably easiest to add another copy of the header,
call it, say, hdr_copy, and a variant of move_iovec_hdr
called, say, copy_iovec_hdr, which does not modify the
origin, just copies part of iovec, call copy_iovec_hdr before
move_iovec_hdr.
Oh and we'll need vq->hdr_size which is max of vq->vhost_hlen
and vq->sock_hlen to know how much to copy.

In practice we'll always copy 1 entry, so no performance issue.

> +			if (memcpy_toiovecend(iov, (unsigned char *)&headcount,
> +				      offsetof(typeof(hdr), num_buffers),
> +				      sizeof(hdr.num_buffers))) {
> +				vq_err(vq, "Failed num_buffers write");
> +				vhost_discard_desc(vq, headcount);
> +				break;
> +			}
> +		}
> +		len += vhost_hlen;
> +		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> +					    headcount);
>  		if (unlikely(vq_log))
>  			vhost_log_write(vq, vq_log, log, len);
>  		total_len += len;
> @@ -561,9 +582,24 @@ done:
>  
>  static int vhost_net_set_features(struct vhost_net *n, u64 features)
>  {
> -	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
> -		sizeof(struct virtio_net_hdr) : 0;
> +	size_t vhost_hlen;
> +	size_t sock_hlen;
>  	int i;
> +
> +	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
> +		/* vhost provides vnet_hdr */
> +		vhost_hlen = sizeof(struct virtio_net_hdr);
> +		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> +			vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +		sock_hlen = 0;
> +	} else {
> +		/* socket provides vnet_hdr */
> +		vhost_hlen = 0;
> +		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> +			sock_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +		else
> +			sock_hlen = sizeof(struct virtio_net_hdr);
> +	}

logic is the same above, right?
So

	size_t hdr_len;
	if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
		hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		hdr_len = sizeof(struct virtio_net_hdr);

	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
		vhost_hlen = hdr_len;
		sock_hlen = 0;
	} else {
		sock_hlen = hdr_len;
		vhost_hlen = 0;
	}

looks a bit better, right?

> +	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
> +		/* vhost provides vnet_hdr */
> +		vhost_hlen = sizeof(struct virtio_net_hdr);
> +		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> +			vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +		sock_hlen = 0;
> +	} else {
> +		/* socket provides vnet_hdr */
> +		vhost_hlen = 0;
> +		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> +			sock_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +		else
> +			sock_hlen = sizeof(struct virtio_net_hdr);
> +	}
>  	mutex_lock(&n->dev.mutex);
>  	if ((features & (1 << VHOST_F_LOG_ALL)) &&
>  	    !vhost_log_access_ok(&n->dev)) {
> @@ -574,7 +610,8 @@ static int vhost_net_set_features(struct
>  	smp_wmb();
>  	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
>  		mutex_lock(&n->vqs[i].mutex);
> -		n->vqs[i].hdr_size = hdr_size;
> +		n->vqs[i].vhost_hlen = vhost_hlen;
> +		n->vqs[i].sock_hlen = sock_hlen;
>  		mutex_unlock(&n->vqs[i].mutex);
>  	}
>  	vhost_net_flush(n);
> diff -ruNp net-next-v0/drivers/vhost/vhost.c net-next-v6/drivers/vhost/vhost.c
> --- net-next-v0/drivers/vhost/vhost.c	2010-04-22 11:31:57.000000000 -0700
> +++ net-next-v6/drivers/vhost/vhost.c	2010-04-26 11:18:58.000000000 -0700
> @@ -114,7 +114,8 @@ static void vhost_vq_reset(struct vhost_
>  	vq->used_flags = 0;
>  	vq->log_used = false;
>  	vq->log_addr = -1ull;
> -	vq->hdr_size = 0;
> +	vq->vhost_hlen = 0;
> +	vq->sock_hlen = 0;
>  	vq->private_data = NULL;
>  	vq->log_base = NULL;
>  	vq->error_ctx = NULL;
> @@ -861,6 +862,53 @@ static unsigned get_indirect(struct vhos
>  	return 0;
>  }
>  
> +/* This is a multi-buffer version of vhost_get_desc
> + * @vq		- the relevant virtqueue
> + * datalen	- data length we'll be reading
> + * @iovcount	- returned count of io vectors we fill
> + * @log		- vhost log
> + * @log_num	- log offset
> + *	returns number of buffer heads allocated, negative on error
> + */
> +int vhost_get_desc_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
> +		     int datalen, unsigned *iovcount, struct vhost_log *log,
> +		     unsigned int *log_num)
> +{
> +	unsigned int out, in;
> +	int seg = 0;
> +	int headcount = 0;
> +	int r;
> +
> +	while (datalen > 0) {
> +		if (headcount >= VHOST_NET_MAX_SG) {
> +			r = -ENOBUFS;
> +			goto err;
> +		}
> +		heads[headcount].id = vhost_get_desc(vq->dev, vq, vq->iov + seg,
> +					      ARRAY_SIZE(vq->iov) - seg, &out,
> +					      &in, log, log_num);
> +		if (heads[headcount].id == vq->num) {
> +			r = 0;
> +			goto err;
> +		}
> +		if (out || in <= 0) {
> +			vq_err(vq, "unexpected descriptor format for RX: "
> +				"out %d, in %d\n", out, in);
> +			r = -EINVAL;
> +			goto err;
> +		}
> +		heads[headcount].len = iov_length(vq->iov + seg, in);
> +		datalen -= heads[headcount].len;
> +		++headcount;
> +		seg += in;
> +	}
> +	*iovcount = seg;
> +	return headcount;
> +err:
> +	vhost_discard_desc(vq, headcount);
> +	return r;
> +}
> +
>  /* This looks in the virtqueue and for the first available buffer, and converts
>   * it to an iovec for convenient access.  Since descriptors consist of some
>   * number of output then some number of input descriptors, it's actually two
> @@ -868,7 +916,7 @@ static unsigned get_indirect(struct vhos
>   *
>   * This function returns the descriptor number found, or vq->num (which
>   * is never a valid descriptor number) if none was found. */
> -unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
> +unsigned vhost_get_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
>  			   struct iovec iov[], unsigned int iov_size,
>  			   unsigned int *out_num, unsigned int *in_num,
>  			   struct vhost_log *log, unsigned int *log_num)
> @@ -986,9 +1034,9 @@ unsigned vhost_get_vq_desc(struct vhost_
>  }
>  
>  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> -void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
> +void vhost_discard_desc(struct vhost_virtqueue *vq, int n)
>  {
> -	vq->last_avail_idx--;
> +	vq->last_avail_idx -= n;
>  }
>  
>  /* After we've used one of their buffers, we tell them about it.  We'll then
> @@ -1033,6 +1081,68 @@ int vhost_add_used(struct vhost_virtqueu
>  	return 0;
>  }
>  
> +static void vhost_log_used(struct vhost_virtqueue *vq,
> +			   struct vring_used_elem __user *used)
> +{
> +	/* Make sure data is seen before log. */
> +	smp_wmb();
> +	/* Log used ring entry write. */
> +	log_write(vq->log_base,
> +		  vq->log_addr +
> +		   ((void __user *)used - (void __user *)vq->used),
> +		  sizeof *used);
> +	/* Log used index update. */
> +	log_write(vq->log_base,
> +		  vq->log_addr + offsetof(struct vring_used, idx),
> +		  sizeof vq->used->idx);
> +	if (vq->log_ctx)
> +		eventfd_signal(vq->log_ctx, 1);
> +}
> +
> +static int __vhost_add_used_n(struct vhost_virtqueue *vq,
> +			    struct vring_used_elem *heads,
> +			    unsigned count)
> +{
> +	struct vring_used_elem __user *used;
> +	int start;
> +
> +	start = vq->last_used_idx % vq->num;
> +	used = vq->used->ring + start;
> +	if (copy_to_user(used, heads, count * sizeof *used)) {
> +		vq_err(vq, "Failed to write used");
> +		return -EFAULT;
> +	}
> +	/* Make sure buffer is written before we update index. */
> +	smp_wmb();
> +	if (put_user(vq->last_used_idx + count, &vq->used->idx)) {
> +		vq_err(vq, "Failed to increment used idx");
> +		return -EFAULT;
> +	}
> +	if (unlikely(vq->log_used))
> +		vhost_log_used(vq, used);
> +	vq->last_used_idx += count;
> +	return 0;
> +}
> +
> +/* After we've used one of their buffers, we tell them about it.  We'll then
> + * want to notify the guest, using eventfd. */
> +int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
> +		     unsigned count)
> +{
> +	int start, n, r;
> +
> +	start = vq->last_used_idx % vq->num;
> +	n = min(vq->num - start, count);

This can be just n = vq->num - start now.

> +	if (n < count) {
> +		r = __vhost_add_used_n(vq, heads, n);
> +		if (r < 0)
> +			return r;
> +		heads += n;
> +		count -= n;
> +	}
> +	return __vhost_add_used_n(vq, heads, count);
> +}
> +
>  /* This actually signals the guest, using eventfd. */
>  void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
>  {
> @@ -1062,6 +1172,15 @@ void vhost_add_used_and_signal(struct vh
>  	vhost_signal(dev, vq);
>  }
>  
> +/* multi-buffer version of vhost_add_used_and_signal */
> +void vhost_add_used_and_signal_n(struct vhost_dev *dev,
> +				 struct vhost_virtqueue *vq,
> +				 struct vring_used_elem *heads, unsigned count)
> +{
> +	vhost_add_used_n(vq, heads, count);
> +	vhost_signal(dev, vq);
> +}
> +
>  /* OK, now we need to know about added descriptors. */
>  bool vhost_enable_notify(struct vhost_virtqueue *vq)
>  {
> @@ -1086,7 +1205,7 @@ bool vhost_enable_notify(struct vhost_vi
>  		return false;
>  	}
>  
> -	return avail_idx != vq->last_avail_idx;
> +	return avail_idx != vq->avail_idx;
>  }
>  
>  /* We don't need to be notified again. */
> diff -ruNp net-next-v0/drivers/vhost/vhost.h net-next-v6/drivers/vhost/vhost.h
> --- net-next-v0/drivers/vhost/vhost.h	2010-04-24 21:37:41.000000000 -0700
> +++ net-next-v6/drivers/vhost/vhost.h	2010-04-26 10:35:25.000000000 -0700
> @@ -84,7 +84,9 @@ struct vhost_virtqueue {
>  	struct iovec indirect[VHOST_NET_MAX_SG];
>  	struct iovec iov[VHOST_NET_MAX_SG];
>  	struct iovec hdr[VHOST_NET_MAX_SG];
> -	size_t hdr_size;
> +	size_t vhost_hlen;
> +	size_t sock_hlen;
> +	struct vring_used_elem heads[VHOST_NET_MAX_SG];
>  	/* We use a kind of RCU to access private pointer.
>  	 * All readers access it from workqueue, which makes it possible to
>  	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
> @@ -120,16 +122,23 @@ long vhost_dev_ioctl(struct vhost_dev *,
>  int vhost_vq_access_ok(struct vhost_virtqueue *vq);
>  int vhost_log_access_ok(struct vhost_dev *);
>  
> -unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
> +int vhost_get_desc_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
> +		     int datalen, unsigned int *iovcount, struct vhost_log *log,
> +		     unsigned int *log_num);
> +unsigned vhost_get_desc(struct vhost_dev *, struct vhost_virtqueue *,
>  			   struct iovec iov[], unsigned int iov_count,
>  			   unsigned int *out_num, unsigned int *in_num,
>  			   struct vhost_log *log, unsigned int *log_num);
> -void vhost_discard_vq_desc(struct vhost_virtqueue *);
> +void vhost_discard_desc(struct vhost_virtqueue *, int);
>  
>  int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
> -void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> +int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
> +		     unsigned count);
>  void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
> -			       unsigned int head, int len);
> +			       unsigned int id, int len);
> +void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
> +			       struct vring_used_elem *heads, unsigned count);
> +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
>  void vhost_disable_notify(struct vhost_virtqueue *);
>  bool vhost_enable_notify(struct vhost_virtqueue *);
>  
> @@ -149,7 +158,8 @@ enum {
>  	VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
>  			 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
>  			 (1 << VHOST_F_LOG_ALL) |
> -			 (1 << VHOST_NET_F_VIRTIO_NET_HDR),
> +			 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
> +			 (1 << VIRTIO_NET_F_MRG_RXBUF),
>  };
>  
>  static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
> 

^ permalink raw reply

* [PATCH v6] Add mergeable rx buffer support to vhost_net
From: David L Stevens @ 2010-04-26 21:20 UTC (permalink / raw)
  To: mst; +Cc: netdev, virtualization, kvm, rusty

This patch adds mergeable receive buffer support to vhost_net.

					+-DLS

Signed-off-by: David L Stevens <dlstevens@us.ibm.com>

diff -ruNp net-next-v0/drivers/vhost/net.c net-next-v6/drivers/vhost/net.c
--- net-next-v0/drivers/vhost/net.c	2010-04-24 21:36:54.000000000 -0700
+++ net-next-v6/drivers/vhost/net.c	2010-04-26 01:13:04.000000000 -0700
@@ -109,7 +109,7 @@ static void handle_tx(struct vhost_net *
 	};
 	size_t len, total_len = 0;
 	int err, wmem;
-	size_t hdr_size;
+	size_t vhost_hlen;
 	struct socket *sock = rcu_dereference(vq->private_data);
 	if (!sock)
 		return;
@@ -128,13 +128,13 @@ static void handle_tx(struct vhost_net *
 
 	if (wmem < sock->sk->sk_sndbuf / 2)
 		tx_poll_stop(net);
-	hdr_size = vq->hdr_size;
+	vhost_hlen = vq->vhost_hlen;
 
 	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 NULL, NULL);
+		head = vhost_get_desc(&net->dev, vq, vq->iov,
+				      ARRAY_SIZE(vq->iov),
+				      &out, &in,
+				      NULL, NULL);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
 		if (head == vq->num) {
 			wmem = atomic_read(&sock->sk->sk_wmem_alloc);
@@ -155,20 +155,20 @@ static void handle_tx(struct vhost_net *
 			break;
 		}
 		/* Skip header. TODO: support TSO. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
+		s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, out);
 		msg.msg_iovlen = out;
 		len = iov_length(vq->iov, out);
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for TX: "
 			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
+			       iov_length(vq->hdr, s), vhost_hlen);
 			break;
 		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_desc(vq, 1);
 			tx_poll_start(net, sock);
 			break;
 		}
@@ -187,12 +187,25 @@ static void handle_tx(struct vhost_net *
 	unuse_mm(net->dev.mm);
 }
 
+static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk)
+{
+	struct sk_buff *head;
+	int len = 0;
+
+	lock_sock(sk);
+	head = skb_peek(&sk->sk_receive_queue);
+	if (head)
+		len = head->len + vq->sock_hlen;
+	release_sock(sk);
+	return len;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
-	unsigned head, out, in, log, s;
+	unsigned in, log, s;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
 		.msg_name = NULL,
@@ -203,14 +216,14 @@ static void handle_rx(struct vhost_net *
 		.msg_flags = MSG_DONTWAIT,
 	};
 
-	struct virtio_net_hdr hdr = {
-		.flags = 0,
-		.gso_type = VIRTIO_NET_HDR_GSO_NONE
+	struct virtio_net_hdr_mrg_rxbuf hdr = {
+		.hdr.flags = 0,
+		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
 	};
 
 	size_t len, total_len = 0;
-	int err;
-	size_t hdr_size;
+	int err, headcount, datalen;
+	size_t vhost_hlen;
 	struct socket *sock = rcu_dereference(vq->private_data);
 	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
 		return;
@@ -218,18 +231,19 @@ static void handle_rx(struct vhost_net *
 	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
-	hdr_size = vq->hdr_size;
+	vhost_hlen = vq->vhost_hlen;
 
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
 
-	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 vq_log, &log);
+	while ((datalen = vhost_head_len(vq, sock->sk))) {
+		headcount = vhost_get_desc_n(vq, vq->heads,
+					     datalen + vhost_hlen,
+					     &in, vq_log, &log);
+		if (headcount < 0)
+			break;
 		/* OK, now we need to know about added descriptors. */
-		if (head == vq->num) {
+		if (!headcount) {
 			if (unlikely(vhost_enable_notify(vq))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */
@@ -241,46 +255,53 @@ static void handle_rx(struct vhost_net *
 			break;
 		}
 		/* We don't need to be notified again. */
-		if (out) {
-			vq_err(vq, "Unexpected descriptor format for RX: "
-			       "out %d, int %d\n",
-			       out, in);
-			break;
-		}
-		/* Skip header. TODO: support TSO/mergeable rx buffers. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
+		/* Skip header. TODO: support TSO. */
+		s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
 		msg.msg_iovlen = in;
 		len = iov_length(vq->iov, in);
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for RX: "
 			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
+			       iov_length(vq->hdr, s), vhost_hlen);
 			break;
 		}
 		err = sock->ops->recvmsg(NULL, sock, &msg,
 					 len, MSG_DONTWAIT | MSG_TRUNC);
 		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_desc(vq, headcount);
 			break;
 		}
-		/* TODO: Should check and handle checksum. */
-		if (err > len) {
-			pr_err("Discarded truncated rx packet: "
-			       " len %d > %zd\n", err, len);
-			vhost_discard_vq_desc(vq);
+		if (err != datalen) {
+			pr_err("Discarded rx packet: "
+			       " len %d, expected %zd\n", err, datalen);
+			vhost_discard_desc(vq, headcount);
 			continue;
 		}
 		len = err;
-		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
+		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr,
+				     vhost_hlen);
 		if (err) {
 			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
 			       vq->iov->iov_base, err);
 			break;
 		}
-		len += hdr_size;
-		vhost_add_used_and_signal(&net->dev, vq, head, len);
+		/* TODO: Should check and handle checksum. */
+		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
+			struct iovec *iov = vhost_hlen ? vq->hdr : vq->iov;
+
+			if (memcpy_toiovecend(iov, (unsigned char *)&headcount,
+				      offsetof(typeof(hdr), num_buffers),
+				      sizeof(hdr.num_buffers))) {
+				vq_err(vq, "Failed num_buffers write");
+				vhost_discard_desc(vq, headcount);
+				break;
+			}
+		}
+		len += vhost_hlen;
+		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+					    headcount);
 		if (unlikely(vq_log))
 			vhost_log_write(vq, vq_log, log, len);
 		total_len += len;
@@ -561,9 +582,24 @@ done:
 
 static int vhost_net_set_features(struct vhost_net *n, u64 features)
 {
-	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
-		sizeof(struct virtio_net_hdr) : 0;
+	size_t vhost_hlen;
+	size_t sock_hlen;
 	int i;
+
+	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
+		/* vhost provides vnet_hdr */
+		vhost_hlen = sizeof(struct virtio_net_hdr);
+		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
+			vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+		sock_hlen = 0;
+	} else {
+		/* socket provides vnet_hdr */
+		vhost_hlen = 0;
+		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
+			sock_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+		else
+			sock_hlen = sizeof(struct virtio_net_hdr);
+	}
 	mutex_lock(&n->dev.mutex);
 	if ((features & (1 << VHOST_F_LOG_ALL)) &&
 	    !vhost_log_access_ok(&n->dev)) {
@@ -574,7 +610,8 @@ static int vhost_net_set_features(struct
 	smp_wmb();
 	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
 		mutex_lock(&n->vqs[i].mutex);
-		n->vqs[i].hdr_size = hdr_size;
+		n->vqs[i].vhost_hlen = vhost_hlen;
+		n->vqs[i].sock_hlen = sock_hlen;
 		mutex_unlock(&n->vqs[i].mutex);
 	}
 	vhost_net_flush(n);
diff -ruNp net-next-v0/drivers/vhost/vhost.c net-next-v6/drivers/vhost/vhost.c
--- net-next-v0/drivers/vhost/vhost.c	2010-04-22 11:31:57.000000000 -0700
+++ net-next-v6/drivers/vhost/vhost.c	2010-04-26 11:18:58.000000000 -0700
@@ -114,7 +114,8 @@ static void vhost_vq_reset(struct vhost_
 	vq->used_flags = 0;
 	vq->log_used = false;
 	vq->log_addr = -1ull;
-	vq->hdr_size = 0;
+	vq->vhost_hlen = 0;
+	vq->sock_hlen = 0;
 	vq->private_data = NULL;
 	vq->log_base = NULL;
 	vq->error_ctx = NULL;
@@ -861,6 +862,53 @@ static unsigned get_indirect(struct vhos
 	return 0;
 }
 
+/* This is a multi-buffer version of vhost_get_desc
+ * @vq		- the relevant virtqueue
+ * datalen	- data length we'll be reading
+ * @iovcount	- returned count of io vectors we fill
+ * @log		- vhost log
+ * @log_num	- log offset
+ *	returns number of buffer heads allocated, negative on error
+ */
+int vhost_get_desc_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
+		     int datalen, unsigned *iovcount, struct vhost_log *log,
+		     unsigned int *log_num)
+{
+	unsigned int out, in;
+	int seg = 0;
+	int headcount = 0;
+	int r;
+
+	while (datalen > 0) {
+		if (headcount >= VHOST_NET_MAX_SG) {
+			r = -ENOBUFS;
+			goto err;
+		}
+		heads[headcount].id = vhost_get_desc(vq->dev, vq, vq->iov + seg,
+					      ARRAY_SIZE(vq->iov) - seg, &out,
+					      &in, log, log_num);
+		if (heads[headcount].id == vq->num) {
+			r = 0;
+			goto err;
+		}
+		if (out || in <= 0) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+				"out %d, in %d\n", out, in);
+			r = -EINVAL;
+			goto err;
+		}
+		heads[headcount].len = iov_length(vq->iov + seg, in);
+		datalen -= heads[headcount].len;
+		++headcount;
+		seg += in;
+	}
+	*iovcount = seg;
+	return headcount;
+err:
+	vhost_discard_desc(vq, headcount);
+	return r;
+}
+
 /* This looks in the virtqueue and for the first available buffer, and converts
  * it to an iovec for convenient access.  Since descriptors consist of some
  * number of output then some number of input descriptors, it's actually two
@@ -868,7 +916,7 @@ static unsigned get_indirect(struct vhos
  *
  * This function returns the descriptor number found, or vq->num (which
  * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+unsigned vhost_get_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 			   struct iovec iov[], unsigned int iov_size,
 			   unsigned int *out_num, unsigned int *in_num,
 			   struct vhost_log *log, unsigned int *log_num)
@@ -986,9 +1034,9 @@ unsigned vhost_get_vq_desc(struct vhost_
 }
 
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
+void vhost_discard_desc(struct vhost_virtqueue *vq, int n)
 {
-	vq->last_avail_idx--;
+	vq->last_avail_idx -= n;
 }
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
@@ -1033,6 +1081,68 @@ int vhost_add_used(struct vhost_virtqueu
 	return 0;
 }
 
+static void vhost_log_used(struct vhost_virtqueue *vq,
+			   struct vring_used_elem __user *used)
+{
+	/* Make sure data is seen before log. */
+	smp_wmb();
+	/* Log used ring entry write. */
+	log_write(vq->log_base,
+		  vq->log_addr +
+		   ((void __user *)used - (void __user *)vq->used),
+		  sizeof *used);
+	/* Log used index update. */
+	log_write(vq->log_base,
+		  vq->log_addr + offsetof(struct vring_used, idx),
+		  sizeof vq->used->idx);
+	if (vq->log_ctx)
+		eventfd_signal(vq->log_ctx, 1);
+}
+
+static int __vhost_add_used_n(struct vhost_virtqueue *vq,
+			    struct vring_used_elem *heads,
+			    unsigned count)
+{
+	struct vring_used_elem __user *used;
+	int start;
+
+	start = vq->last_used_idx % vq->num;
+	used = vq->used->ring + start;
+	if (copy_to_user(used, heads, count * sizeof *used)) {
+		vq_err(vq, "Failed to write used");
+		return -EFAULT;
+	}
+	/* Make sure buffer is written before we update index. */
+	smp_wmb();
+	if (put_user(vq->last_used_idx + count, &vq->used->idx)) {
+		vq_err(vq, "Failed to increment used idx");
+		return -EFAULT;
+	}
+	if (unlikely(vq->log_used))
+		vhost_log_used(vq, used);
+	vq->last_used_idx += count;
+	return 0;
+}
+
+/* After we've used one of their buffers, we tell them about it.  We'll then
+ * want to notify the guest, using eventfd. */
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
+		     unsigned count)
+{
+	int start, n, r;
+
+	start = vq->last_used_idx % vq->num;
+	n = min(vq->num - start, count);
+	if (n < count) {
+		r = __vhost_add_used_n(vq, heads, n);
+		if (r < 0)
+			return r;
+		heads += n;
+		count -= n;
+	}
+	return __vhost_add_used_n(vq, heads, count);
+}
+
 /* This actually signals the guest, using eventfd. */
 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
@@ -1062,6 +1172,15 @@ void vhost_add_used_and_signal(struct vh
 	vhost_signal(dev, vq);
 }
 
+/* multi-buffer version of vhost_add_used_and_signal */
+void vhost_add_used_and_signal_n(struct vhost_dev *dev,
+				 struct vhost_virtqueue *vq,
+				 struct vring_used_elem *heads, unsigned count)
+{
+	vhost_add_used_n(vq, heads, count);
+	vhost_signal(dev, vq);
+}
+
 /* OK, now we need to know about added descriptors. */
 bool vhost_enable_notify(struct vhost_virtqueue *vq)
 {
@@ -1086,7 +1205,7 @@ bool vhost_enable_notify(struct vhost_vi
 		return false;
 	}
 
-	return avail_idx != vq->last_avail_idx;
+	return avail_idx != vq->avail_idx;
 }
 
 /* We don't need to be notified again. */
diff -ruNp net-next-v0/drivers/vhost/vhost.h net-next-v6/drivers/vhost/vhost.h
--- net-next-v0/drivers/vhost/vhost.h	2010-04-24 21:37:41.000000000 -0700
+++ net-next-v6/drivers/vhost/vhost.h	2010-04-26 10:35:25.000000000 -0700
@@ -84,7 +84,9 @@ struct vhost_virtqueue {
 	struct iovec indirect[VHOST_NET_MAX_SG];
 	struct iovec iov[VHOST_NET_MAX_SG];
 	struct iovec hdr[VHOST_NET_MAX_SG];
-	size_t hdr_size;
+	size_t vhost_hlen;
+	size_t sock_hlen;
+	struct vring_used_elem heads[VHOST_NET_MAX_SG];
 	/* We use a kind of RCU to access private pointer.
 	 * All readers access it from workqueue, which makes it possible to
 	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
@@ -120,16 +122,23 @@ long vhost_dev_ioctl(struct vhost_dev *,
 int vhost_vq_access_ok(struct vhost_virtqueue *vq);
 int vhost_log_access_ok(struct vhost_dev *);
 
-unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
+int vhost_get_desc_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
+		     int datalen, unsigned int *iovcount, struct vhost_log *log,
+		     unsigned int *log_num);
+unsigned vhost_get_desc(struct vhost_dev *, struct vhost_virtqueue *,
 			   struct iovec iov[], unsigned int iov_count,
 			   unsigned int *out_num, unsigned int *in_num,
 			   struct vhost_log *log, unsigned int *log_num);
-void vhost_discard_vq_desc(struct vhost_virtqueue *);
+void vhost_discard_desc(struct vhost_virtqueue *, int);
 
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
+int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
+		     unsigned count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       unsigned int head, int len);
+			       unsigned int id, int len);
+void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
+			       struct vring_used_elem *heads, unsigned count);
+void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
 void vhost_disable_notify(struct vhost_virtqueue *);
 bool vhost_enable_notify(struct vhost_virtqueue *);
 
@@ -149,7 +158,8 @@ enum {
 	VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
 			 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
 			 (1 << VHOST_F_LOG_ALL) |
-			 (1 << VHOST_NET_F_VIRTIO_NET_HDR),
+			 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
+			 (1 << VIRTIO_NET_F_MRG_RXBUF),
 };
 
 static inline int vhost_has_feature(struct vhost_dev *dev, int bit)



^ permalink raw reply

* Re: [PATCH] bnx2x: add support for receive hashing
From: jamal @ 2010-04-26 21:14 UTC (permalink / raw)
  To: David Miller; +Cc: rick.jones2, therbert, eric.dumazet, netdev
In-Reply-To: <1272316297.8918.49.camel@bigi>

On Mon, 2010-04-26 at 17:11 -0400, jamal wrote:
> I think i did read somewhere on measurement taken
> in backbones showing consistent rise of UDP:TCP ratio (need to search
> my bookmarks). 

This one:
http://www.caida.org/research/traffic-analysis/tcpudpratio/

just read the conclusion.

cheers,
jamal



^ permalink raw reply

* Re: [PATCH] bnx2x: add support for receive hashing
From: Rick Jones @ 2010-04-26 21:12 UTC (permalink / raw)
  To: David Miller; +Cc: therbert, eric.dumazet, netdev
In-Reply-To: <20100426.135305.15235166.davem@davemloft.net>

David Miller wrote:
> From: Rick Jones <rick.jones2@hp.com>
> Date: Mon, 26 Apr 2010 13:48:22 -0700
> 
>>Do not confuse explanation with endorsement.
> 
> Ok, fair enough.
> 
> But I don't see even the "other perspective" argument being even
> valid.  Big shops still use UDP and it has to scale.

Preface - I too think it is massively stupid to ignore anything but TCP/IPv4, 
and unwise to ignore IPv6 and so on, but there is a very real reason why one of 
my email signatures reads:

"The road to hell is paved with business decisions"

> Or have they made multicast magically start working with TCP so
> they can us it to do trades on the NASDAQ?

No. How many NIC chips can NASDAQ be expected to move? 0.1%? or even 1% of the 
NIC chip market?

How many more NIC chips are in places where someone says "You sold me on 
iSCSI/FCoE/whatnot, why can't I get 'link-rate'  to/from iSCSI storage/whatnot?!"

The NIC designer is there with his finance guys breathing down his neck shouting 
"ROI Uber Alles!" and "Your budget is only this many monetary units!"  The 
system designers at the system vendors are hearing the same things from their 
own finance guys, have certain schedules, which then has them going to the NIC 
firms, who want to sell chips to the system guys "You have to be ready to ship 
by this date and your chip has to sell for no more than this."

Lather, rinse, repeat a few times and you get compromises on top of compromises.

Sometimes I think it is a wonder any of it actually works at all...

rick jones

^ permalink raw reply

* Re: [PATCH] bnx2x: add support for receive hashing
From: jamal @ 2010-04-26 21:11 UTC (permalink / raw)
  To: David Miller; +Cc: rick.jones2, therbert, eric.dumazet, netdev
In-Reply-To: <20100426.134051.232750473.davem@davemloft.net>

On Mon, 2010-04-26 at 13:40 -0700, David Miller wrote:

> Why do you think Eric Dumazet gives a crap about UDP scalability and
> is constantly testing it?  What about VOIP?  H.323, RTP, etc.?

This is big of course. All the SIP/RTP stuff is largely UDP. Most P2P
control is via UDP. I think i did read somewhere on measurement taken
in backbones showing consistent rise of UDP:TCP ratio (need to search
my bookmarks). 
It borders on lunacy to assume TCP only. Time to get an open source NIC
out there and ignore these big players? ;->

cheers,
jamal


^ permalink raw reply

* [PATCH] net/sb1250: setup the pdevice within the soc code
From: Sebastian Andrzej Siewior @ 2010-04-26 21:07 UTC (permalink / raw)
  To: netdev; +Cc: Ralf Baechle
In-Reply-To: <20100426210022.GE27656@linux-mips.org>

doing it within the driver does not look good.
And surely isn't how platform devices were meat to be used.

Acked-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
---
 arch/mips/sibyte/swarm/platform.c |   54 +++++++++++++++++
 drivers/net/sb1250-mac.c          |  120 +------------------------------------
 2 files changed, 55 insertions(+), 119 deletions(-)

diff --git a/arch/mips/sibyte/swarm/platform.c b/arch/mips/sibyte/swarm/platform.c
index 54847fe..0973352 100644
--- a/arch/mips/sibyte/swarm/platform.c
+++ b/arch/mips/sibyte/swarm/platform.c
@@ -83,3 +83,57 @@ static int __init swarm_pata_init(void)
 device_initcall(swarm_pata_init);
 
 #endif /* defined(CONFIG_SIBYTE_SWARM) || defined(CONFIG_SIBYTE_LITTLESUR) */
+
+#define sb1250_dev_struct(num) \
+	static struct resource sb1250_res##num = {		\
+		.name = "SB1250 MAC " __stringify(num),		\
+		.flags = IORESOURCE_MEM,		\
+		.start = A_MAC_CHANNEL_BASE(num),	\
+		.end = A_MAC_CHANNEL_BASE(num + 1) -1,	\
+	};\
+	static struct platform_device sb1250_dev##num = {	\
+		.name = "sb1250-mac",			\
+	.id = num,					\
+	.resource = &sb1250_res##num,			\
+	.num_resources = 1,				\
+	}
+
+sb1250_dev_struct(0);
+sb1250_dev_struct(1);
+sb1250_dev_struct(2);
+sb1250_dev_struct(3);
+
+static struct platform_device *sb1250_devs[] __initdata = {
+	&sb1250_dev0,
+	&sb1250_dev1,
+	&sb1250_dev2,
+	&sb1250_dev3,
+};
+
+static int __init sb1250_device_init(void)
+{
+	int ret;
+
+	/* Set the number of available units based on the SOC type.  */
+	switch (soc_type) {
+	case K_SYS_SOC_TYPE_BCM1250:
+	case K_SYS_SOC_TYPE_BCM1250_ALT:
+		ret = platform_add_devices(sb1250_devs, 3);
+		break;
+	case K_SYS_SOC_TYPE_BCM1120:
+	case K_SYS_SOC_TYPE_BCM1125:
+	case K_SYS_SOC_TYPE_BCM1125H:
+	case K_SYS_SOC_TYPE_BCM1250_ALT2:       /* Hybrid */
+		ret = platform_add_devices(sb1250_devs, 2);
+		break;
+	case K_SYS_SOC_TYPE_BCM1x55:
+	case K_SYS_SOC_TYPE_BCM1x80:
+		ret = platform_add_devices(sb1250_devs, 4);
+		break;
+	default:
+		ret = -ENODEV;
+		break;
+	}
+	return ret;
+}
+device_initcall(sb1250_device_init);
diff --git a/drivers/net/sb1250-mac.c b/drivers/net/sb1250-mac.c
index fc503a1..459bc59 100644
--- a/drivers/net/sb1250-mac.c
+++ b/drivers/net/sb1250-mac.c
@@ -332,7 +332,6 @@ static int sbmac_mii_write(struct mii_bus *bus, int phyaddr, int regidx,
  ********************************************************************* */
 
 static char sbmac_string[] = "sb1250-mac";
-static char sbmac_pretty[] = "SB1250 MAC";
 
 static char sbmac_mdio_string[] = "sb1250-mac-mdio";
 
@@ -2668,114 +2667,6 @@ static int __exit sbmac_remove(struct platform_device *pldev)
 	return 0;
 }
 
-
-static struct platform_device **sbmac_pldev;
-static int sbmac_max_units;
-
-static int __init sbmac_platform_probe_one(int idx)
-{
-	struct platform_device *pldev;
-	struct {
-		struct resource r;
-		char name[strlen(sbmac_pretty) + 4];
-	} *res;
-	int err;
-
-	res = kzalloc(sizeof(*res), GFP_KERNEL);
-	if (!res) {
-		printk(KERN_ERR "%s.%d: unable to allocate memory\n",
-		       sbmac_string, idx);
-		err = -ENOMEM;
-		goto out_err;
-	}
-
-	/*
-	 * This is the base address of the MAC.
-	 */
-	snprintf(res->name, sizeof(res->name), "%s %d", sbmac_pretty, idx);
-	res->r.name = res->name;
-	res->r.flags = IORESOURCE_MEM;
-	res->r.start = A_MAC_CHANNEL_BASE(idx);
-	res->r.end = A_MAC_CHANNEL_BASE(idx + 1) - 1;
-
-	pldev = platform_device_register_simple(sbmac_string, idx, &res->r, 1);
-	if (IS_ERR(pldev)) {
-		printk(KERN_ERR "%s.%d: unable to register platform device\n",
-		       sbmac_string, idx);
-		err = PTR_ERR(pldev);
-		goto out_kfree;
-	}
-
-	if (!pldev->dev.driver) {
-		err = 0;		/* No hardware at this address. */
-		goto out_unregister;
-	}
-
-	sbmac_pldev[idx] = pldev;
-	return 0;
-
-out_unregister:
-	platform_device_unregister(pldev);
-
-out_kfree:
-	kfree(res);
-
-out_err:
-	return err;
-}
-
-static void __init sbmac_platform_probe(void)
-{
-	int i;
-
-	/* Set the number of available units based on the SOC type.  */
-	switch (soc_type) {
-	case K_SYS_SOC_TYPE_BCM1250:
-	case K_SYS_SOC_TYPE_BCM1250_ALT:
-		sbmac_max_units = 3;
-		break;
-	case K_SYS_SOC_TYPE_BCM1120:
-	case K_SYS_SOC_TYPE_BCM1125:
-	case K_SYS_SOC_TYPE_BCM1125H:
-	case K_SYS_SOC_TYPE_BCM1250_ALT2:	/* Hybrid */
-		sbmac_max_units = 2;
-		break;
-	case K_SYS_SOC_TYPE_BCM1x55:
-	case K_SYS_SOC_TYPE_BCM1x80:
-		sbmac_max_units = 4;
-		break;
-	default:
-		return;				/* none */
-	}
-
-	sbmac_pldev = kcalloc(sbmac_max_units, sizeof(*sbmac_pldev),
-			      GFP_KERNEL);
-	if (!sbmac_pldev) {
-		printk(KERN_ERR "%s: unable to allocate memory\n",
-		       sbmac_string);
-		return;
-	}
-
-	/*
-	 * Walk through the Ethernet controllers and find
-	 * those who have their MAC addresses set.
-	 */
-	for (i = 0; i < sbmac_max_units; i++)
-		if (sbmac_platform_probe_one(i))
-			break;
-}
-
-
-static void __exit sbmac_platform_cleanup(void)
-{
-	int i;
-
-	for (i = 0; i < sbmac_max_units; i++)
-		platform_device_unregister(sbmac_pldev[i]);
-	kfree(sbmac_pldev);
-}
-
-
 static struct platform_driver sbmac_driver = {
 	.probe = sbmac_probe,
 	.remove = __exit_p(sbmac_remove),
@@ -2786,20 +2677,11 @@ static struct platform_driver sbmac_driver = {
 
 static int __init sbmac_init_module(void)
 {
-	int err;
-
-	err = platform_driver_register(&sbmac_driver);
-	if (err)
-		return err;
-
-	sbmac_platform_probe();
-
-	return err;
+	return platform_driver_register(&sbmac_driver);
 }
 
 static void __exit sbmac_cleanup_module(void)
 {
-	sbmac_platform_cleanup();
 	platform_driver_unregister(&sbmac_driver);
 }
 
-- 
1.6.6.1

^ permalink raw reply related

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
From: jamal @ 2010-04-26 21:06 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger,
	netdev, Andi Kleen
In-Reply-To: <1272293707.19143.51.camel@edumazet-laptop>

On Mon, 2010-04-26 at 16:55 +0200, Eric Dumazet wrote:

> Another interesting finding:
> 
> - if all packets are received on a single queue, max speed seems to be
> 1.200.000 packets per second on my machine :-(

Well, if any consolation, it is not as bad as sky2 hardware;-> I cant do
more than 750Kpps.
Also, it seems you use VLANS - max pps will be lower than without VLANs
by probably maybe 6-70Kpps (doesnt explain the 1.2Mpps of course).

cheers,
jamal


^ permalink raw reply

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
From: jamal @ 2010-04-26 21:03 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David S. Miller, Tom Herbert, Stephen Hemminger,
	netdev
In-Reply-To: <1272290584.19143.43.camel@edumazet-laptop>

On Mon, 2010-04-26 at 16:03 +0200, Eric Dumazet wrote:

> 
> Jamal, I have a Nehalem setup now, and I can see
> _raw_spin_lock_irqsave() abuse is not coming from network tree, but from
> clockevents_notify()

yikes. Thanks Eric - I shouldve been able to figure that one out. But
why is this thing expensive? I will run the test tommorow and see if i
see the same thing. 

cheers,
jamal




^ permalink raw reply

* Re: [PATCH] bnx2x: add support for receive hashing
From: Stephen Hemminger @ 2010-04-26 20:58 UTC (permalink / raw)
  To: David Miller; +Cc: rick.jones2, therbert, eric.dumazet, netdev
In-Reply-To: <20100426.134051.232750473.davem@davemloft.net>

On Mon, 26 Apr 2010 13:40:51 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Rick Jones <rick.jones2@hp.com>
> Date: Mon, 26 Apr 2010 13:19:31 -0700
> 
> > As a networking guy I can see why it seems baffling, but stepping out
> > of myself and thinking like the customers with whom I've interacted
> > over the years, it is not baffling at all.
> 
> <sarcasm>
> And hey nobody is using SCTP either, that's right, nobody...
> </sarcasm>
> 
> Look, don't try to defend this abomination of a situation with some
> "customers only use TCP" argument.  It only makes the situation look
> even more absurd.  
> 
> Furthermore, people test system scalability using tools like pktgen,
> which surprise surprise generates streams of UDP packets.  Most
> hardware based scalability testers spew UDP too.
> 
> Everything in the world points to "this toeplitz hash situation is
> stupid an inexcusable."
> 
> If UDP isn't used by anyone, then you tell me why the checksum engines
> of all of these chips can handle them just fine.  Maybe the guy who
> works on the checksum logic blocks doesn't talk to the guy who works
> on the hashing ones?  Maybe the checksum guy can find the ports in a
> UDP packet, but the hashing dude can't locate them?
> 
> What the heck do you think people use for various forms of media
> streaming?  They often use UDP and it has to scale, and they'd like to
> move to DCCP at some point too which is another argument for a fully
> protocol agnostic hash.
> 
> Why do you think Eric Dumazet gives a crap about UDP scalability and
> is constantly testing it?  What about VOIP?  H.323, RTP, etc.?
> 
> Some of these cards can even statelessly offload UDP fragmentation
> too, in silicon, not even in firmware.  What's their excuse for
> screwing up the hash?
> 
> Look, this is a complete joke from every angle, at least admit that
> fact.

I think it is fair to blame Microsoft for this as well. The vendors
follow what Msft tells them to do with NDIS spec. It looks like
IPV6 didn't make it in until the NDIS6.2 (Win7) spec.

-- 

^ permalink raw reply

* Re: [PATCH] bnx2x: add support for receive hashing
From: David Miller @ 2010-04-26 20:53 UTC (permalink / raw)
  To: rick.jones2; +Cc: therbert, eric.dumazet, netdev
In-Reply-To: <4BD5FC16.4070402@hp.com>

From: Rick Jones <rick.jones2@hp.com>
Date: Mon, 26 Apr 2010 13:48:22 -0700

> Do not confuse explanation with endorsement.

Ok, fair enough.

But I don't see even the "other perspective" argument being even
valid.  Big shops still use UDP and it has to scale.

Or have they made multicast magically start working with TCP so
they can us it to do trades on the NASDAQ?

^ permalink raw reply

* Re: [PATCH] bnx2x: add support for receive hashing
From: Rick Jones @ 2010-04-26 20:48 UTC (permalink / raw)
  To: David Miller; +Cc: therbert, eric.dumazet, netdev
In-Reply-To: <20100426.134051.232750473.davem@davemloft.net>

David Miller wrote:
> From: Rick Jones <rick.jones2@hp.com>
> Date: Mon, 26 Apr 2010 13:19:31 -0700
> 
> 
>>As a networking guy I can see why it seems baffling, but stepping out
>>of myself and thinking like the customers with whom I've interacted
>>over the years, it is not baffling at all.
> 
> 
> <sarcasm>
> And hey nobody is using SCTP either, that's right, nobody...
> </sarcasm>
> 
> Look, don't try to defend this abomination of a situation with some
> "customers only use TCP" argument.  It only makes the situation look
> even more absurd.  

Do not confuse explanation with endorsement.

rick

^ permalink raw reply

* Re: [PATCH] bnx2x: add support for receive hashing
From: David Miller @ 2010-04-26 20:40 UTC (permalink / raw)
  To: rick.jones2; +Cc: therbert, eric.dumazet, netdev
In-Reply-To: <4BD5F553.6020006@hp.com>

From: Rick Jones <rick.jones2@hp.com>
Date: Mon, 26 Apr 2010 13:19:31 -0700

> As a networking guy I can see why it seems baffling, but stepping out
> of myself and thinking like the customers with whom I've interacted
> over the years, it is not baffling at all.

<sarcasm>
And hey nobody is using SCTP either, that's right, nobody...
</sarcasm>

Look, don't try to defend this abomination of a situation with some
"customers only use TCP" argument.  It only makes the situation look
even more absurd.  

Furthermore, people test system scalability using tools like pktgen,
which surprise surprise generates streams of UDP packets.  Most
hardware based scalability testers spew UDP too.

Everything in the world points to "this toeplitz hash situation is
stupid an inexcusable."

If UDP isn't used by anyone, then you tell me why the checksum engines
of all of these chips can handle them just fine.  Maybe the guy who
works on the checksum logic blocks doesn't talk to the guy who works
on the hashing ones?  Maybe the checksum guy can find the ports in a
UDP packet, but the hashing dude can't locate them?

What the heck do you think people use for various forms of media
streaming?  They often use UDP and it has to scale, and they'd like to
move to DCCP at some point too which is another argument for a fully
protocol agnostic hash.

Why do you think Eric Dumazet gives a crap about UDP scalability and
is constantly testing it?  What about VOIP?  H.323, RTP, etc.?

Some of these cards can even statelessly offload UDP fragmentation
too, in silicon, not even in firmware.  What's their excuse for
screwing up the hash?

Look, this is a complete joke from every angle, at least admit that
fact.

^ permalink raw reply

* Re: [net-next-2.6 PATCH 1/2] Add ndo_set_vf_port_profile
From: David Miller @ 2010-04-26 20:25 UTC (permalink / raw)
  To: scofeldm; +Cc: netdev, chrisw, arnd
In-Reply-To: <C7FB3E22.2BAEE%scofeldm@cisco.com>

From: Scott Feldman <scofeldm@cisco.com>
Date: Mon, 26 Apr 2010 12:57:06 -0700

> Hmmm....even that isn't so nice because the port-profile info for all VFs is
> going to get stuffed into the RTM_GETLINK skb, and I assume there are limits
> on the skb return size.

It's probably better to use .dump() for this, which allows to return
the result in multiple SKBs.

^ permalink raw reply

* Re: [net-next-2.6 PATCH 1/2] Add ndo_set_vf_port_profile
From: David Miller @ 2010-04-26 20:24 UTC (permalink / raw)
  To: scofeldm; +Cc: netdev, chrisw, arnd
In-Reply-To: <C7FB3747.2BAAA%scofeldm@cisco.com>

From: Scott Feldman <scofeldm@cisco.com>
Date: Mon, 26 Apr 2010 12:27:51 -0700

> We'd need an array of struct ifla_vf_port_profile hanging off of netdev, one
> element for each VF.  That seems like a lot of mem hanging off of netdev,
> and we'd have to define a MAX_VF to size the array.  How about a
> ndo_get_vf_port_profile() that the netdev fills in, and the netdev keeps the
> data in it's private area?  That's how ndo_get_vf_config() is working.

Sure if the device can do it, but for situations where it can't we can
use a linked list and dynamic memory allocation.

^ permalink raw reply

* Re: [PATCH] bnx2x: add support for receive hashing
From: Rick Jones @ 2010-04-26 20:19 UTC (permalink / raw)
  To: David Miller; +Cc: therbert, eric.dumazet, netdev
In-Reply-To: <20100426.112244.260086869.davem@davemloft.net>

David Miller wrote:
> From: Tom Herbert <therbert@google.com>
> Date: Mon, 26 Apr 2010 11:19:05 -0700
> 
> 
>>This also hits RSS/multiqueue. In a netperf RR test, 500 streams
>>between my two 16 core AMDs:  TCP 970K tps, UDP 370K tps.  I'm
>>surprised they didn't catch that in some benchmarks...
> 
> 
> Meanwhile, these NIC vendors seem to have all the time in the world to
> add iSCSI, RDMA and all the other stateful offload junk into their
> firmware and silicon.
> 
> Yet they can't hash ports if the protocol is not TCP?  Beyond
> baffling...

As a networking guy I can see why it seems baffling, but stepping out of myself 
and thinking like the customers with whom I've interacted over the years, it is 
not baffling at all.

By and large, customers do not do anything "substantial" with UDP.  NFS went to 
TCP mounts 99 times out of 10 many years ago, leaving DNS as about the only 
thing left*. So, customers will not be chomping at the bit for improved UDP 
scalability/performance.  They would though, be jumping up and down demanding 
iSCSI performance and by implication all that comes along for the ride.

rick jones

* And even there, one of the biggest pushes is trying to make TCP "transaction 
friendly" to deal with DNS messages becoming larger than typical MTUs.

^ permalink raw reply

* Re: [RFC][PATCH v4 01/18] Add a new struct for device to manipulate external buffer.
From: Andy Fleming @ 2010-04-26 20:06 UTC (permalink / raw)
  To: xiaohui.xin; +Cc: netdev, kvm, linux-kernel, mst, mingo, davem, jdike
In-Reply-To: <1272187206-18534-1-git-send-email-xiaohui.xin@intel.com>

On Sun, Apr 25, 2010 at 4:19 AM,  <xiaohui.xin@intel.com> wrote:
> From: Xin Xiaohui <xiaohui.xin@intel.com>
>
> Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
> Signed-off-by: Zhao Yu <yzhao81@gmail.com>
> Reviewed-by: Jeff Dike <jdike@linux.intel.com>
> ---
>  include/linux/netdevice.h |   19 ++++++++++++++++++-
>  1 files changed, 18 insertions(+), 1 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index c79a88b..bf79756 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -530,6 +530,22 @@ struct netdev_queue {
>        unsigned long           tx_dropped;
>  } ____cacheline_aligned_in_smp;
>
> +/* Add a structure in structure net_device, the new field is
> + * named as mp_port. It's for mediate passthru (zero-copy).
> + * It contains the capability for the net device driver,
> + * a socket, and an external buffer creator, external means
> + * skb buffer belongs to the device may not be allocated from
> + * kernel space.
> + */
> +struct mpassthru_port  {
> +       int             hdr_len;
> +       int             data_len;
> +       int             npages;
> +       unsigned        flags;
> +       struct socket   *sock;
> +       struct skb_external_page *(*ctor)(struct mpassthru_port *,
> +                               struct sk_buff *, int);
> +};


I tried searching around, but couldn't find where struct
skb_external_page is declared.  Where is it?

Andy

^ permalink raw reply

* Re: [net-next-2.6 PATCH 1/2] Add ndo_set_vf_port_profile
From: Scott Feldman @ 2010-04-26 19:57 UTC (permalink / raw)
  To: Scott Feldman, David Miller; +Cc: netdev, chrisw, arnd
In-Reply-To: <C7FB3747.2BAAA%scofeldm@cisco.com>

On 4/26/10 12:27 PM, "Scott Feldman" <scofeldm@cisco.com> wrote:
> On 4/24/10 12:19 AM, "David Miller" <davem@davemloft.net> wrote:
>> Also, I think it's reasonable to fetch the current profile in use, so
>> GETLINK ought to report these things.  To make it generic we can
>> maintain the settings given to us in software, hung off of the netdev
>> struct, and simply report that during GETLINK.
> 
> We'd need an array of struct ifla_vf_port_profile hanging off of netdev, one
> element for each VF.  That seems like a lot of mem hanging off of netdev,
> and we'd have to define a MAX_VF to size the array.  How about a
> ndo_get_vf_port_profile() that the netdev fills in, and the netdev keeps the
> data in it's private area?  That's how ndo_get_vf_config() is working.

Hmmm....even that isn't so nice because the port-profile info for all VFs is
going to get stuffed into the RTM_GETLINK skb, and I assume there are limits
on the skb return size.

Here's a proposal:

Currently we have RTM_GETLINK for

    ip link show [ DEVICE ]

This dumps everything for the DEVICE including info for each VF.  Let's
modify RTM_GETLINK to look like this:

    ip link show [ DEVICE [ vf NUM] ]

If you don't give the optional vf NUM you get base dump on DEVICE.  If you
give vf NUM, you get the VF-specific dump.  This scales much better with the
number of VFs.  

(Number of VFs can easily be > 128 in some designs).

Comments?

-scott


^ permalink raw reply

* Re: [net-next-2.6 PATCH 1/2] Add ndo_set_vf_port_profile
From: Scott Feldman @ 2010-04-26 19:27 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, chrisw, arnd
In-Reply-To: <20100424.001934.189691704.davem@davemloft.net>

On 4/24/10 12:19 AM, "David Miller" <davem@davemloft.net> wrote:

>> int   (*ndo_set_vf_tx_rate)(struct net_device *dev,
>>      int vf, int rate);
>> + int   (*ndo_set_vf_port_profile)(
>> +     struct net_device *dev, int vf,
>> +     u8 *port_profile, u8 *mac,
>> +     u8 *host_uuid,
>> +     u8 *client_uuid,
>> +     u8 *client_name);
>> int   (*ndo_get_vf_config)(struct net_device *dev,
> 
> Just pass the "struct ifla_vf_port_profile *" instead of tons of
> arguments.

Ok
 
> Also, I think it's reasonable to fetch the current profile in use, so
> GETLINK ought to report these things.  To make it generic we can
> maintain the settings given to us in software, hung off of the netdev
> struct, and simply report that during GETLINK.

We'd need an array of struct ifla_vf_port_profile hanging off of netdev, one
element for each VF.  That seems like a lot of mem hanging off of netdev,
and we'd have to define a MAX_VF to size the array.  How about a
ndo_get_vf_port_profile() that the netdev fills in, and the netdev keeps the
data in it's private area?  That's how ndo_get_vf_config() is working.
 
-scott


^ permalink raw reply

* Re: [PATCH net-next-2.6] pppoe: use phonet_pernet instead of directly net_generic
From: Jiri Pirko @ 2010-04-26 18:45 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20100426.111747.115930051.davem@davemloft.net>

Mon, Apr 26, 2010 at 08:17:47PM CEST, davem@davemloft.net wrote:
>From: Jiri Pirko <jpirko@redhat.com>
>Date: Mon, 26 Apr 2010 15:41:00 +0200
>
>> As in for example pppoe introduce phonet_pernet and use it instead of calling
>> net_generic directly.
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
>Applied, and I modified your commit header line slightly.  This isn't
>a change to "pppoe: " but rather "phonet: " :-)

Ups, copy/paste mistake. Thanks.
>
>Thanks.

^ permalink raw reply

* Re: [PATCH] RCU: don't turn off lockdep when find suspicious rcu_dereference_check() usage
From: Eric W. Biederman @ 2010-04-26 18:35 UTC (permalink / raw)
  To: paulmck
  Cc: Miles Lane, Vivek Goyal, Eric Paris, Lai Jiangshan, Ingo Molnar,
	Peter Zijlstra, LKML, nauman, eric.dumazet, netdev, Jens Axboe,
	Gui Jianfeng, Li Zefan, Johannes Berg
In-Reply-To: <20100426160925.GD2529@linux.vnet.ibm.com>

"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> writes:

> Eric Dumazet traced these down to a commit from Eric Biederman.
>
> If I don't hear from Eric Biederman in a few days, I will attempt a
> patch, but it would be more likely to be correct coming from someone
> with a better understanding of the code.  ;-)

I already replied.

http://lkml.org/lkml/2010/4/21/420

Eric

^ permalink raw reply

* Re: [patch v2] bluetooth: handle l2cap_create_connless_pdu() errors
From: Gustavo F. Padovan @ 2010-04-26 18:27 UTC (permalink / raw)
  To: David Miller
  Cc: error27-Re5JQEeQqe8AvxtiuMwx3w, marcel-kz+m5ild9QBg9hUCZPvPmw,
	andrei.emeltchenko-xNZwKgViW5gAvxtiuMwx3w,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	kernel-janitors-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20100426.111259.112594696.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

Hi David,

* David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> [2010-04-26 11:12:59 -0700]:

> From: "Gustavo F. Padovan" <gustavo-THi1TnShQwVAfugRpC6u6w@public.gmane.org>
> Date: Mon, 26 Apr 2010 12:09:19 -0300
> 
> > Hi Dan,
> > 
> > * Dan Carpenter <error27-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> [2010-04-26 13:36:27 +0200]:
> > 
> >> l2cap_create_connless_pdu() can sometimes return ERR_PTR(-ENOMEM) or
> >> ERR_PTR(-EFAULT).
> >> 
> >> Signed-off-by: Dan Carpenter <error27-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> >> ---
> >> In v2 I wrote the patch on top of Gustavo Padovon's devel tree
> 
> This is the kind of bug that could cause a crash if the path actually
> executes.
> 
> Therefore it tires me that that submitter was told to regenerate this
> patch against some devel tree that is -next bound, when in fact this
> is the kind of fix that warrants inclusion right now into net-2.6

My bad here. So I think we should pick the first version of the Dan's patch. 
It applies against bluetooth-testing right now and then against net-2.6 too.

Marcel, is that ok to you?

> 
> Marcel, please do whatever magic you need to so I can get this into
> Linus's tree as I did the rest of the ERR_PTR() fixes from Dan already.
> No reason to treat Bluetooth special and defer these fixes to -next.
> 
> Thanks.

-- 
Gustavo F. Padovan
http://padovan.org

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox