netdev.vger.kernel.org archive mirror
* [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators
@ 2013-10-28 22:44 Michael Dalton
  2013-10-28 23:19 ` Eric Dumazet
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Michael Dalton @ 2013-10-28 22:44 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Eric Dumazet, Rusty Russell, Michael S. Tsirkin,
	Daniel Borkmann, virtualization, Michael Dalton

The virtio_net driver's mergeable receive buffer allocator
uses 4KB packet buffers. For MTU-sized traffic, SKB truesize
is > 4KB but only ~1500 bytes of the buffer is used to store
packet data, reducing the effective TCP window size
substantially. This patch addresses the performance concerns
with mergeable receive buffers by allocating MTU-sized packet
buffers using page frag allocators. If more than MAX_SKB_FRAGS
buffers are needed, the SKB frag_list is used.

Signed-off-by: Michael Dalton <mwdalton@google.com>
---
 drivers/net/virtio_net.c | 164 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 106 insertions(+), 58 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 9fbdfcd..113ee93 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -124,6 +124,11 @@ struct virtnet_info {
 	/* Lock for config space updates */
 	struct mutex config_lock;
 
+	/* Page_frag for GFP_KERNEL packet buffer allocation when we run
+	 * low on memory.
+	 */
+	struct page_frag alloc_frag;
+
 	/* Does the affinity hint is set for virtqueues? */
 	bool affinity_hint_set;
 
@@ -217,33 +222,18 @@ static void skb_xmit_done(struct virtqueue *vq)
 	netif_wake_subqueue(vi->dev, vq2txq(vq));
 }
 
-static void set_skb_frag(struct sk_buff *skb, struct page *page,
-			 unsigned int offset, unsigned int *len)
-{
-	int size = min((unsigned)PAGE_SIZE - offset, *len);
-	int i = skb_shinfo(skb)->nr_frags;
-
-	__skb_fill_page_desc(skb, i, page, offset, size);
-
-	skb->data_len += size;
-	skb->len += size;
-	skb->truesize += PAGE_SIZE;
-	skb_shinfo(skb)->nr_frags++;
-	skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-	*len -= size;
-}
-
 /* Called from bottom half context */
 static struct sk_buff *page_to_skb(struct receive_queue *rq,
-				   struct page *page, unsigned int len)
+				   struct page *page, unsigned int offset,
+				   unsigned int len, unsigned int truesize)
 {
 	struct virtnet_info *vi = rq->vq->vdev->priv;
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
-	unsigned int copy, hdr_len, offset;
+	unsigned int copy, hdr_len, hdr_padded_len;
 	char *p;
 
-	p = page_address(page);
+	p = page_address(page) + offset;
 
 	/* copy small packet so we can reuse these pages for small data */
 	skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);
@@ -254,16 +244,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
 
 	if (vi->mergeable_rx_bufs) {
 		hdr_len = sizeof hdr->mhdr;
-		offset = hdr_len;
+		hdr_padded_len = sizeof hdr->mhdr;
 	} else {
 		hdr_len = sizeof hdr->hdr;
-		offset = sizeof(struct padded_vnet_hdr);
+		hdr_padded_len = sizeof(struct padded_vnet_hdr);
 	}
 
 	memcpy(hdr, p, hdr_len);
 
 	len -= hdr_len;
-	p += offset;
+	offset += hdr_padded_len;
+	p += hdr_padded_len;
 
 	copy = len;
 	if (copy > skb_tailroom(skb))
@@ -273,6 +264,14 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
 	len -= copy;
 	offset += copy;
 
+	if (vi->mergeable_rx_bufs) {
+		if (len)
+			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
+		else
+			put_page(page);
+		return skb;
+	}
+
 	/*
 	 * Verify that we can indeed put this data into a skb.
 	 * This is here to handle cases when the device erroneously
@@ -284,9 +283,12 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
 		dev_kfree_skb(skb);
 		return NULL;
 	}
-
+	BUG_ON(offset >= PAGE_SIZE);
 	while (len) {
-		set_skb_frag(skb, page, offset, &len);
+		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
+		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
+				frag_size, truesize);
+		len -= frag_size;
 		page = (struct page *)page->private;
 		offset = 0;
 	}
@@ -297,33 +299,52 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
 	return skb;
 }
 
-static int receive_mergeable(struct receive_queue *rq, struct sk_buff *skb)
+static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
 {
-	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
+	struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb);
+	struct sk_buff *curr_skb = head_skb;
+	char *buf;
 	struct page *page;
-	int num_buf, i, len;
+	int num_buf, len;
 
 	num_buf = hdr->mhdr.num_buffers;
 	while (--num_buf) {
-		i = skb_shinfo(skb)->nr_frags;
-		if (i >= MAX_SKB_FRAGS) {
-			pr_debug("%s: packet too long\n", skb->dev->name);
-			skb->dev->stats.rx_length_errors++;
-			return -EINVAL;
-		}
-		page = virtqueue_get_buf(rq->vq, &len);
-		if (!page) {
+		int num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
+		buf = virtqueue_get_buf(rq->vq, &len);
+		if (unlikely(!buf)) {
 			pr_debug("%s: rx error: %d buffers missing\n",
-				 skb->dev->name, hdr->mhdr.num_buffers);
-			skb->dev->stats.rx_length_errors++;
+				 head_skb->dev->name, hdr->mhdr.num_buffers);
+			head_skb->dev->stats.rx_length_errors++;
 			return -EINVAL;
 		}
-
-		if (len > PAGE_SIZE)
-			len = PAGE_SIZE;
-
-		set_skb_frag(skb, page, 0, &len);
-
+		if (unlikely(len > MAX_PACKET_LEN)) {
+			pr_debug("%s: rx error: merge buffer too long\n",
+				 head_skb->dev->name);
+			len = MAX_PACKET_LEN;
+		}
+		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
+			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
+			if (unlikely(!nskb)) {
+				head_skb->dev->stats.rx_dropped++;
+				return -ENOMEM;
+			}
+			if (curr_skb == head_skb)
+				skb_shinfo(curr_skb)->frag_list = nskb;
+			else
+				curr_skb->next = nskb;
+			curr_skb = nskb;
+			head_skb->truesize += nskb->truesize;
+			num_skb_frags = 0;
+		}
+		if (curr_skb != head_skb) {
+			head_skb->data_len += len;
+			head_skb->len += len;
+			head_skb->truesize += MAX_PACKET_LEN;
+		}
+		page = virt_to_head_page(buf);
+		skb_add_rx_frag(curr_skb, num_skb_frags, page,
+				buf - (char *)page_address(page), len,
+				MAX_PACKET_LEN);
 		--rq->num;
 	}
 	return 0;
@@ -341,8 +362,10 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
 		pr_debug("%s: short packet %i\n", dev->name, len);
 		dev->stats.rx_length_errors++;
-		if (vi->mergeable_rx_bufs || vi->big_packets)
+		if (vi->big_packets)
 			give_pages(rq, buf);
+		else if (vi->mergeable_rx_bufs)
+			put_page(virt_to_head_page(buf));
 		else
 			dev_kfree_skb(buf);
 		return;
@@ -352,19 +375,28 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 		skb = buf;
 		len -= sizeof(struct virtio_net_hdr);
 		skb_trim(skb, len);
+	} else if (vi->mergeable_rx_bufs) {
+		struct page *page = virt_to_head_page(buf);
+		skb = page_to_skb(rq, page,
+				  (char *)buf - (char *)page_address(page),
+				  len, MAX_PACKET_LEN);
+		if (unlikely(!skb)) {
+			dev->stats.rx_dropped++;
+			put_page(page);
+			return;
+		}
+		if (receive_mergeable(rq, skb)) {
+			dev_kfree_skb(skb);
+			return;
+		}
 	} else {
 		page = buf;
-		skb = page_to_skb(rq, page, len);
+		skb = page_to_skb(rq, page, 0, len, PAGE_SIZE);
 		if (unlikely(!skb)) {
 			dev->stats.rx_dropped++;
 			give_pages(rq, page);
 			return;
 		}
-		if (vi->mergeable_rx_bufs)
-			if (receive_mergeable(rq, skb)) {
-				dev_kfree_skb(skb);
-				return;
-			}
 	}
 
 	hdr = skb_vnet_hdr(skb);
@@ -501,18 +533,28 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 
 static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
-	struct page *page;
+	struct virtnet_info *vi = rq->vq->vdev->priv;
+	char *buf = NULL;
 	int err;
 
-	page = get_a_page(rq, gfp);
-	if (!page)
+	if (gfp & __GFP_WAIT) {
+		if (skb_page_frag_refill(MAX_PACKET_LEN, &vi->alloc_frag,
+					 gfp)) {
+			buf = (char *)page_address(vi->alloc_frag.page) +
+			      vi->alloc_frag.offset;
+			get_page(vi->alloc_frag.page);
+			vi->alloc_frag.offset += MAX_PACKET_LEN;
+		}
+	} else {
+		buf = netdev_alloc_frag(MAX_PACKET_LEN);
+	}
+	if (!buf)
 		return -ENOMEM;
 
-	sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
-
-	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp);
+	sg_init_one(rq->sg, buf, MAX_PACKET_LEN);
+	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
 	if (err < 0)
-		give_pages(rq, page);
+		put_page(virt_to_head_page(buf));
 
 	return err;
 }
@@ -1343,8 +1385,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
 		struct virtqueue *vq = vi->rq[i].vq;
 
 		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
-			if (vi->mergeable_rx_bufs || vi->big_packets)
+			if (vi->big_packets)
 				give_pages(&vi->rq[i], buf);
+			else if (vi->mergeable_rx_bufs)
+				put_page(virt_to_head_page(buf));
 			else
 				dev_kfree_skb(buf);
 			--vi->rq[i].num;
@@ -1650,6 +1694,8 @@ free_recv_bufs:
 free_vqs:
 	cancel_delayed_work_sync(&vi->refill);
 	virtnet_del_vqs(vi);
+	if (vi->alloc_frag.page)
+		put_page(vi->alloc_frag.page);
 free_index:
 	free_percpu(vi->vq_index);
 free_stats:
@@ -1685,6 +1731,8 @@ static void virtnet_remove(struct virtio_device *vdev)
 	unregister_netdev(vi->dev);
 
 	remove_vq_common(vi);
+	if (vi->alloc_frag.page)
+		put_page(vi->alloc_frag.page);
 
 	flush_work(&vi->config_work);
 
-- 
1.8.4.1
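
A back-of-the-envelope sketch of what the new frag_list path implies for a
maximal GSO packet, as standalone C (numbers assume a 1500-byte MTU, 4KB
pages, and MAX_SKB_FRAGS = 17; none of these constants are spelled out in
the patch itself):

	#include <stdio.h>

	int main(void)
	{
		/* MAX_PACKET_LEN = ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN */
		const int buf_len = 14 + 4 + 1500;
		const int gso_bytes = 65535;	/* maximal GSO packet */
		const int max_skb_frags = 17;

		/* descriptors consumed by one maximal GSO packet */
		int bufs = (gso_bytes + buf_len - 1) / buf_len;		/* 44 */
		/* skbs chained via frag_list once a frag array fills */
		int skbs = (bufs + max_skb_frags - 1) / max_skb_frags;	/* 3 */

		printf("%d buffers across %d chained skbs\n", bufs, skbs);
		return 0;
	}

With the old 4KB buffers the same packet fit in roughly 17 descriptors with
no frag_list chaining, which is the trade-off Jason Wang measures below.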


* Re: [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators
  2013-10-28 22:44 [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators Michael Dalton
@ 2013-10-28 23:19 ` Eric Dumazet
  2013-10-29  3:57   ` David Miller
  2013-10-29  7:30   ` Daniel Borkmann
  2013-10-29  1:51 ` Rusty Russell
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 12+ messages in thread
From: Eric Dumazet @ 2013-10-28 23:19 UTC (permalink / raw)
  To: Michael Dalton
  Cc: Francesco Fusco, Michael S. Tsirkin, netdev, Daniel Borkmann,
	virtualization, Eric Dumazet, David S. Miller

On Mon, 2013-10-28 at 15:44 -0700, Michael Dalton wrote:
> The virtio_net driver's mergeable receive buffer allocator
> uses 4KB packet buffers. For MTU-sized traffic, SKB truesize
> is > 4KB but only ~1500 bytes of the buffer is used to store
> packet data, reducing the effective TCP window size
> substantially. This patch addresses the performance concerns
> with mergeable receive buffers by allocating MTU-sized packet
> buffers using page frag allocators. If more than MAX_SKB_FRAGS
> buffers are needed, the SKB frag_list is used.
> 
> Signed-off-by: Michael Dalton <mwdalton@google.com>
> ---

Signed-off-by: Eric Dumazet <edumazet@google.com>

Daniel & Francesco, this should address the performance problem you
tried to address with ("tcp: rcvbuf autotuning improvements") 

( http://www.spinics.net/lists/netdev/msg252642.html )

Thanks !


* Re: [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators
  2013-10-28 22:44 [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators Michael Dalton
  2013-10-28 23:19 ` Eric Dumazet
@ 2013-10-29  1:51 ` Rusty Russell
  2013-10-29  6:27 ` Jason Wang
  2013-10-29 18:44 ` Eric Northup
  3 siblings, 0 replies; 12+ messages in thread
From: Rusty Russell @ 2013-10-29  1:51 UTC (permalink / raw)
  To: Michael Dalton, David S. Miller
  Cc: netdev, Eric Dumazet, Michael S. Tsirkin, Daniel Borkmann,
	virtualization, Michael Dalton

Michael Dalton <mwdalton@google.com> writes:
> The virtio_net driver's mergeable receive buffer allocator
> uses 4KB packet buffers. For MTU-sized traffic, SKB truesize
> is > 4KB but only ~1500 bytes of the buffer is used to store
> packet data, reducing the effective TCP window size
> substantially. This patch addresses the performance concerns
> with mergeable receive buffers by allocating MTU-sized packet
> buffers using page frag allocators. If more than MAX_SKB_FRAGS
> buffers are needed, the SKB frag_list is used.
>
> Signed-off-by: Michael Dalton <mwdalton@google.com>

Hi Michael,

        Nice work!  Just one comment.  Your patch highlights the
anachronistic name MAX_PACKET_LEN: it dates from the original
implementation, which only supported 1500-byte packets, and it is now
only used in one place.

Please apply a first patch like this, then come up with a new constant
name (GOOD_PACKET_LEN?) for that value, because it's not the maximum
packet we can receive with mergeable buffers.

Thanks,
Rusty.

Subject: virtio_net: remove anachronistic MAX_PACKET_LEN constant.
From: Rusty Russell <rusty@rustcorp.com.au>

The initial implementation of virtio_net only allowed ethernet-style
MTU packets; with more recent features this isn't true.  Move the
constant into the function where it's used.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 057ea13..dcbfccd 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -35,8 +35,6 @@ static bool csum = true, gso = true;
 module_param(csum, bool, 0444);
 module_param(gso, bool, 0444);
 
-/* FIXME: MTU in config. */
-#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 #define GOOD_COPY_LEN	128
 
 #define VIRTNET_DRIVER_VERSION "1.0.0"
@@ -434,12 +432,13 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
 	int err;
+	const int len = ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN;
 
-	skb = __netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN, gfp);
+	skb = __netdev_alloc_skb_ip_align(vi->dev, len, gfp);
 	if (unlikely(!skb))
 		return -ENOMEM;
 
-	skb_put(skb, MAX_PACKET_LEN);
+	skb_put(skb, len);
 
 	hdr = skb_vnet_hdr(skb);
 	sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);


* Re: [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators
  2013-10-28 23:19 ` Eric Dumazet
@ 2013-10-29  3:57   ` David Miller
  2013-10-29 19:12     ` Michael S. Tsirkin
  2013-10-29  7:30   ` Daniel Borkmann
  1 sibling, 1 reply; 12+ messages in thread
From: David Miller @ 2013-10-29  3:57 UTC (permalink / raw)
  To: eric.dumazet
  Cc: ffusco, mwdalton, mst, netdev, dborkman, virtualization, edumazet

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 28 Oct 2013 16:19:49 -0700

> On Mon, 2013-10-28 at 15:44 -0700, Michael Dalton wrote:
>> The virtio_net driver's mergeable receive buffer allocator
>> uses 4KB packet buffers. For MTU-sized traffic, SKB truesize
>> is > 4KB but only ~1500 bytes of the buffer is used to store
>> packet data, reducing the effective TCP window size
>> substantially. This patch addresses the performance concerns
>> with mergeable receive buffers by allocating MTU-sized packet
>> buffers using page frag allocators. If more than MAX_SKB_FRAGS
>> buffers are needed, the SKB frag_list is used.
>> 
>> Signed-off-by: Michael Dalton <mwdalton@google.com>
>> ---
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> 
> Daniel & Francesco, this should address the performance problem you
> tried to address with ("tcp: rcvbuf autotuning improvements") 
> 
> ( http://www.spinics.net/lists/netdev/msg252642.html )

Applied, thanks everyone.


* Re: [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators
  2013-10-28 22:44 [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators Michael Dalton
  2013-10-28 23:19 ` Eric Dumazet
  2013-10-29  1:51 ` Rusty Russell
@ 2013-10-29  6:27 ` Jason Wang
  2013-10-29 18:44 ` Eric Northup
  3 siblings, 0 replies; 12+ messages in thread
From: Jason Wang @ 2013-10-29  6:27 UTC (permalink / raw)
  To: Michael Dalton, David S. Miller
  Cc: Michael S. Tsirkin, netdev, Daniel Borkmann, virtualization,
	Eric Dumazet

On 10/29/2013 06:44 AM, Michael Dalton wrote:
> The virtio_net driver's mergeable receive buffer allocator
> uses 4KB packet buffers. For MTU-sized traffic, SKB truesize
> is > 4KB but only ~1500 bytes of the buffer is used to store
> packet data, reducing the effective TCP window size
> substantially. This patch addresses the performance concerns
> with mergeable receive buffers by allocating MTU-sized packet
> buffers using page frag allocators. If more than MAX_SKB_FRAGS
> buffers are needed, the SKB frag_list is used.
>
> Signed-off-by: Michael Dalton <mwdalton@google.com>

Do you have any perf numbers for this patch? Having a glance at the patch,
it looks like it will hurt the performance of large GSO packets. Always
allocating 1500 bytes means a virtqueue can only hold about 4 full-size
GSO packets, and the frag list will introduce extra overhead on skb
allocation. I just tested it on my desktop; performance of full-size GSO
packets drops about 10%.

Mergeable buffers are a good balance between performance and the space
they occupy. If you just want ~1500-byte packets to be received, you can
just disable mergeable buffers and use small packets.

Alternatively, a simpler way is to just use build_skb() and the head frag
to build the skb directly on the page for medium-size packets (small
packets are still copied), like the following patch (which may need more
work since the vnet header is too short for NET_SKB_PAD). This can reduce
the truesize while keeping the performance of large GSO packets.

---
 drivers/net/virtio_net.c |   48 ++++++++++++++++++++++++++++++---------------
 1 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 3c23fdc..4c7ad89 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -239,16 +239,12 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
     struct skb_vnet_hdr *hdr;
     unsigned int copy, hdr_len, offset;
     char *p;
+    int skb_size = SKB_DATA_ALIGN(len) +
+               SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+    bool frag;
 
     p = page_address(page);
 
-    /* copy small packet so we can reuse these pages for small data */
-    skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);
-    if (unlikely(!skb))
-        return NULL;
-
-    hdr = skb_vnet_hdr(skb);
-
     if (vi->mergeable_rx_bufs) {
         hdr_len = sizeof hdr->mhdr;
         offset = hdr_len;
@@ -257,18 +253,38 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
         offset = sizeof(struct padded_vnet_hdr);
     }
 
-    memcpy(hdr, p, hdr_len);
-
     len -= hdr_len;
     p += offset;
 
-    copy = len;
-    if (copy > skb_tailroom(skb))
-        copy = skb_tailroom(skb);
-    memcpy(skb_put(skb, copy), p, copy);
+    frag = (len > GOOD_COPY_LEN) && (skb_size <= PAGE_SIZE) &&
+        vi->mergeable_rx_bufs;
+    if (frag) {
+        skb = build_skb(page_address(page), skb_size);
+        if (unlikely(!skb))
+            return NULL;
+
+        skb_reserve(skb, offset);
+        skb_put(skb, len);
+        len = 0;
+    } else {
+        /* copy small packet so we can reuse these pages for small data
+         */
+        skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);
+        if (unlikely(!skb))
+            return NULL;
+
+        copy = len;
+        if (copy > skb_tailroom(skb))
+            copy = skb_tailroom(skb);
+        memcpy(skb_put(skb, copy), p, copy);
+
+        len -= copy;
+        offset += copy;
+    }
+
+    hdr = skb_vnet_hdr(skb);
 
-    len -= copy;
-    offset += copy;
+    memcpy(hdr, page_address(page), hdr_len);
 
     /*
      * Verify that we can indeed put this data into a skb.
@@ -288,7 +304,7 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
         offset = 0;
     }
 
-    if (page)
+    if (page && !frag)
         give_pages(rq, page);
 
     return skb;
-- 
1.7.1


* Re: [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators
  2013-10-28 23:19 ` Eric Dumazet
  2013-10-29  3:57   ` David Miller
@ 2013-10-29  7:30   ` Daniel Borkmann
  1 sibling, 0 replies; 12+ messages in thread
From: Daniel Borkmann @ 2013-10-29  7:30 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Michael Dalton, David S. Miller, netdev, Eric Dumazet,
	Rusty Russell, Michael S. Tsirkin, virtualization,
	Francesco Fusco

On 10/29/2013 12:19 AM, Eric Dumazet wrote:
> On Mon, 2013-10-28 at 15:44 -0700, Michael Dalton wrote:
>> The virtio_net driver's mergeable receive buffer allocator
>> uses 4KB packet buffers. For MTU-sized traffic, SKB truesize
>> is > 4KB but only ~1500 bytes of the buffer is used to store
>> packet data, reducing the effective TCP window size
>> substantially. This patch addresses the performance concerns
>> with mergeable receive buffers by allocating MTU-sized packet
>> buffers using page frag allocators. If more than MAX_SKB_FRAGS
>> buffers are needed, the SKB frag_list is used.
>>
>> Signed-off-by: Michael Dalton <mwdalton@google.com>
>> ---
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
>
> Daniel & Francesco, this should address the performance problem you
> tried to address with ("tcp: rcvbuf autotuning improvements")

That's awesome, thanks everyone !

> ( http://www.spinics.net/lists/netdev/msg252642.html )
>
> Thanks !


* Re: [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators
  2013-10-28 22:44 [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators Michael Dalton
                   ` (2 preceding siblings ...)
  2013-10-29  6:27 ` Jason Wang
@ 2013-10-29 18:44 ` Eric Northup
  2013-10-29 19:05   ` Michael Dalton
  2013-10-29 19:05   ` Michael S. Tsirkin
  3 siblings, 2 replies; 12+ messages in thread
From: Eric Northup @ 2013-10-29 18:44 UTC (permalink / raw)
  To: Michael Dalton
  Cc: Michael S. Tsirkin, netdev, Eric Dumazet, lf-virt,
	Daniel Borkmann, David S. Miller

On Mon, Oct 28, 2013 at 10:44 PM, Michael Dalton <mwdalton@google.com> wrote:
> The virtio_net driver's mergeable receive buffer allocator
> uses 4KB packet buffers. For MTU-sized traffic, SKB truesize
> is > 4KB but only ~1500 bytes of the buffer is used to store
> packet data, reducing the effective TCP window size
> substantially. This patch addresses the performance concerns
> with mergeable receive buffers by allocating MTU-sized packet
> buffers using page frag allocators. If more than MAX_SKB_FRAGS
> buffers are needed, the SKB frag_list is used.

This looks really good, just one nit below -- should we add MTU-sized
buffers, or MTU+sizeof(virtnet_hdr)-sized buffers?

Reviewed-by: Eric Northup <digitaleric@google.com>

> [...]
> @@ -501,18 +533,28 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
>
>  static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>  {
> -       struct page *page;
> +       struct virtnet_info *vi = rq->vq->vdev->priv;
> +       char *buf = NULL;
>         int err;
>
> -       page = get_a_page(rq, gfp);
> -       if (!page)
> +       if (gfp & __GFP_WAIT) {

Shouldn't this be MAX_PACKET_LEN + sizeof(virtio_net_hdr_mrg_rxbuf)?

That way, we can receive an MTU-sized packet while only consuming a
single queue entry.
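
To make the nit concrete, a minimal sketch of the suggested sizing (the
MERGE_BUFFER_LEN name is hypothetical, not from any posted patch):

	/* Illustrative only: size each mergeable receive buffer to hold
	 * the mergeable virtio-net header plus a full MTU frame, so an
	 * MTU-sized packet consumes exactly one descriptor.
	 */
	#define MERGE_BUFFER_LEN (MAX_PACKET_LEN + \
				  sizeof(struct virtio_net_hdr_mrg_rxbuf))

	if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag, gfp)) {
		...
		vi->alloc_frag.offset += MERGE_BUFFER_LEN;
	}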

> +               if (skb_page_frag_refill(MAX_PACKET_LEN, &vi->alloc_frag,
> +                                        gfp)) {
> +                       buf = (char *)page_address(vi->alloc_frag.page) +
> +                             vi->alloc_frag.offset;
> +                       get_page(vi->alloc_frag.page);
> +                       vi->alloc_frag.offset += MAX_PACKET_LEN;
> +               }
> +       } else {
> +               buf = netdev_alloc_frag(MAX_PACKET_LEN);
> +       }
> +       if (!buf)
>                 return -ENOMEM;
>
> -       sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
> -
> -       err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp);
> +       sg_init_one(rq->sg, buf, MAX_PACKET_LEN);
> +       err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
>         if (err < 0)
> -               give_pages(rq, page);
> +               put_page(virt_to_head_page(buf));
>
>         return err;
>  }
> [...]


* Re: [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators
  2013-10-29 18:44 ` Eric Northup
@ 2013-10-29 19:05   ` Michael Dalton
  2013-10-29 19:17     ` Michael S. Tsirkin
  2013-10-30  4:41     ` Jason Wang
  2013-10-29 19:05   ` Michael S. Tsirkin
  1 sibling, 2 replies; 12+ messages in thread
From: Michael Dalton @ 2013-10-29 19:05 UTC (permalink / raw)
  To: Eric Northup
  Cc: David S. Miller, Michael S. Tsirkin, netdev, Daniel Borkmann,
	lf-virt, Eric Dumazet

Agreed Eric, the buffer size should be increased so that we can accommodate an
MTU-sized packet plus the mergeable virtio-net header in a single buffer. I will
send a patch shortly, cleaning up the #define headers as Rusty indicated and
increasing the buffer size by the virtio-net header size, per Eric.

Jason, I'll follow up with you directly - I'd like to know your exact workload
(single-stream or multi-stream netperf?), VM configuration, etc., and also see if
the nit that Eric has pointed out affects your results. It is also worth noting
that we may want to tune the queue sizes for your benchmarks: by reducing the
buffer size from 4KB to MTU-sized while keeping the queue length constant, we're
implicitly decreasing the number of bytes stored in the virtqueue for the
virtio-net device, so increasing the queue size may help.

Best,

Mike


* Re: [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators
  2013-10-29 18:44 ` Eric Northup
  2013-10-29 19:05   ` Michael Dalton
@ 2013-10-29 19:05   ` Michael S. Tsirkin
  1 sibling, 0 replies; 12+ messages in thread
From: Michael S. Tsirkin @ 2013-10-29 19:05 UTC (permalink / raw)
  To: Eric Northup
  Cc: Michael Dalton, netdev, Eric Dumazet, lf-virt, Daniel Borkmann,
	David S. Miller

On Tue, Oct 29, 2013 at 06:44:40PM +0000, Eric Northup wrote:
> On Mon, Oct 28, 2013 at 10:44 PM, Michael Dalton <mwdalton@google.com> wrote:
> > The virtio_net driver's mergeable receive buffer allocator
> > uses 4KB packet buffers. For MTU-sized traffic, SKB truesize
> > is > 4KB but only ~1500 bytes of the buffer is used to store
> > packet data, reducing the effective TCP window size
> > substantially. This patch addresses the performance concerns
> > with mergeable receive buffers by allocating MTU-sized packet
> > buffers using page frag allocators. If more than MAX_SKB_FRAGS
> > buffers are needed, the SKB frag_list is used.
> 
> This looks really good, just one nit below -- should we add MTU-sized
> buffers, or MTU+sizeof(virtnet_hdr)-sized buffers?
> 
> Reviewed-by: Eric Northup <digitaleric@google.com>

Yea, except it increases the typical number of buffers per TSO
packet by a factor of 2.5, so it's almost sure to hurt performance
for some workloads.

Maybe the right thing to do is to pass a mix of small and big
buffers to the device; the device can pick the right size depending
on the size of the packet it has.

What to do for existing devices? I'm not sure. The TCP receive buffer
is at least tunable.  I guess we could choose the buffer size depending
on the default receive buffer, and/or maybe add a module parameter ...
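
A rough sketch of the mixed-buffer idea, purely illustrative (the
add_recvbuf_len() helper is hypothetical, and today's devices consume
buffers in ring order rather than picking one by size):

	/* Hypothetical: alternate MTU-sized and 4K buffers on refill so
	 * both sizes are available to the device.
	 */
	static int add_recvbuf_mixed(struct receive_queue *rq, gfp_t gfp)
	{
		static bool big;

		big = !big;
		return add_recvbuf_len(rq, big ? PAGE_SIZE : MAX_PACKET_LEN,
				       gfp);
	}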

> > [...]


* Re: [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators
  2013-10-29  3:57   ` David Miller
@ 2013-10-29 19:12     ` Michael S. Tsirkin
  0 siblings, 0 replies; 12+ messages in thread
From: Michael S. Tsirkin @ 2013-10-29 19:12 UTC (permalink / raw)
  To: David Miller
  Cc: ffusco, mwdalton, eric.dumazet, netdev, dborkman, virtualization,
	edumazet

On Mon, Oct 28, 2013 at 11:57:21PM -0400, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Mon, 28 Oct 2013 16:19:49 -0700
> 
> > On Mon, 2013-10-28 at 15:44 -0700, Michael Dalton wrote:
> >> The virtio_net driver's mergeable receive buffer allocator
> >> uses 4KB packet buffers. For MTU-sized traffic, SKB truesize
> >> is > 4KB but only ~1500 bytes of the buffer is used to store
> >> packet data, reducing the effective TCP window size
> >> substantially. This patch addresses the performance concerns
> >> with mergeable receive buffers by allocating MTU-sized packet
> >> buffers using page frag allocators. If more than MAX_SKB_FRAGS
> >> buffers are needed, the SKB frag_list is used.
> >> 
> >> Signed-off-by: Michael Dalton <mwdalton@google.com>
> >> ---
> > 
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > 
> > Daniel & Francesco, this should address the performance problem you
> > tried to address with ("tcp: rcvbuf autotuning improvements") 
> > 
> > ( http://www.spinics.net/lists/netdev/msg252642.html )
> 
> Applied, thanks everyone.

Hmm, that was very quick. It *should* fix a bug, great.
But how about we wait for some Tested-by reports that it actually does?

-- 
MST


* Re: [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators
  2013-10-29 19:05   ` Michael Dalton
@ 2013-10-29 19:17     ` Michael S. Tsirkin
  2013-10-30  4:41     ` Jason Wang
  1 sibling, 0 replies; 12+ messages in thread
From: Michael S. Tsirkin @ 2013-10-29 19:17 UTC (permalink / raw)
  To: Michael Dalton
  Cc: netdev, Eric Dumazet, lf-virt, Daniel Borkmann, David S. Miller

On Tue, Oct 29, 2013 at 12:05:27PM -0700, Michael Dalton wrote:
> Agreed Eric, the buffer size should be increased so that we can accommodate an
> MTU-sized packet plus the mergeable virtio-net header in a single buffer. I will
> send a patch shortly, cleaning up the #define headers as Rusty indicated and
> increasing the buffer size by the virtio-net header size, per Eric.
> 
> Jason, I'll follow up with you directly - I'd like to know your exact workload
> (single-stream or multi-stream netperf?), VM configuration, etc., and also see if
> the nit that Eric has pointed out affects your results. It is also worth noting
> that we may want to tune the queue sizes for your benchmarks: by reducing the
> buffer size from 4KB to MTU-sized while keeping the queue length constant, we're
> implicitly decreasing the number of bytes stored in the virtqueue for the
> virtio-net device, so increasing the queue size may help.
> 
> Best,
> 
> Mike

Well, we have 256 descriptors per queue; each descriptor is 16 bytes
already, and they have to be physically contiguous (a 1024-entry ring
needs 16KB of contiguous descriptor space alone). I don't think we can
easily increase the queue size much more without risking memory
allocation failures on busy systems.

I guess one approach is to do something like:
	 if (queue size > 1024)
		 use small buffers
	 else
		 use 4K buffers.

That would reduce the risk of regressions for existing users.
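
In C, that heuristic might look like the sketch below (illustrative only;
virtqueue_get_vring_size() is an existing helper, but the threshold and
placement are not from any posted patch):

	static unsigned int mergeable_buf_len(struct receive_queue *rq)
	{
		/* Small rings keep 4K buffers so total buffering stays
		 * large; big rings can afford MTU-sized buffers.
		 */
		if (virtqueue_get_vring_size(rq->vq) > 1024)
			return MAX_PACKET_LEN;	/* small buffers */
		return PAGE_SIZE;		/* 4K buffers */
	}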


-- 
MST


* Re: [PATCH net-next] virtio_net: migrate mergeable rx buffers to page frag allocators
  2013-10-29 19:05   ` Michael Dalton
  2013-10-29 19:17     ` Michael S. Tsirkin
@ 2013-10-30  4:41     ` Jason Wang
  1 sibling, 0 replies; 12+ messages in thread
From: Jason Wang @ 2013-10-30  4:41 UTC (permalink / raw)
  To: Michael Dalton, Eric Northup
  Cc: Michael S. Tsirkin, netdev, Eric Dumazet, lf-virt,
	Daniel Borkmann, David S. Miller

On 10/30/2013 03:05 AM, Michael Dalton wrote:
> Agreed Eric, the buffer size should be increased so that we can accommodate an
> MTU-sized packet plus the mergeable virtio-net header in a single buffer. I will
> send a patch shortly, cleaning up the #define headers as Rusty indicated and
> increasing the buffer size by the virtio-net header size, per Eric.
>
> Jason, I'll follow up with you directly - I'd like to know your exact workload
> (single-stream or multi-stream netperf?), VM configuration, etc., and also see if
> the nit that Eric has pointed out affects your results.

It's just a single TCP_STREAM from local host to guest with vhost_net
enabled. The host and guest were connected with a bridge.
> It is also worth noting that we may want to tune the queue sizes for your
> benchmarks: by reducing the buffer size from 4KB to MTU-sized while keeping
> the queue length constant, we're implicitly decreasing the number of bytes
> stored in the virtqueue for the virtio-net device, so increasing the queue
> size may help.

The problem is the overhead introduced by the frag refill and the frag
list. Looking at tg3 and bnx2x, frags are only used for small packets,
not for jumbo frames, which do not need a frag list. But your patch
tries to use them for GSO packets. So we'd better solve it at the
device level (e.g. multiple rings with different buffer sizes).

I tried changing the queue size from 256 to 1024. Unfortunately, I
didn't see any improvement. As a workaround, if you care only about
MTU-sized packets and do not want GSO packets to be received, you can
just disable GSO by specifying guest_tso{4|6}=off on the qemu command
line, or we can make it configurable through ethtool.
>
> Best,
>
> Mike

