From: Rusty Russell <rusty@rustcorp.com.au>
To: netdev@vger.kernel.org
Cc: David Miller <davem@davemloft.net>,
virtualization@lists.linux-foundation.org,
Shirley Ma <mashirle@us.ibm.com>,
"Michael S. Tsirkin" <mst@redhat.com>
Subject: [PATCH 2/2] virtio_net: Defer skb allocation in receive path Date: Wed, 13 Jan 2010 12:53:38 -0800
Date: Fri, 29 Jan 2010 23:50:04 +1030 [thread overview]
Message-ID: <201001292350.04544.rusty@rustcorp.com.au> (raw)
In-Reply-To: <201001292349.05360.rusty@rustcorp.com.au>
From: Shirley Ma <mashirle@us.ibm.com>
virtio_net receives packets from its pre-allocated vring buffers, then it
delivers these packets to upper layer protocols as skb buffs. So it's not
necessary to pre-allocate skb for each mergable buffer, then frees extra
skbs when buffers are merged into a large packet. This patch has deferred
skb allocation in receiving packets for both big packets and mergeable buffers
to reduce skb pre-allocations and skb frees. It frees unused buffers by calling
detach_unused_buf in vring, so recv skb queue is not needed.
Signed-off-by: Shirley Ma <xma@us.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
drivers/net/virtio_net.c | 427 +++++++++++++++++++++++++++--------------------
1 file changed, 248 insertions(+), 179 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c708ecc..72b3f21 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -56,8 +56,7 @@ struct virtnet_info
/* Host will merge rx buffers for big packets (shake it! shake it!) */
bool mergeable_rx_bufs;
- /* Receive & send queues. */
- struct sk_buff_head recv;
+ /* Send queue. */
struct sk_buff_head send;
/* Work struct for refilling if we run low on memory. */
@@ -75,34 +74,44 @@ struct skb_vnet_hdr {
unsigned int num_sg;
};
+struct padded_vnet_hdr {
+ struct virtio_net_hdr hdr;
+ /*
+ * virtio_net_hdr should be in a separated sg buffer because of a
+ * QEMU bug, and data sg buffer shares same page with this header sg.
+ * This padding makes next sg 16 byte aligned after virtio_net_hdr.
+ */
+ char padding[6];
+};
+
static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
{
return (struct skb_vnet_hdr *)skb->cb;
}
-static void give_a_page(struct virtnet_info *vi, struct page *page)
-{
- page->private = (unsigned long)vi->pages;
- vi->pages = page;
-}
-
-static void trim_pages(struct virtnet_info *vi, struct sk_buff *skb)
+/*
+ * private is used to chain pages for big packets, put the whole
+ * most recent used list in the beginning for reuse
+ */
+static void give_pages(struct virtnet_info *vi, struct page *page)
{
- unsigned int i;
+ struct page *end;
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- give_a_page(vi, skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->nr_frags = 0;
- skb->data_len = 0;
+ /* Find end of list, sew whole thing into vi->pages. */
+ for (end = page; end->private; end = (struct page *)end->private);
+ end->private = (unsigned long)vi->pages;
+ vi->pages = page;
}
static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
{
struct page *p = vi->pages;
- if (p)
+ if (p) {
vi->pages = (struct page *)p->private;
- else
+ /* clear private here, it is used to chain pages */
+ p->private = 0;
+ } else
p = alloc_page(gfp_mask);
return p;
}
@@ -118,99 +127,142 @@ static void skb_xmit_done(struct virtqueue *svq)
netif_wake_queue(vi->dev);
}
-static void receive_skb(struct net_device *dev, struct sk_buff *skb,
- unsigned len)
+static void set_skb_frag(struct sk_buff *skb, struct page *page,
+ unsigned int offset, unsigned int *len)
{
- struct virtnet_info *vi = netdev_priv(dev);
- struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
- int err;
- int i;
-
- if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
- pr_debug("%s: short packet %i\n", dev->name, len);
- dev->stats.rx_length_errors++;
- goto drop;
- }
+ int i = skb_shinfo(skb)->nr_frags;
+ skb_frag_t *f;
+
+ f = &skb_shinfo(skb)->frags[i];
+ f->size = min((unsigned)PAGE_SIZE - offset, *len);
+ f->page_offset = offset;
+ f->page = page;
+
+ skb->data_len += f->size;
+ skb->len += f->size;
+ skb_shinfo(skb)->nr_frags++;
+ *len -= f->size;
+}
- if (vi->mergeable_rx_bufs) {
- unsigned int copy;
- char *p = page_address(skb_shinfo(skb)->frags[0].page);
+static struct sk_buff *page_to_skb(struct virtnet_info *vi,
+ struct page *page, unsigned int len)
+{
+ struct sk_buff *skb;
+ struct skb_vnet_hdr *hdr;
+ unsigned int copy, hdr_len, offset;
+ char *p;
- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
- len -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ p = page_address(page);
- memcpy(&hdr->mhdr, p, sizeof(hdr->mhdr));
- p += sizeof(hdr->mhdr);
+ /* copy small packet so we can reuse these pages for small data */
+ skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);
+ if (unlikely(!skb))
+ return NULL;
- copy = len;
- if (copy > skb_tailroom(skb))
- copy = skb_tailroom(skb);
+ hdr = skb_vnet_hdr(skb);
- memcpy(skb_put(skb, copy), p, copy);
+ if (vi->mergeable_rx_bufs) {
+ hdr_len = sizeof hdr->mhdr;
+ offset = hdr_len;
+ } else {
+ hdr_len = sizeof hdr->hdr;
+ offset = sizeof(struct padded_vnet_hdr);
+ }
- len -= copy;
+ memcpy(hdr, p, hdr_len);
- if (!len) {
- give_a_page(vi, skb_shinfo(skb)->frags[0].page);
- skb_shinfo(skb)->nr_frags--;
- } else {
- skb_shinfo(skb)->frags[0].page_offset +=
- sizeof(hdr->mhdr) + copy;
- skb_shinfo(skb)->frags[0].size = len;
- skb->data_len += len;
- skb->len += len;
- }
+ len -= hdr_len;
+ p += offset;
- while (--hdr->mhdr.num_buffers) {
- struct sk_buff *nskb;
+ copy = len;
+ if (copy > skb_tailroom(skb))
+ copy = skb_tailroom(skb);
+ memcpy(skb_put(skb, copy), p, copy);
- i = skb_shinfo(skb)->nr_frags;
- if (i >= MAX_SKB_FRAGS) {
- pr_debug("%s: packet too long %d\n", dev->name,
- len);
- dev->stats.rx_length_errors++;
- goto drop;
- }
+ len -= copy;
+ offset += copy;
- nskb = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
- if (!nskb) {
- pr_debug("%s: rx error: %d buffers missing\n",
- dev->name, hdr->mhdr.num_buffers);
- dev->stats.rx_length_errors++;
- goto drop;
- }
+ while (len) {
+ set_skb_frag(skb, page, offset, &len);
+ page = (struct page *)page->private;
+ offset = 0;
+ }
- __skb_unlink(nskb, &vi->recv);
- vi->num--;
+ if (page)
+ give_pages(vi, page);
- skb_shinfo(skb)->frags[i] = skb_shinfo(nskb)->frags[0];
- skb_shinfo(nskb)->nr_frags = 0;
- kfree_skb(nskb);
+ return skb;
+}
- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
+static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb)
+{
+ struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
+ struct page *page;
+ int num_buf, i, len;
+
+ num_buf = hdr->mhdr.num_buffers;
+ while (--num_buf) {
+ i = skb_shinfo(skb)->nr_frags;
+ if (i >= MAX_SKB_FRAGS) {
+ pr_debug("%s: packet too long\n", skb->dev->name);
+ skb->dev->stats.rx_length_errors++;
+ return -EINVAL;
+ }
- skb_shinfo(skb)->frags[i].size = len;
- skb_shinfo(skb)->nr_frags++;
- skb->data_len += len;
- skb->len += len;
+ page = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
+ if (!page) {
+ pr_debug("%s: rx error: %d buffers missing\n",
+ skb->dev->name, hdr->mhdr.num_buffers);
+ skb->dev->stats.rx_length_errors++;
+ return -EINVAL;
}
- } else {
- len -= sizeof(hdr->hdr);
+ if (len > PAGE_SIZE)
+ len = PAGE_SIZE;
+
+ set_skb_frag(skb, page, 0, &len);
+
+ --vi->num;
+ }
+ return 0;
+}
+
+static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct sk_buff *skb;
+ struct page *page;
+ struct skb_vnet_hdr *hdr;
- if (len <= MAX_PACKET_LEN)
- trim_pages(vi, skb);
+ if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
+ pr_debug("%s: short packet %i\n", dev->name, len);
+ dev->stats.rx_length_errors++;
+ if (vi->mergeable_rx_bufs || vi->big_packets)
+ give_pages(vi, buf);
+ else
+ dev_kfree_skb(buf);
+ return;
+ }
- err = pskb_trim(skb, len);
- if (err) {
- pr_debug("%s: pskb_trim failed %i %d\n", dev->name,
- len, err);
+ if (!vi->mergeable_rx_bufs && !vi->big_packets) {
+ skb = buf;
+ len -= sizeof(struct virtio_net_hdr);
+ skb_trim(skb, len);
+ } else {
+ page = buf;
+ skb = page_to_skb(vi, page, len);
+ if (unlikely(!skb)) {
dev->stats.rx_dropped++;
- goto drop;
+ give_pages(vi, page);
+ return;
}
+ if (vi->mergeable_rx_bufs)
+ if (receive_mergeable(vi, skb)) {
+ dev_kfree_skb(skb);
+ return;
+ }
}
+ hdr = skb_vnet_hdr(skb);
skb->truesize += skb->data_len;
dev->stats.rx_bytes += skb->len;
dev->stats.rx_packets++;
@@ -267,110 +319,119 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
frame_err:
dev->stats.rx_frame_errors++;
-drop:
dev_kfree_skb(skb);
}
-static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
{
struct sk_buff *skb;
- struct scatterlist sg[2+MAX_SKB_FRAGS];
- int num, err, i;
- bool oom = false;
-
- sg_init_table(sg, 2+MAX_SKB_FRAGS);
- do {
- struct skb_vnet_hdr *hdr;
+ struct skb_vnet_hdr *hdr;
+ struct scatterlist sg[2];
+ int err;
- skb = netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN);
- if (unlikely(!skb)) {
- oom = true;
- break;
- }
+ skb = netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN);
+ if (unlikely(!skb))
+ return -ENOMEM;
- skb_put(skb, MAX_PACKET_LEN);
+ skb_put(skb, MAX_PACKET_LEN);
- hdr = skb_vnet_hdr(skb);
- sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
+ hdr = skb_vnet_hdr(skb);
+ sg_set_buf(sg, &hdr->hdr, sizeof hdr->hdr);
- if (vi->big_packets) {
- for (i = 0; i < MAX_SKB_FRAGS; i++) {
- skb_frag_t *f = &skb_shinfo(skb)->frags[i];
- f->page = get_a_page(vi, gfp);
- if (!f->page)
- break;
+ skb_to_sgvec(skb, sg + 1, 0, skb->len);
- f->page_offset = 0;
- f->size = PAGE_SIZE;
+ err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 2, skb);
+ if (err < 0)
+ dev_kfree_skb(skb);
- skb->data_len += PAGE_SIZE;
- skb->len += PAGE_SIZE;
+ return err;
+}
- skb_shinfo(skb)->nr_frags++;
- }
+static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp)
+{
+ struct scatterlist sg[MAX_SKB_FRAGS + 2];
+ struct page *first, *list = NULL;
+ char *p;
+ int i, err, offset;
+
+ /* page in sg[MAX_SKB_FRAGS + 1] is list tail */
+ for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
+ first = get_a_page(vi, gfp);
+ if (!first) {
+ if (list)
+ give_pages(vi, list);
+ return -ENOMEM;
}
+ sg_set_buf(&sg[i], page_address(first), PAGE_SIZE);
- num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
- skb_queue_head(&vi->recv, skb);
+ /* chain new page in list head to match sg */
+ first->private = (unsigned long)list;
+ list = first;
+ }
- err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb);
- if (err < 0) {
- skb_unlink(skb, &vi->recv);
- trim_pages(vi, skb);
- kfree_skb(skb);
- break;
- }
- vi->num++;
- } while (err >= num);
- if (unlikely(vi->num > vi->max))
- vi->max = vi->num;
- vi->rvq->vq_ops->kick(vi->rvq);
- return !oom;
+ first = get_a_page(vi, gfp);
+ if (!first) {
+ give_pages(vi, list);
+ return -ENOMEM;
+ }
+ p = page_address(first);
+
+ /* sg[0], sg[1] share the same page */
+ /* a separated sg[0] for virtio_net_hdr only during to QEMU bug*/
+ sg_set_buf(&sg[0], p, sizeof(struct virtio_net_hdr));
+
+ /* sg[1] for data packet, from offset */
+ offset = sizeof(struct padded_vnet_hdr);
+ sg_set_buf(&sg[1], p + offset, PAGE_SIZE - offset);
+
+ /* chain first in list head */
+ first->private = (unsigned long)list;
+ err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, MAX_SKB_FRAGS + 2,
+ first);
+ if (err < 0)
+ give_pages(vi, first);
+
+ return err;
}
-/* Returns false if we couldn't fill entirely (OOM). */
-static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
{
- struct sk_buff *skb;
- struct scatterlist sg[1];
+ struct page *page;
+ struct scatterlist sg;
int err;
- bool oom = false;
-
- if (!vi->mergeable_rx_bufs)
- return try_fill_recv_maxbufs(vi, gfp);
- do {
- skb_frag_t *f;
+ page = get_a_page(vi, gfp);
+ if (!page)
+ return -ENOMEM;
- skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);
- if (unlikely(!skb)) {
- oom = true;
- break;
- }
+ sg_init_one(&sg, page_address(page), PAGE_SIZE);
- f = &skb_shinfo(skb)->frags[0];
- f->page = get_a_page(vi, gfp);
- if (!f->page) {
- oom = true;
- kfree_skb(skb);
- break;
- }
+ err = vi->rvq->vq_ops->add_buf(vi->rvq, &sg, 0, 1, page);
+ if (err < 0)
+ give_pages(vi, page);
- f->page_offset = 0;
- f->size = PAGE_SIZE;
+ return err;
+}
- skb_shinfo(skb)->nr_frags++;
+/* Returns false if we couldn't fill entirely (OOM). */
+static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
+{
+ int err;
+ bool oom = false;
- sg_init_one(sg, page_address(f->page), PAGE_SIZE);
- skb_queue_head(&vi->recv, skb);
+ do {
+ if (vi->mergeable_rx_bufs)
+ err = add_recvbuf_mergeable(vi, gfp);
+ else if (vi->big_packets)
+ err = add_recvbuf_big(vi, gfp);
+ else
+ err = add_recvbuf_small(vi, gfp);
- err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 1, skb);
if (err < 0) {
- skb_unlink(skb, &vi->recv);
- kfree_skb(skb);
+ oom = true;
break;
}
- vi->num++;
+ ++vi->num;
} while (err > 0);
if (unlikely(vi->num > vi->max))
vi->max = vi->num;
@@ -408,15 +469,14 @@ static void refill_work(struct work_struct *work)
static int virtnet_poll(struct napi_struct *napi, int budget)
{
struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
- struct sk_buff *skb = NULL;
+ void *buf;
unsigned int len, received = 0;
again:
while (received < budget &&
- (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
- __skb_unlink(skb, &vi->recv);
- receive_skb(vi->dev, skb, len);
- vi->num--;
+ (buf = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
+ receive_buf(vi->dev, buf, len);
+ --vi->num;
received++;
}
@@ -496,9 +556,9 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
/* Encode metadata header at front. */
if (vi->mergeable_rx_bufs)
- sg_set_buf(sg, &hdr->mhdr, sizeof(hdr->mhdr));
+ sg_set_buf(sg, &hdr->mhdr, sizeof hdr->mhdr);
else
- sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
+ sg_set_buf(sg, &hdr->hdr, sizeof hdr->hdr);
hdr->num_sg = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
return vi->svq->vq_ops->add_buf(vi->svq, sg, hdr->num_sg, 0, skb);
@@ -916,8 +976,7 @@ static int virtnet_probe(struct virtio_device *vdev)
dev->features |= NETIF_F_HW_VLAN_FILTER;
}
- /* Initialize our empty receive and send queues. */
- skb_queue_head_init(&vi->recv);
+ /* Initialize our empty send queue. */
skb_queue_head_init(&vi->send);
err = register_netdev(dev);
@@ -952,25 +1011,35 @@ free:
return err;
}
+static void free_unused_bufs(struct virtnet_info *vi)
+{
+ void *buf;
+ while (1) {
+ buf = vi->rvq->vq_ops->detach_unused_buf(vi->rvq);
+ if (!buf)
+ break;
+ if (vi->mergeable_rx_bufs || vi->big_packets)
+ give_pages(vi, buf);
+ else
+ dev_kfree_skb(buf);
+ --vi->num;
+ }
+ BUG_ON(vi->num != 0);
+}
+
static void __devexit virtnet_remove(struct virtio_device *vdev)
{
struct virtnet_info *vi = vdev->priv;
- struct sk_buff *skb;
/* Stop all the virtqueues. */
vdev->config->reset(vdev);
- /* Free our skbs in send and recv queues, if any. */
- while ((skb = __skb_dequeue(&vi->recv)) != NULL) {
- kfree_skb(skb);
- vi->num--;
- }
+ /* Free our skbs in send queue, if any. */
__skb_queue_purge(&vi->send);
- BUG_ON(vi->num != 0);
-
unregister_netdev(vi->dev);
cancel_delayed_work_sync(&vi->refill);
+ free_unused_bufs(vi);
vdev->config->del_vqs(vi->vdev);
next prev parent reply other threads:[~2010-01-29 13:20 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-01-29 13:16 [PATCH 0/2] virtio net improvements Rusty Russell
2010-01-29 13:19 ` [PATCH 1/2] virtio: Add ability to detach unused buffers from vrings Rusty Russell
2010-01-29 13:19 ` Rusty Russell
2010-01-29 13:20 ` [PATCH 2/2] virtio_net: Defer skb allocation in receive path Date: Wed, 13 Jan 2010 12:53:38 -0800 Rusty Russell
2010-01-29 13:20 ` Rusty Russell [this message]
2010-02-02 21:48 ` [PATCH 1/1 net-next] virtio_net: remove send queue Shirley Ma
2010-02-02 21:52 ` [PATCH 0/1 " Shirley Ma
2010-02-03 10:03 ` [PATCH 1/1 " Michael S. Tsirkin
2010-02-08 1:41 ` Rusty Russell
2010-02-02 23:59 ` [PATCH 2/2] virtio_net: Defer skb allocation in receive path Date: Wed, 13 Jan 2010 12:53:38 -0800 David Miller
2010-02-02 23:59 ` David Miller
2010-02-02 23:59 ` [PATCH 1/2] virtio: Add ability to detach unused buffers from vrings David Miller
2010-02-02 23:59 ` David Miller
2010-02-02 23:27 ` [PATCH 0/2] virtio net improvements Rusty Russell
2010-02-02 23:27 ` Rusty Russell
2010-02-02 23:29 ` David Miller
2010-02-02 23:29 ` David Miller
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=201001292350.04544.rusty@rustcorp.com.au \
--to=rusty@rustcorp.com.au \
--cc=davem@davemloft.net \
--cc=mashirle@us.ibm.com \
--cc=mst@redhat.com \
--cc=netdev@vger.kernel.org \
--cc=virtualization@lists.linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.