From: Shirley Ma <mashirle@us.ibm.com>
To: mst@redhat.com, David Miller <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>,
Avi Kivity <avi@redhat.com>, Arnd Bergmann <arnd@arndb.de>,
netdev@vger.kernel.org, kvm@vger.kernel.org,
linux-kernel@vger.kernel.org
Subject: [PATCH V3 4/8] vhost TX zero copy support
Date: Wed, 20 Apr 2011 13:07:37 -0700 [thread overview]
Message-ID: <1303330057.19336.50.camel@localhost.localdomain> (raw)
In-Reply-To: <1303328216.19336.18.camel@localhost.localdomain>
This patch maintains the outstanding userspace buffers in the
sequence in which they are delivered to vhost. The outstanding
userspace buffers will be marked as done once the lower device has
finished the DMA on them. This is monitored through the last-reference
kfree_skb callback. Two buffer indices are used for this purpose.
The vhost passes the userspace buffer info to the lower device skb
through message control. Since there may be some completed DMAs when
entering vhost handle_tx, and the worst case is that all buffers in
the vq are in pending/done status, we need to notify the guest to
release DMA-done buffers first before getting any new buffers from
the vq.
Signed-off-by: Shirley <xma@us.ibm.com>
---
drivers/vhost/net.c | 30 +++++++++++++++++++++++++++-
drivers/vhost/vhost.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-
drivers/vhost/vhost.h | 10 +++++++++
3 files changed, 87 insertions(+), 3 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2f7c76a..1bc4536 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,8 @@
* Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000
+#define MAX_ZEROCOPY_PEND 64
+
enum {
VHOST_NET_VQ_RX = 0,
VHOST_NET_VQ_TX = 1,
@@ -129,6 +131,7 @@ static void handle_tx(struct vhost_net *net)
int err, wmem;
size_t hdr_size;
struct socket *sock;
+ struct skb_ubuf_info pend;
/* TODO: check that we are running from vhost_worker? */
sock = rcu_dereference_check(vq->private_data, 1);
@@ -151,6 +154,10 @@ static void handle_tx(struct vhost_net *net)
hdr_size = vq->vhost_hlen;
for (;;) {
+ /* Release DMAs done buffers first */
+ if (sock_flag(sock->sk, SOCK_ZEROCOPY))
+ vhost_zerocopy_signal_used(vq);
+
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
&out, &in,
@@ -166,6 +173,12 @@ static void handle_tx(struct vhost_net *net)
set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
break;
}
+ /* If more outstanding DMAs, queue the work */
+ if (sock_flag(sock->sk, SOCK_ZEROCOPY) &&
+ (atomic_read(&vq->refcnt) > MAX_ZEROCOPY_PEND)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
if (unlikely(vhost_enable_notify(vq))) {
vhost_disable_notify(vq);
continue;
@@ -188,17 +201,30 @@ static void handle_tx(struct vhost_net *net)
iov_length(vq->hdr, s), hdr_size);
break;
}
+ /* use msg_control to pass vhost zerocopy ubuf info to skb */
+ if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
+ pend.callback = vhost_zerocopy_callback;
+ pend.arg = vq;
+ pend.desc = vq->upend_idx;
+ msg.msg_control = &pend;
+ msg.msg_controllen = sizeof(pend);
+ vq->heads[vq->upend_idx].id = head;
+ vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+ atomic_inc(&vq->refcnt);
+ }
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(NULL, sock, &msg, len);
if (unlikely(err < 0)) {
- vhost_discard_vq_desc(vq, 1);
+ if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+ vhost_discard_vq_desc(vq, 1);
tx_poll_start(net, sock);
break;
}
if (err != len)
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
- vhost_add_used_and_signal(&net->dev, vq, head, 0);
+ if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+ vhost_add_used_and_signal(&net->dev, vq, head, 0);
total_len += len;
if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ab2912..09bcb1d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->call_ctx = NULL;
vq->call = NULL;
vq->log_ctx = NULL;
+ vq->upend_idx = 0;
+ vq->done_idx = 0;
+ atomic_set(&vq->refcnt, 0);
}
static int vhost_worker(void *data)
@@ -230,7 +233,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
UIO_MAXIOV, GFP_KERNEL);
dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
GFP_KERNEL);
- dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
+ dev->vqs[i].heads = kzalloc(sizeof *dev->vqs[i].heads *
UIO_MAXIOV, GFP_KERNEL);
if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
@@ -385,10 +388,41 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
return 0;
}
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
+{
+ int i, j = 0;
+
+ i = vq->done_idx;
+ while (i != vq->upend_idx) {
+ /* len = 1 means DMA done */
+ if (vq->heads[i].len == 1) {
+ /* reset len = 0 */
+ vq->heads[i].len = 0;
+ i = (i + 1) % UIO_MAXIOV;
+ ++j;
+ } else
+ break;
+ }
+ if (j) {
+ if (i > vq->done_idx)
+ vhost_add_used_n(vq, &vq->heads[vq->done_idx], j);
+ else {
+ vhost_add_used_n(vq, &vq->heads[vq->done_idx],
+ UIO_MAXIOV - vq->done_idx);
+ vhost_add_used_n(vq, vq->heads, i);
+ }
+ vq->done_idx = i;
+ vhost_signal(vq->dev, vq);
+ atomic_sub(j, &vq->refcnt);
+ }
+}
+
/* Caller should have device mutex */
void vhost_dev_cleanup(struct vhost_dev *dev)
{
int i;
+ unsigned long begin = jiffies;
+
for (i = 0; i < dev->nvqs; ++i) {
if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
@@ -405,6 +439,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
eventfd_ctx_put(dev->vqs[i].call_ctx);
if (dev->vqs[i].call)
fput(dev->vqs[i].call);
+ /* wait for all lower device DMAs done, then notify guest */
+ while (atomic_read(&dev->vqs[i].refcnt)) {
+ if (time_after(jiffies, begin + 5 * HZ))
+ vhost_zerocopy_signal_used(&dev->vqs[i]);
+ }
vhost_vq_reset(dev, dev->vqs + i);
}
vhost_dev_free_iovecs(dev);
@@ -1416,3 +1455,12 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
vq_err(vq, "Failed to enable notification at %p: %d\n",
&vq->used->flags, r);
}
+
+void vhost_zerocopy_callback(struct sk_buff *skb)
+{
+ int idx = skb_shinfo(skb)->ubuf.desc;
+ struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
+
+ /* set len = 1 to mark this desc buffers done DMA */
+ vq->heads[idx].len = 1;
+}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b3363ae..cd2febb 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -108,6 +108,14 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log *log;
+ /* vhost zerocopy support */
+ atomic_t refcnt; /* num of outstanding zerocopy DMAs */
+ /* index of zerocopy pending DMA buffers */
+ int upend_idx;
+ /* index of zerocopy done DMA buffers, but not notify guest yet */
+ int done_idx;
+ /* notify vhost zerocopy DMA buffers has done in lower device */
+ void (*callback)(struct sk_buff *);
};
struct vhost_dev {
@@ -154,6 +162,8 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
unsigned int log_num, u64 len);
+void vhost_zerocopy_callback(struct sk_buff *skb);
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
#define vq_err(vq, fmt, ...) do { \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
next prev parent reply other threads:[~2011-04-20 20:07 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-04-20 19:36 [PATCH V3 0/8] macvtap/vhost TX zero copy support Shirley Ma
2011-04-20 19:42 ` [PATCH V3 1/8] Add a new sock zerocopy flag Shirley Ma
2011-04-20 19:44 ` [PATCH V3 2/8] Add a new zerocopy device flag Shirley Ma
2011-04-20 19:48 ` Ben Hutchings
2011-04-20 20:05 ` Shirley Ma
2011-04-20 20:09 ` Shirley Ma
2011-04-20 20:24 ` Dimitris Michailidis
2011-04-20 20:28 ` Shirley Ma
2011-04-20 20:30 ` Shirley Ma
2011-05-02 10:42 ` Michael S. Tsirkin
2011-05-02 18:47 ` Shirley Ma
2011-05-02 19:53 ` Michael S. Tsirkin
2011-05-03 17:42 ` Shirley Ma
2011-05-03 20:11 ` Shirley Ma
2011-04-20 19:47 ` [PATCH V3 3/8] Add userspace buffers support in skb Shirley Ma
2011-05-02 10:53 ` Michael S. Tsirkin
2011-05-03 17:36 ` Shirley Ma
2011-04-20 20:07 ` Shirley Ma [this message]
2011-04-20 20:12 ` [PATCH V3 0/8] macvtap/vhost TX zero copy support Shirley Ma
2011-04-20 20:13 ` [PATCH V3 5/8] Enable cxgb3 to support zerocopy Shirley Ma
2011-04-20 20:52 ` Dimitris Michailidis
2011-04-20 20:58 ` Shirley Ma
2011-04-20 21:15 ` Shirley Ma
2011-04-20 20:15 ` [PATCH V3 7/8] Enable ixgbe " Shirley Ma
2011-04-20 20:17 ` [PATCH V3 8/8] Enable benet " Shirley Ma
2011-04-20 20:27 ` [PATCH V3 6/8] macvtap/vhost TX zero copy support Shirley Ma
2011-05-02 18:35 ` Shirley Ma
2011-04-21 14:29 ` [PATCH V3 0/8] " Jon Mason
2011-04-22 17:31 ` Shirley Ma
2011-04-22 17:52 ` Shirley Ma
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1303330057.19336.50.camel@localhost.localdomain \
--to=mashirle@us.ibm.com \
--cc=arnd@arndb.de \
--cc=avi@redhat.com \
--cc=davem@davemloft.net \
--cc=eric.dumazet@gmail.com \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mst@redhat.com \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).