From: Shirley Ma <mashirle@us.ibm.com>
To: "Michael S. Tsirkin" <mst@redhat.com>
Cc: David Miller <davem@davemloft.net>,
Eric Dumazet <eric.dumazet@gmail.com>,
Avi Kivity <avi@redhat.com>, Arnd Bergmann <arnd@arndb.de>,
netdev@vger.kernel.org, kvm@vger.kernel.org,
linux-kernel@vger.kernel.org
Subject: Re: [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
Date: Tue, 17 May 2011 13:50:19 -0700
Message-ID: <1305665419.10756.33.camel@localhost.localdomain>
In-Reply-To: <1305646444.10756.16.camel@localhost.localdomain>
Resubmitting the patch with the latest updates. This patch passed some
live-migration tests against RHEL6.2; I will run more stress tests with
live migration.
Signed-off-by: Shirley Ma <xma@us.ibm.com>
---
drivers/vhost/net.c | 37 +++++++++++++++++++++++++++++++-
drivers/vhost/vhost.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-
drivers/vhost/vhost.h | 12 ++++++++++
3 files changed, 101 insertions(+), 3 deletions(-)
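For readers skimming the diff below, the mechanism is: handle_tx() records
each zerocopy descriptor in vq->heads, passes a struct skb_ubuf_info through
msg_control so the skb carries it, and the lower device invokes the callback
once its DMA completes. A minimal sketch of how a lower device might fire
that completion (the function name example_tx_complete and its call site are
illustrative assumptions; only the ubuf fields come from this patch series):

#include <linux/skbuff.h>

/* Hypothetical completion hook in a lower device driver: the name is
 * an assumption, but the skb_shinfo()->ubuf fields (callback/arg/desc)
 * follow this patch series. */
static void example_tx_complete(struct sk_buff *skb)
{
	struct skb_ubuf_info *ubuf = &skb_shinfo(skb)->ubuf;

	/* vhost_zerocopy_callback() marks vq->heads[ubuf->desc].len as
	 * VHOST_DMA_DONE_LEN so handle_tx() can reap it in order later. */
	if (ubuf->callback)
		ubuf->callback(skb);
}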
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2f7c76a..6bd6e28 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,9 @@
* Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000
+/* Max number of outstanding zerocopy TX used buffers */
+#define VHOST_MAX_ZEROCOPY_PEND 128
+
enum {
VHOST_NET_VQ_RX = 0,
VHOST_NET_VQ_TX = 1,
@@ -129,6 +132,7 @@ static void handle_tx(struct vhost_net *net)
int err, wmem;
size_t hdr_size;
struct socket *sock;
+ struct skb_ubuf_info pend;
/* TODO: check that we are running from vhost_worker? */
sock = rcu_dereference_check(vq->private_data, 1);
@@ -151,6 +155,10 @@ static void handle_tx(struct vhost_net *net)
hdr_size = vq->vhost_hlen;
for (;;) {
+ /* Release DMA-completed buffers first */
+ if (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)
+ vhost_zerocopy_signal_used(vq, false);
+
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
&out, &in,
@@ -166,6 +174,13 @@ static void handle_tx(struct vhost_net *net)
set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
break;
}
+ /* If too many DMAs are outstanding, queue the work */
+ if (sock_flag(sock->sk, SOCK_ZEROCOPY) &&
+ (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)) {
+ tx_poll_start(net, sock);
+ set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+ break;
+ }
if (unlikely(vhost_enable_notify(vq))) {
vhost_disable_notify(vq);
continue;
@@ -188,17 +203,35 @@ static void handle_tx(struct vhost_net *net)
iov_length(vq->hdr, s), hdr_size);
break;
}
+ /* use msg_control to pass vhost zerocopy ubuf info to skb */
+ if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
+ vq->heads[vq->upend_idx].id = head;
+ if (len <= 128)
+ vq->heads[vq->upend_idx].len = VHOST_DMA_DONE_LEN;
+ else {
+ vq->heads[vq->upend_idx].len = len;
+ pend.callback = vhost_zerocopy_callback;
+ pend.arg = vq;
+ pend.desc = vq->upend_idx;
+ msg.msg_control = &pend;
+ msg.msg_controllen = sizeof(pend);
+ }
+ atomic_inc(&vq->refcnt);
+ vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+ }
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(NULL, sock, &msg, len);
if (unlikely(err < 0)) {
- vhost_discard_vq_desc(vq, 1);
+ if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+ vhost_discard_vq_desc(vq, 1);
tx_poll_start(net, sock);
break;
}
if (err != len)
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
- vhost_add_used_and_signal(&net->dev, vq, head, 0);
+ if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+ vhost_add_used_and_signal(&net->dev, vq, head, 0);
total_len += len;
if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ab2912..ce799d6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->call_ctx = NULL;
vq->call = NULL;
vq->log_ctx = NULL;
+ vq->upend_idx = 0;
+ vq->done_idx = 0;
+ atomic_set(&vq->refcnt, 0);
}
static int vhost_worker(void *data)
@@ -230,7 +233,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
UIO_MAXIOV, GFP_KERNEL);
dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
GFP_KERNEL);
- dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
+ dev->vqs[i].heads = kzalloc(sizeof *dev->vqs[i].heads *
UIO_MAXIOV, GFP_KERNEL);
if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
@@ -385,6 +388,38 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
return 0;
}
+/* Signal used buffers to the guest for completed zerocopy DMAs: DMAs may
+ * finish out of order, so walk from done_idx toward upend_idx and stop at
+ * the first still-pending entry; on shutdown, reclaim all outstanding. */
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown)
+{
+ int i, j = 0;
+
+ i = vq->done_idx;
+ while (i != vq->upend_idx) {
+ if ((vq->heads[i].len == VHOST_DMA_DONE_LEN) || shutdown) {
+ /* reset len to 0 to mark the entry reclaimed */
+ vq->heads[i].len = 0;
+ i = (i + 1) % UIO_MAXIOV;
+ ++j;
+ } else
+ break;
+ }
+ if (j) {
+ /* heads[] wraps at UIO_MAXIOV, so add used entries in up to two chunks */
+ if (i > vq->done_idx)
+ vhost_add_used_n(vq, &vq->heads[vq->done_idx], j);
+ else {
+ vhost_add_used_n(vq, &vq->heads[vq->done_idx],
+ UIO_MAXIOV - vq->done_idx);
+ vhost_add_used_n(vq, vq->heads, i);
+ }
+ vq->done_idx = i;
+ vhost_signal(vq->dev, vq);
+ atomic_sub(j, &vq->refcnt);
+ }
+}
+
/* Caller should have device mutex */
void vhost_dev_cleanup(struct vhost_dev *dev)
{
@@ -395,6 +430,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
vhost_poll_stop(&dev->vqs[i].poll);
vhost_poll_flush(&dev->vqs[i].poll);
}
+ /* wait for all lower device DMAs to finish, then notify the guest */
+ if (atomic_read(&dev->vqs[i].refcnt)) {
+ msleep(1000);
+ vhost_zerocopy_signal_used(&dev->vqs[i], true);
+ }
if (dev->vqs[i].error_ctx)
eventfd_ctx_put(dev->vqs[i].error_ctx);
if (dev->vqs[i].error)
@@ -603,6 +643,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
mutex_lock(&vq->mutex);
+ /* force signaling of all outstanding lower device DMAs as done */
+ if (atomic_read(&vq->refcnt))
+ vhost_zerocopy_signal_used(vq, true);
+
switch (ioctl) {
case VHOST_SET_VRING_NUM:
/* Resizing ring with an active backend?
@@ -1416,3 +1460,12 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
vq_err(vq, "Failed to enable notification at %p: %d\n",
&vq->used->flags, r);
}
+
+void vhost_zerocopy_callback(struct sk_buff *skb)
+{
+ int idx = skb_shinfo(skb)->ubuf.desc;
+ struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
+
+ /* set len to VHOST_DMA_DONE_LEN to mark this descriptor's buffers as DMA done */
+ vq->heads[idx].len = VHOST_DMA_DONE_LEN;
+}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b3363ae..8e3ecc7 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -13,6 +13,10 @@
#include <linux/virtio_ring.h>
#include <asm/atomic.h>
+/* For zerocopy: the used buffer len is set to this value once the lower
+ * device DMA is done */
+#define VHOST_DMA_DONE_LEN 1
+
struct vhost_device;
struct vhost_work;
@@ -108,6 +112,12 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log *log;
+ /* vhost zerocopy support */
+ atomic_t refcnt; /* num of outstanding zerocopy DMAs */
+ /* copy of avail idx to monitor outstanding DMA zerocopy buffers */
+ int upend_idx;
+ /* copy of used idx to monitor DMA-done zerocopy buffers */
+ int done_idx;
};
struct vhost_dev {
@@ -154,6 +164,8 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
unsigned int log_num, u64 len);
+void vhost_zerocopy_callback(struct sk_buff *skb);
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown);
#define vq_err(vq, fmt, ...) do { \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
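A note on the index bookkeeping added above: entries between done_idx and
upend_idx in vq->heads are outstanding zerocopy DMAs; completions may land
out of order, but buffers are returned to the guest strictly in order,
stopping at the first still-pending entry. A standalone model of that
accounting (illustrative only, not part of the patch):

/* Illustrative model of the done_idx/upend_idx ring used by the patch.
 * RING stands in for UIO_MAXIOV; len[] mirrors vq->heads[].len, where
 * VHOST_DMA_DONE_LEN (1) marks a completed DMA. */
#define RING 16
static unsigned int len[RING];	/* 0 = free, 1 = done, else pending */
static unsigned int done_idx, upend_idx;

static void record_pending(unsigned int bytes)
{
	len[upend_idx] = bytes;			/* outstanding DMA */
	upend_idx = (upend_idx + 1) % RING;
}

static unsigned int reap_done(void)
{
	unsigned int n = 0;

	/* reclaim in order; stop at the first entry still in flight */
	while (done_idx != upend_idx && len[done_idx] == 1) {
		len[done_idx] = 0;
		done_idx = (done_idx + 1) % RING;
		++n;
	}
	return n;	/* like j in vhost_zerocopy_signal_used() */
}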