* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-04-28 20:57 [PATCHv7] add mergeable buffers support to vhost_net David L Stevens
@ 2010-04-29 13:45 ` Michael S. Tsirkin
2010-04-30 16:48 ` David Stevens
2010-04-29 14:12 ` Michael S. Tsirkin
` (4 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Michael S. Tsirkin @ 2010-04-29 13:45 UTC (permalink / raw)
To: David L Stevens; +Cc: netdev, kvm, virtualization
On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> This patch adds mergeable receive buffer support to vhost_net.
>
> Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
I have applied this, thanks very much!
I have also applied some tweaks on top,
please take a look.
Thanks,
MSt
commit 2809e94f5f26d89dc5232aaec753ffda95c4d95e
Author: Michael S. Tsirkin <mst@redhat.com>
Date: Thu Apr 29 16:18:08 2010 +0300
vhost-net: minor tweaks in mergeable buffer code
Applies the following tweaks on top of mergeable buffers patch:
1. vhost_get_desc_n assumes that all descriptors are 'in' only.
It's also unlikely to be useful for any vhost frontend
besides vhost_net, so just move it to net.c, and rename
get_rx_bufs for brevity.
2. for rx, we called iov_length within vhost_get_desc_n
(now get_rx_bufs) already, so we don't
need an extra call to iov_length to avoid overflow anymore.
Accordingly, copy_iovec_hdr can return void now.
3. for rx, do some further code tweaks:
do not assign len = err as we check that err == len
handle data length in a way similar to how we handle
header length: datalen -> sock_len, len -> vhost_len.
add sock_hlen as a local variable, for symmetry with vhost_hlen.
4. add some likely/unlikely annotations
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d61d945..85519b4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,9 +74,9 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to,
}
return seg;
}
-/* Copy iovec entries for len bytes from iovec. Return segments used. */
-static int copy_iovec_hdr(const struct iovec *from, struct iovec *to,
- size_t len, int iovcount)
+/* Copy iovec entries for len bytes from iovec. */
+static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
+ size_t len, int iovcount)
{
int seg = 0;
size_t size;
@@ -89,7 +89,6 @@ static int copy_iovec_hdr(const struct iovec *from, struct iovec *to,
++to;
++seg;
}
- return seg;
}
/* Caller must have TX VQ lock */
@@ -204,7 +203,7 @@ static void handle_tx(struct vhost_net *net)
unuse_mm(net->dev.mm);
}
-static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk)
+static int peek_head_len(struct vhost_virtqueue *vq, struct sock *sk)
{
struct sk_buff *head;
int len = 0;
@@ -212,17 +211,70 @@ static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk)
lock_sock(sk);
head = skb_peek(&sk->sk_receive_queue);
if (head)
- len = head->len + vq->sock_hlen;
+ len = head->len;
release_sock(sk);
return len;
}
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ * vq has read descriptors only.
+ * @vq - the relevant virtqueue
+ * @datalen - data length we'll be reading
+ * @iovcount - returned count of io vectors we fill
+ * @log - vhost log
+ * @log_num - log offset
+ * returns number of buffer heads allocated, negative on error
+ */
+static int get_rx_bufs(struct vhost_virtqueue *vq,
+ struct vring_used_elem *heads,
+ int datalen,
+ unsigned *iovcount,
+ struct vhost_log *log,
+ unsigned *log_num)
+{
+ unsigned int out, in;
+ int seg = 0;
+ int headcount = 0;
+ unsigned d;
+ int r;
+
+ while (datalen > 0) {
+ if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
+ r = -ENOBUFS;
+ goto err;
+ }
+ d = vhost_get_desc(vq->dev, vq, vq->iov + seg,
+ ARRAY_SIZE(vq->iov) - seg, &out,
+ &in, log, log_num);
+ if (d == vq->num) {
+ r = 0;
+ goto err;
+ }
+ if (unlikely(out || in <= 0)) {
+ vq_err(vq, "unexpected descriptor format for RX: "
+ "out %d, in %d\n", out, in);
+ r = -EINVAL;
+ goto err;
+ }
+ heads[headcount].id = d;
+ heads[headcount].len = iov_length(vq->iov + seg, in);
+ datalen -= heads[headcount].len;
+ ++headcount;
+ seg += in;
+ }
+ *iovcount = seg;
+ return headcount;
+err:
+ vhost_discard_desc(vq, headcount);
+ return r;
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
- unsigned in, log, s;
+ unsigned uninitialized_var(in), log;
struct vhost_log *vq_log;
struct msghdr msg = {
.msg_name = NULL,
@@ -238,9 +290,10 @@ static void handle_rx(struct vhost_net *net)
.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
};
- size_t len, total_len = 0;
- int err, headcount, datalen;
- size_t vhost_hlen;
+ size_t total_len = 0;
+ int err, headcount;
+ size_t vhost_hlen, sock_hlen;
+ size_t vhost_len, sock_len;
struct socket *sock = rcu_dereference(vq->private_data);
if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
return;
@@ -249,14 +302,16 @@ static void handle_rx(struct vhost_net *net)
mutex_lock(&vq->mutex);
vhost_disable_notify(vq);
vhost_hlen = vq->vhost_hlen;
+ sock_hlen = vq->sock_hlen;
vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
vq->log : NULL;
- while ((datalen = vhost_head_len(vq, sock->sk))) {
- headcount = vhost_get_desc_n(vq, vq->heads,
- datalen + vhost_hlen,
- &in, vq_log, &log);
+ while ((sock_len = peek_head_len(vq, sock->sk))) {
+ sock_len += sock_hlen;
+ vhost_len = sock_len + vhost_hlen;
+ headcount = get_rx_bufs(vq, vq->heads, vhost_len,
+ &in, vq_log, &log);
if (headcount < 0)
break;
/* OK, now we need to know about added descriptors. */
@@ -272,34 +327,25 @@ static void handle_rx(struct vhost_net *net)
break;
}
/* We don't need to be notified again. */
- if (vhost_hlen)
+ if (unlikely((vhost_hlen)))
/* Skip header. TODO: support TSO. */
- s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
+ move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
else
- s = copy_iovec_hdr(vq->iov, vq->hdr, vq->sock_hlen, in);
+ copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
msg.msg_iovlen = in;
- len = iov_length(vq->iov, in);
- /* Sanity check */
- if (!len) {
- vq_err(vq, "Unexpected header len for RX: "
- "%zd expected %zd\n",
- iov_length(vq->hdr, s), vhost_hlen);
- break;
- }
err = sock->ops->recvmsg(NULL, sock, &msg,
- len, MSG_DONTWAIT | MSG_TRUNC);
+ sock_len, MSG_DONTWAIT | MSG_TRUNC);
/* TODO: Check specific error and bomb out unless EAGAIN? */
if (err < 0) {
vhost_discard_desc(vq, headcount);
break;
}
- if (err != datalen) {
+ if (err != sock_len) {
pr_err("Discarded rx packet: "
- " len %d, expected %zd\n", err, datalen);
+ " len %d, expected %zd\n", err, sock_len);
vhost_discard_desc(vq, headcount);
continue;
}
- len = err;
if (vhost_hlen &&
memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
vhost_hlen)) {
@@ -311,17 +357,16 @@ static void handle_rx(struct vhost_net *net)
if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
offsetof(typeof(hdr), num_buffers),
- sizeof(hdr.num_buffers))) {
+ sizeof hdr.num_buffers)) {
vq_err(vq, "Failed num_buffers write");
vhost_discard_desc(vq, headcount);
break;
}
- len += vhost_hlen;
vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
headcount);
if (unlikely(vq_log))
- vhost_log_write(vq, vq_log, log, len);
- total_len += len;
+ vhost_log_write(vq, vq_log, log, vhost_len);
+ total_len += vhost_len;
if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
vhost_poll_queue(&vq->poll);
break;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8ef5e3f..4b49991 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -862,53 +862,6 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
return 0;
}
-/* This is a multi-buffer version of vhost_get_desc
- * @vq - the relevant virtqueue
- * datalen - data length we'll be reading
- * @iovcount - returned count of io vectors we fill
- * @log - vhost log
- * @log_num - log offset
- * returns number of buffer heads allocated, negative on error
- */
-int vhost_get_desc_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
- int datalen, unsigned *iovcount, struct vhost_log *log,
- unsigned int *log_num)
-{
- unsigned int out, in;
- int seg = 0;
- int headcount = 0;
- int r;
-
- while (datalen > 0) {
- if (headcount >= VHOST_NET_MAX_SG) {
- r = -ENOBUFS;
- goto err;
- }
- heads[headcount].id = vhost_get_desc(vq->dev, vq, vq->iov + seg,
- ARRAY_SIZE(vq->iov) - seg, &out,
- &in, log, log_num);
- if (heads[headcount].id == vq->num) {
- r = 0;
- goto err;
- }
- if (out || in <= 0) {
- vq_err(vq, "unexpected descriptor format for RX: "
- "out %d, in %d\n", out, in);
- r = -EINVAL;
- goto err;
- }
- heads[headcount].len = iov_length(vq->iov + seg, in);
- datalen -= heads[headcount].len;
- ++headcount;
- seg += in;
- }
- *iovcount = seg;
- return headcount;
-err:
- vhost_discard_desc(vq, headcount);
- return r;
-}
-
/* This looks in the virtqueue and for the first available buffer, and converts
* it to an iovec for convenient access. Since descriptors consist of some
* number of output then some number of input descriptors, it's actually two
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 08d740a..4c9809e 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -122,9 +122,6 @@ long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg);
int vhost_vq_access_ok(struct vhost_virtqueue *vq);
int vhost_log_access_ok(struct vhost_dev *);
-int vhost_get_desc_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
- int datalen, unsigned int *iovcount, struct vhost_log *log,
- unsigned int *log_num);
unsigned vhost_get_desc(struct vhost_dev *, struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
--
MST
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-04-29 13:45 ` Michael S. Tsirkin
@ 2010-04-30 16:48 ` David Stevens
0 siblings, 0 replies; 14+ messages in thread
From: David Stevens @ 2010-04-30 16:48 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm, netdev, virtualization
"Michael S. Tsirkin" <mst@redhat.com> wrote on 04/29/2010 06:45:15 AM:
> On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> > This patch adds mergeable receive buffer support to vhost_net.
> >
> > Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
>
> I have applied this, thanks very much!
> I have also applied some tweaks on top,
> please take a look.
>
> Thanks,
> MSt
>
Looks fine to me.
Acked-by: David L Stevens <dlstevens@us.ibm.com>
> commit 2809e94f5f26d89dc5232aaec753ffda95c4d95e
> Author: Michael S. Tsirkin <mst@redhat.com>
> Date: Thu Apr 29 16:18:08 2010 +0300
>
> vhost-net: minor tweaks in mergeable buffer code
>
> Applies the following tweaks on top of mergeable buffers patch:
> 1. vhost_get_desc_n assumes that all descriptors are 'in' only.
> It's also unlikely to be useful for any vhost frontend
> besides vhost_net, so just move it to net.c, and rename
> get_rx_bufs for brevity.
>
> 2. for rx, we called iov_length within vhost_get_desc_n
> (now get_rx_bufs) already, so we don't
> need an extra call to iov_length to avoid overflow anymore.
> Accordingly, copy_iovec_hdr can return void now.
>
> 3. for rx, do some further code tweaks:
> do not assign len = err as we check that err == len
> handle data length in a way similar to how we handle
> header length: datalen -> sock_len, len -> vhost_len.
> add sock_hlen as a local variable, for symmetry with vhost_hlen.
>
> 4. add some likely/unlikely annotations
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index d61d945..85519b4 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -74,9 +74,9 @@ static int move_iovec_hdr(struct iovec *from, struct
iovec *to,
> }
> return seg;
> }
> -/* Copy iovec entries for len bytes from iovec. Return segments used.
*/
> -static int copy_iovec_hdr(const struct iovec *from, struct iovec *to,
> - size_t len, int iovcount)
> +/* Copy iovec entries for len bytes from iovec. */
> +static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
> + size_t len, int iovcount)
> {
> int seg = 0;
> size_t size;
> @@ -89,7 +89,6 @@ static int copy_iovec_hdr(const struct iovec *from,
struct iovec *to,
> ++to;
> ++seg;
> }
> - return seg;
> }
>
> /* Caller must have TX VQ lock */
> @@ -204,7 +203,7 @@ static void handle_tx(struct vhost_net *net)
> unuse_mm(net->dev.mm);
> }
>
> -static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk)
> +static int peek_head_len(struct vhost_virtqueue *vq, struct sock *sk)
> {
> struct sk_buff *head;
> int len = 0;
> @@ -212,17 +211,70 @@ static int vhost_head_len(struct vhost_virtqueue
*vq,
> struct sock *sk)
> lock_sock(sk);
> head = skb_peek(&sk->sk_receive_queue);
> if (head)
> - len = head->len + vq->sock_hlen;
> + len = head->len;
> release_sock(sk);
> return len;
> }
>
> +/* This is a multi-buffer version of vhost_get_desc, that works if
> + * vq has read descriptors only.
> + * @vq - the relevant virtqueue
> + * @datalen - data length we'll be reading
> + * @iovcount - returned count of io vectors we fill
> + * @log - vhost log
> + * @log_num - log offset
> + * returns number of buffer heads allocated, negative on error
> + */
> +static int get_rx_bufs(struct vhost_virtqueue *vq,
> + struct vring_used_elem *heads,
> + int datalen,
> + unsigned *iovcount,
> + struct vhost_log *log,
> + unsigned *log_num)
> +{
> + unsigned int out, in;
> + int seg = 0;
> + int headcount = 0;
> + unsigned d;
> + int r;
> +
> + while (datalen > 0) {
> + if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
> + r = -ENOBUFS;
> + goto err;
> + }
> + d = vhost_get_desc(vq->dev, vq, vq->iov + seg,
> + ARRAY_SIZE(vq->iov) - seg, &out,
> + &in, log, log_num);
> + if (d == vq->num) {
> + r = 0;
> + goto err;
> + }
> + if (unlikely(out || in <= 0)) {
> + vq_err(vq, "unexpected descriptor format for RX: "
> + "out %d, in %d\n", out, in);
> + r = -EINVAL;
> + goto err;
> + }
> + heads[headcount].id = d;
> + heads[headcount].len = iov_length(vq->iov + seg, in);
> + datalen -= heads[headcount].len;
> + ++headcount;
> + seg += in;
> + }
> + *iovcount = seg;
> + return headcount;
> +err:
> + vhost_discard_desc(vq, headcount);
> + return r;
> +}
> +
> /* Expects to be always run from workqueue - which acts as
> * read-size critical section for our kind of RCU. */
> static void handle_rx(struct vhost_net *net)
> {
> struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> - unsigned in, log, s;
> + unsigned uninitialized_var(in), log;
> struct vhost_log *vq_log;
> struct msghdr msg = {
> .msg_name = NULL,
> @@ -238,9 +290,10 @@ static void handle_rx(struct vhost_net *net)
> .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
> };
>
> - size_t len, total_len = 0;
> - int err, headcount, datalen;
> - size_t vhost_hlen;
> + size_t total_len = 0;
> + int err, headcount;
> + size_t vhost_hlen, sock_hlen;
> + size_t vhost_len, sock_len;
> struct socket *sock = rcu_dereference(vq->private_data);
> if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> return;
> @@ -249,14 +302,16 @@ static void handle_rx(struct vhost_net *net)
> mutex_lock(&vq->mutex);
> vhost_disable_notify(vq);
> vhost_hlen = vq->vhost_hlen;
> + sock_hlen = vq->sock_hlen;
>
> vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> vq->log : NULL;
>
> - while ((datalen = vhost_head_len(vq, sock->sk))) {
> - headcount = vhost_get_desc_n(vq, vq->heads,
> - datalen + vhost_hlen,
> - &in, vq_log, &log);
> + while ((sock_len = peek_head_len(vq, sock->sk))) {
> + sock_len += sock_hlen;
> + vhost_len = sock_len + vhost_hlen;
> + headcount = get_rx_bufs(vq, vq->heads, vhost_len,
> + &in, vq_log, &log);
> if (headcount < 0)
> break;
> /* OK, now we need to know about added descriptors. */
> @@ -272,34 +327,25 @@ static void handle_rx(struct vhost_net *net)
> break;
> }
> /* We don't need to be notified again. */
> - if (vhost_hlen)
> + if (unlikely((vhost_hlen)))
> /* Skip header. TODO: support TSO. */
> - s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
> + move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
> else
> - s = copy_iovec_hdr(vq->iov, vq->hdr, vq->sock_hlen, in);
> + copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
> msg.msg_iovlen = in;
> - len = iov_length(vq->iov, in);
> - /* Sanity check */
> - if (!len) {
> - vq_err(vq, "Unexpected header len for RX: "
> - "%zd expected %zd\n",
> - iov_length(vq->hdr, s), vhost_hlen);
> - break;
> - }
> err = sock->ops->recvmsg(NULL, sock, &msg,
> - len, MSG_DONTWAIT | MSG_TRUNC);
> + sock_len, MSG_DONTWAIT | MSG_TRUNC);
> /* TODO: Check specific error and bomb out unless EAGAIN? */
> if (err < 0) {
> vhost_discard_desc(vq, headcount);
> break;
> }
> - if (err != datalen) {
> + if (err != sock_len) {
> pr_err("Discarded rx packet: "
> - " len %d, expected %zd\n", err, datalen);
> + " len %d, expected %zd\n", err, sock_len);
> vhost_discard_desc(vq, headcount);
> continue;
> }
> - len = err;
> if (vhost_hlen &&
> memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
> vhost_hlen)) {
> @@ -311,17 +357,16 @@ static void handle_rx(struct vhost_net *net)
> if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
> memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
> offsetof(typeof(hdr), num_buffers),
> - sizeof(hdr.num_buffers))) {
> + sizeof hdr.num_buffers)) {
> vq_err(vq, "Failed num_buffers write");
> vhost_discard_desc(vq, headcount);
> break;
> }
> - len += vhost_hlen;
> vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> headcount);
> if (unlikely(vq_log))
> - vhost_log_write(vq, vq_log, log, len);
> - total_len += len;
> + vhost_log_write(vq, vq_log, log, vhost_len);
> + total_len += vhost_len;
> if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> vhost_poll_queue(&vq->poll);
> break;
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 8ef5e3f..4b49991 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -862,53 +862,6 @@ static unsigned get_indirect(struct vhost_dev *dev,
> struct vhost_virtqueue *vq,
> return 0;
> }
>
> -/* This is a multi-buffer version of vhost_get_desc
> - * @vq - the relevant virtqueue
> - * datalen - data length we'll be reading
> - * @iovcount - returned count of io vectors we fill
> - * @log - vhost log
> - * @log_num - log offset
> - * returns number of buffer heads allocated, negative on error
> - */
> -int vhost_get_desc_n(struct vhost_virtqueue *vq, struct vring_used_elem
*heads,
> - int datalen, unsigned *iovcount, struct vhost_log *log,
> - unsigned int *log_num)
> -{
> - unsigned int out, in;
> - int seg = 0;
> - int headcount = 0;
> - int r;
> -
> - while (datalen > 0) {
> - if (headcount >= VHOST_NET_MAX_SG) {
> - r = -ENOBUFS;
> - goto err;
> - }
> - heads[headcount].id = vhost_get_desc(vq->dev, vq, vq->iov + seg,
> - ARRAY_SIZE(vq->iov) - seg, &out,
> - &in, log, log_num);
> - if (heads[headcount].id == vq->num) {
> - r = 0;
> - goto err;
> - }
> - if (out || in <= 0) {
> - vq_err(vq, "unexpected descriptor format for RX: "
> - "out %d, in %d\n", out, in);
> - r = -EINVAL;
> - goto err;
> - }
> - heads[headcount].len = iov_length(vq->iov + seg, in);
> - datalen -= heads[headcount].len;
> - ++headcount;
> - seg += in;
> - }
> - *iovcount = seg;
> - return headcount;
> -err:
> - vhost_discard_desc(vq, headcount);
> - return r;
> -}
> -
> /* This looks in the virtqueue and for the first available buffer, and
converts
> * it to an iovec for convenient access. Since descriptors consist of
some
> * number of output then some number of input descriptors, it's
actually two
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 08d740a..4c9809e 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -122,9 +122,6 @@ long vhost_dev_ioctl(struct vhost_dev *, unsigned
int
> ioctl, unsigned long arg);
> int vhost_vq_access_ok(struct vhost_virtqueue *vq);
> int vhost_log_access_ok(struct vhost_dev *);
>
> -int vhost_get_desc_n(struct vhost_virtqueue *, struct vring_used_elem
*heads,
> - int datalen, unsigned int *iovcount, struct vhost_log *log,
> - unsigned int *log_num);
> unsigned vhost_get_desc(struct vhost_dev *, struct vhost_virtqueue *,
> struct iovec iov[], unsigned int iov_count,
> unsigned int *out_num, unsigned int *in_num,
>
> --
> MST
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-04-28 20:57 [PATCHv7] add mergeable buffers support to vhost_net David L Stevens
2010-04-29 13:45 ` Michael S. Tsirkin
@ 2010-04-29 14:12 ` Michael S. Tsirkin
2010-05-03 10:34 ` Michael S. Tsirkin
` (3 subsequent siblings)
5 siblings, 0 replies; 14+ messages in thread
From: Michael S. Tsirkin @ 2010-04-29 14:12 UTC (permalink / raw)
To: David L Stevens; +Cc: netdev, xiaohui.xin, kvm, virtualization
On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> This patch adds mergeable receive buffer support to vhost_net.
>
> Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
You can find the latest version on the following net-next based tree:
git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost
--
MST
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-04-28 20:57 [PATCHv7] add mergeable buffers support to vhost_net David L Stevens
2010-04-29 13:45 ` Michael S. Tsirkin
2010-04-29 14:12 ` Michael S. Tsirkin
@ 2010-05-03 10:34 ` Michael S. Tsirkin
2010-05-03 15:39 ` David Stevens
2010-05-03 22:48 ` Michael S. Tsirkin
` (2 subsequent siblings)
5 siblings, 1 reply; 14+ messages in thread
From: Michael S. Tsirkin @ 2010-05-03 10:34 UTC (permalink / raw)
To: David L Stevens; +Cc: netdev, kvm, virtualization
On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> This patch adds mergeable receive buffer support to vhost_net.
>
> Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
I've been doing some more testing before sending out a pull
request, and I see a drastic performance degradation in guest to host
traffic when this is applied but mergeable buffers are not in use
by userspace (existing qemu-kvm userspace).
This is both with and without my patch on top.
Without patch:
[mst@tuck ~]$ sh runtest 2>&1 | tee ser-meregeable-disabled-kernel-only-tun-only.log
Starting netserver at port 12865
set_up_server could not establish a listen endpoint for port 12865 with family AF_UNSPEC
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 11.0.0.4 (11.0.0.4) port 0 AF_INET : demo
Recv Send Send Utilization Service Demand
Socket Socket Message Elapsed Send Recv Send Recv
Size Size Size Time Throughput local remote local remote
bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB
87380 16384 16384 10.00 9107.26 89.20 33.85 0.802 2.436
With patch:
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 11.0.0.4 (11.0.0.4) port 0 AF_INET : demo
Recv Send Send Utilization Service Demand
Socket Socket Message Elapsed Send Recv Send Recv
Size Size Size Time Throughput local remote local remote
bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB
87380 16384 16384 10.00 35.00 2.21 0.62 5.181 11.575
For ease of testing, I put this on my tree
git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost-broken
Please take a look.
Thanks!
--
MST
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-05-03 10:34 ` Michael S. Tsirkin
@ 2010-05-03 15:39 ` David Stevens
2010-05-03 15:56 ` Michael S. Tsirkin
0 siblings, 1 reply; 14+ messages in thread
From: David Stevens @ 2010-05-03 15:39 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm, netdev, virtualization
"Michael S. Tsirkin" <mst@redhat.com> wrote on 05/03/2010 03:34:11 AM:
> On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> > This patch adds mergeable receive buffer support to vhost_net.
> >
> > Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
>
> I've been doing some more testing before sending out a pull
> request, and I see a drastic performance degradation in guest to host
> traffic when this is applied but mergeable buffers are not in use
> by userspace (existing qemu-kvm userspace).
Actually, I wouldn't expect it to work at all; the qemu-kvm
patch (particularly the feature bit setting bug fix) is required.
Without it, I think the existing code will tell the guest to use
mergeable buffers while turning it off in vhost. That was completely
non-functional for me -- what version of qemu-kvm are you using?
What I did to test w/o mergeable buffers is turn off the
bit in VHOST_FEATURES. I'll recheck these, but qemu-kvm definitely
must be updated; the original doesn't correctly handle feature bits.
+-DLS
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-05-03 15:39 ` David Stevens
@ 2010-05-03 15:56 ` Michael S. Tsirkin
2010-05-03 16:19 ` David Stevens
0 siblings, 1 reply; 14+ messages in thread
From: Michael S. Tsirkin @ 2010-05-03 15:56 UTC (permalink / raw)
To: David Stevens; +Cc: kvm, netdev, virtualization
On Mon, May 03, 2010 at 08:39:08AM -0700, David Stevens wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 05/03/2010 03:34:11 AM:
>
> > On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> > > This patch adds mergeable receive buffer support to vhost_net.
> > >
> > > Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
> >
> > I've been doing some more testing before sending out a pull
> > request, and I see a drastic performance degradation in guest to host
> > traffic when this is applied but mergeable buffers are not in use
> > by userspace (existing qemu-kvm userspace).
>
> Actually, I wouldn't expect it to work at all;
> the qemu-kvm
> patch (particularly the feature bit setting bug fix) is required.
Which bugfix is that?
> Without it, I think the existing code will tell the guest to use
> mergeable buffers while turning it off in vhost.
It should not do that, specifically
vhost_net_get_features does:
features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
unconditionally. This was added with:
5751995a20e77cd9d61d00f7390401895fa172a6
I forced mergeable buffers off with -global virtio-net-pci.mrg_rxbuf=off
and it did not seem to help either.
> That was completely
> non-functional for me -- what version of qemu-kvm are you using?
992cc816c433332f2e93db033919a9ddbfcd1da4 or later should work well
AFAIK.
> What I did to test w/o mergeable buffers is turn off the
> bit in VHOST_FEATURES.
It should be enough to force mergeable buffers to off by qemu command
line: -global virtio-net-pci.mrg_rxbuf=off
> I'll recheck these, but qemu-kvm definitely
> must be updated; the original doesn't correctly handle feature bits.
>
> +-DLS
Hmm, I don't see the bug.
--
MST
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-05-03 15:56 ` Michael S. Tsirkin
@ 2010-05-03 16:19 ` David Stevens
0 siblings, 0 replies; 14+ messages in thread
From: David Stevens @ 2010-05-03 16:19 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm, netdev, virtualization
"Michael S. Tsirkin" <mst@redhat.com> wrote on 05/03/2010 08:56:14 AM:
> On Mon, May 03, 2010 at 08:39:08AM -0700, David Stevens wrote:
> > "Michael S. Tsirkin" <mst@redhat.com> wrote on 05/03/2010 03:34:11 AM:
> >
> > > On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> > > > This patch adds mergeable receive buffer support to vhost_net.
> > > >
> > > > Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
> > >
> > > I've been doing some more testing before sending out a pull
> > > request, and I see a drastic performance degradation in guest to
host
> > > traffic when this is applied but mergeable buffers are not in use
> > > by userspace (existing qemu-kvm userspace).
> >
> > Actually, I wouldn't expect it to work at all;
> > the qemu-kvm
> > patch (particularly the feature bit setting bug fix) is required.
>
> Which bugfix is that?
Actually, I see you put that upstream already--
commit dc14a397812b91dd0d48b03d1b8f66a251542369 in
Avi's tree is the one I was talking about.
I'll look further.
+-DLS
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-04-28 20:57 [PATCHv7] add mergeable buffers support to vhost_net David L Stevens
` (2 preceding siblings ...)
2010-05-03 10:34 ` Michael S. Tsirkin
@ 2010-05-03 22:48 ` Michael S. Tsirkin
2010-05-10 16:43 ` Michael S. Tsirkin
2010-05-10 17:31 ` Michael S. Tsirkin
5 siblings, 0 replies; 14+ messages in thread
From: Michael S. Tsirkin @ 2010-05-03 22:48 UTC (permalink / raw)
To: David L Stevens; +Cc: netdev, kvm, virtualization
On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> This patch adds mergeable receive buffer support to vhost_net.
>
> Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
>
> diff -ruNp net-next-v0/drivers/vhost/net.c net-next-v7/drivers/vhost/net.c
> --- net-next-v0/drivers/vhost/net.c 2010-04-24 21:36:54.000000000 -0700
> +++ net-next-v7/drivers/vhost/net.c 2010-04-28 12:26:18.000000000 -0700
> @@ -74,6 +74,23 @@ static int move_iovec_hdr(struct iovec *
> }
> return seg;
> }
> +/* Copy iovec entries for len bytes from iovec. Return segments used. */
> +static int copy_iovec_hdr(const struct iovec *from, struct iovec *to,
> + size_t len, int iovcount)
> +{
> + int seg = 0;
> + size_t size;
> + while (len && seg < iovcount) {
> + size = min(from->iov_len, len);
> + to->iov_base = from->iov_base;
> + to->iov_len = size;
> + len -= size;
> + ++from;
> + ++to;
> + ++seg;
> + }
> + return seg;
> +}
>
> /* Caller must have TX VQ lock */
> static void tx_poll_stop(struct vhost_net *net)
> @@ -109,7 +126,7 @@ static void handle_tx(struct vhost_net *
> };
> size_t len, total_len = 0;
> int err, wmem;
> - size_t hdr_size;
> + size_t vhost_hlen;
> struct socket *sock = rcu_dereference(vq->private_data);
> if (!sock)
> return;
> @@ -128,13 +145,13 @@ static void handle_tx(struct vhost_net *
>
> if (wmem < sock->sk->sk_sndbuf / 2)
> tx_poll_stop(net);
> - hdr_size = vq->hdr_size;
> + vhost_hlen = vq->vhost_hlen;
>
> for (;;) {
> - head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> - ARRAY_SIZE(vq->iov),
> - &out, &in,
> - NULL, NULL);
> + head = vhost_get_desc(&net->dev, vq, vq->iov,
> + ARRAY_SIZE(vq->iov),
> + &out, &in,
> + NULL, NULL);
> /* Nothing new? Wait for eventfd to tell us they refilled. */
> if (head == vq->num) {
> wmem = atomic_read(&sock->sk->sk_wmem_alloc);
> @@ -155,20 +172,20 @@ static void handle_tx(struct vhost_net *
> break;
> }
> /* Skip header. TODO: support TSO. */
> - s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
> + s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, out);
> msg.msg_iovlen = out;
> len = iov_length(vq->iov, out);
> /* Sanity check */
> if (!len) {
> vq_err(vq, "Unexpected header len for TX: "
> "%zd expected %zd\n",
> - iov_length(vq->hdr, s), hdr_size);
> + iov_length(vq->hdr, s), vhost_hlen);
> break;
> }
> /* TODO: Check specific error and bomb out unless ENOBUFS? */
> err = sock->ops->sendmsg(NULL, sock, &msg, len);
> if (unlikely(err < 0)) {
> - vhost_discard_vq_desc(vq);
> + vhost_discard_desc(vq, 1);
> tx_poll_start(net, sock);
> break;
> }
> @@ -187,12 +204,25 @@ static void handle_tx(struct vhost_net *
> unuse_mm(net->dev.mm);
> }
>
> +static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk)
> +{
> + struct sk_buff *head;
> + int len = 0;
> +
> + lock_sock(sk);
> + head = skb_peek(&sk->sk_receive_queue);
> + if (head)
> + len = head->len + vq->sock_hlen;
> + release_sock(sk);
> + return len;
> +}
> +
> /* Expects to be always run from workqueue - which acts as
> * read-size critical section for our kind of RCU. */
> static void handle_rx(struct vhost_net *net)
> {
> struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> - unsigned head, out, in, log, s;
> + unsigned in, log, s;
> struct vhost_log *vq_log;
> struct msghdr msg = {
> .msg_name = NULL,
> @@ -203,14 +233,14 @@ static void handle_rx(struct vhost_net *
> .msg_flags = MSG_DONTWAIT,
> };
>
> - struct virtio_net_hdr hdr = {
> - .flags = 0,
> - .gso_type = VIRTIO_NET_HDR_GSO_NONE
> + struct virtio_net_hdr_mrg_rxbuf hdr = {
> + .hdr.flags = 0,
> + .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
> };
>
> size_t len, total_len = 0;
> - int err;
> - size_t hdr_size;
> + int err, headcount, datalen;
> + size_t vhost_hlen;
> struct socket *sock = rcu_dereference(vq->private_data);
> if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> return;
> @@ -218,18 +248,19 @@ static void handle_rx(struct vhost_net *
> use_mm(net->dev.mm);
> mutex_lock(&vq->mutex);
> vhost_disable_notify(vq);
> - hdr_size = vq->hdr_size;
> + vhost_hlen = vq->vhost_hlen;
>
> vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> vq->log : NULL;
>
> - for (;;) {
> - head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> - ARRAY_SIZE(vq->iov),
> - &out, &in,
> - vq_log, &log);
> + while ((datalen = vhost_head_len(vq, sock->sk))) {
> + headcount = vhost_get_desc_n(vq, vq->heads,
> + datalen + vhost_hlen,
> + &in, vq_log, &log);
> + if (headcount < 0)
> + break;
> /* OK, now we need to know about added descriptors. */
> - if (head == vq->num) {
> + if (!headcount) {
> if (unlikely(vhost_enable_notify(vq))) {
> /* They have slipped one in as we were
> * doing that: check again. */
> @@ -241,46 +272,53 @@ static void handle_rx(struct vhost_net *
> break;
> }
> /* We don't need to be notified again. */
> - if (out) {
> - vq_err(vq, "Unexpected descriptor format for RX: "
> - "out %d, int %d\n",
> - out, in);
> - break;
> - }
> - /* Skip header. TODO: support TSO/mergeable rx buffers. */
> - s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> + if (vhost_hlen)
> + /* Skip header. TODO: support TSO. */
> + s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
> + else
> + s = copy_iovec_hdr(vq->iov, vq->hdr, vq->sock_hlen, in);
> msg.msg_iovlen = in;
> len = iov_length(vq->iov, in);
> /* Sanity check */
> if (!len) {
> vq_err(vq, "Unexpected header len for RX: "
> "%zd expected %zd\n",
> - iov_length(vq->hdr, s), hdr_size);
> + iov_length(vq->hdr, s), vhost_hlen);
> break;
> }
> err = sock->ops->recvmsg(NULL, sock, &msg,
> len, MSG_DONTWAIT | MSG_TRUNC);
> /* TODO: Check specific error and bomb out unless EAGAIN? */
> if (err < 0) {
> - vhost_discard_vq_desc(vq);
> + vhost_discard_desc(vq, headcount);
> break;
> }
> - /* TODO: Should check and handle checksum. */
> - if (err > len) {
> - pr_err("Discarded truncated rx packet: "
> - " len %d > %zd\n", err, len);
> - vhost_discard_vq_desc(vq);
> + if (err != datalen) {
> + pr_err("Discarded rx packet: "
> + " len %d, expected %zd\n", err, datalen);
> + vhost_discard_desc(vq, headcount);
> continue;
> }
> len = err;
> - err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
> - if (err) {
> - vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
> - vq->iov->iov_base, err);
> + if (vhost_hlen &&
> + memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
> + vhost_hlen)) {
> + vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
> + vq->iov->iov_base);
> break;
> }
> - len += hdr_size;
> - vhost_add_used_and_signal(&net->dev, vq, head, len);
> + /* TODO: Should check and handle checksum. */
> + if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
> + memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
> + offsetof(typeof(hdr), num_buffers),
> + sizeof(hdr.num_buffers))) {
> + vq_err(vq, "Failed num_buffers write");
> + vhost_discard_desc(vq, headcount);
> + break;
> + }
> + len += vhost_hlen;
> + vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> + headcount);
> if (unlikely(vq_log))
> vhost_log_write(vq, vq_log, log, len);
> total_len += len;
> @@ -561,9 +599,21 @@ done:
>
> static int vhost_net_set_features(struct vhost_net *n, u64 features)
> {
> - size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
> - sizeof(struct virtio_net_hdr) : 0;
> + size_t vhost_hlen, sock_hlen, hdr_len;
> int i;
> +
> + hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
> + sizeof(struct virtio_net_hdr_mrg_rxbuf) :
> + sizeof(struct virtio_net_hdr);
> + if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
> + /* vhost provides vnet_hdr */
> + vhost_hlen = hdr_len;
> + sock_hlen = 0;
> + } else {
> + /* socket provides vnet_hdr */
> + vhost_hlen = 0;
> + sock_hlen = hdr_len;
> + }
> mutex_lock(&n->dev.mutex);
> if ((features & (1 << VHOST_F_LOG_ALL)) &&
> !vhost_log_access_ok(&n->dev)) {
> @@ -574,7 +624,8 @@ static int vhost_net_set_features(struct
> smp_wmb();
> for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
> mutex_lock(&n->vqs[i].mutex);
> - n->vqs[i].hdr_size = hdr_size;
> + n->vqs[i].vhost_hlen = vhost_hlen;
> + n->vqs[i].sock_hlen = sock_hlen;
> mutex_unlock(&n->vqs[i].mutex);
> }
> vhost_net_flush(n);
> diff -ruNp net-next-v0/drivers/vhost/vhost.c net-next-v7/drivers/vhost/vhost.c
> --- net-next-v0/drivers/vhost/vhost.c 2010-04-22 11:31:57.000000000 -0700
> +++ net-next-v7/drivers/vhost/vhost.c 2010-04-28 11:16:13.000000000 -0700
> @@ -114,7 +114,8 @@ static void vhost_vq_reset(struct vhost_
> vq->used_flags = 0;
> vq->log_used = false;
> vq->log_addr = -1ull;
> - vq->hdr_size = 0;
> + vq->vhost_hlen = 0;
> + vq->sock_hlen = 0;
> vq->private_data = NULL;
> vq->log_base = NULL;
> vq->error_ctx = NULL;
> @@ -861,6 +862,53 @@ static unsigned get_indirect(struct vhos
> return 0;
> }
>
> +/* This is a multi-buffer version of vhost_get_desc
> + * @vq - the relevant virtqueue
> + * datalen - data length we'll be reading
> + * @iovcount - returned count of io vectors we fill
> + * @log - vhost log
> + * @log_num - log offset
> + * returns number of buffer heads allocated, negative on error
> + */
> +int vhost_get_desc_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
> + int datalen, unsigned *iovcount, struct vhost_log *log,
> + unsigned int *log_num)
> +{
> + unsigned int out, in;
> + int seg = 0;
> + int headcount = 0;
> + int r;
> +
> + while (datalen > 0) {
> + if (headcount >= VHOST_NET_MAX_SG) {
> + r = -ENOBUFS;
> + goto err;
> + }
> + heads[headcount].id = vhost_get_desc(vq->dev, vq, vq->iov + seg,
> + ARRAY_SIZE(vq->iov) - seg, &out,
> + &in, log, log_num);
By the way, logging here looks broken to me.
Does live migration work for you?
log_num gets zeroed out on each call to vhost_get_desc.
I guess we could just change vhost_get_desc not to zero out
log_num at the start, do it here instead,
and pass in log + *log_num instead of log_num.
Need to also document the API for vhost_get_desc noting
that log_num is incremented, not stored to.
Pls think about it.
> + if (heads[headcount].id == vq->num) {
> + r = 0;
> + goto err;
> + }
> + if (out || in <= 0) {
> + vq_err(vq, "unexpected descriptor format for RX: "
> + "out %d, in %d\n", out, in);
> + r = -EINVAL;
> + goto err;
> + }
> + heads[headcount].len = iov_length(vq->iov + seg, in);
> + datalen -= heads[headcount].len;
> + ++headcount;
> + seg += in;
> + }
> + *iovcount = seg;
> + return headcount;
> +err:
> + vhost_discard_desc(vq, headcount);
> + return r;
> +}
> +
> /* This looks in the virtqueue and for the first available buffer, and converts
> * it to an iovec for convenient access. Since descriptors consist of some
> * number of output then some number of input descriptors, it's actually two
> @@ -868,7 +916,7 @@ static unsigned get_indirect(struct vhos
> *
> * This function returns the descriptor number found, or vq->num (which
> * is never a valid descriptor number) if none was found. */
> -unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
> +unsigned vhost_get_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
> struct iovec iov[], unsigned int iov_size,
> unsigned int *out_num, unsigned int *in_num,
> struct vhost_log *log, unsigned int *log_num)
> @@ -986,9 +1034,9 @@ unsigned vhost_get_vq_desc(struct vhost_
> }
>
> /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> -void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
> +void vhost_discard_desc(struct vhost_virtqueue *vq, int n)
> {
> - vq->last_avail_idx--;
> + vq->last_avail_idx -= n;
> }
>
> /* After we've used one of their buffers, we tell them about it. We'll then
> @@ -1033,6 +1081,68 @@ int vhost_add_used(struct vhost_virtqueu
> return 0;
> }
>
> +static void vhost_log_used(struct vhost_virtqueue *vq,
> + struct vring_used_elem __user *used)
> +{
> + /* Make sure data is seen before log. */
> + smp_wmb();
> + /* Log used ring entry write. */
> + log_write(vq->log_base,
> + vq->log_addr +
> + ((void __user *)used - (void __user *)vq->used),
> + sizeof *used);
> + /* Log used index update. */
> + log_write(vq->log_base,
> + vq->log_addr + offsetof(struct vring_used, idx),
> + sizeof vq->used->idx);
> + if (vq->log_ctx)
> + eventfd_signal(vq->log_ctx, 1);
> +}
> +
> +static int __vhost_add_used_n(struct vhost_virtqueue *vq,
> + struct vring_used_elem *heads,
> + unsigned count)
> +{
> + struct vring_used_elem __user *used;
> + int start;
> +
> + start = vq->last_used_idx % vq->num;
> + used = vq->used->ring + start;
> + if (copy_to_user(used, heads, count * sizeof *used)) {
> + vq_err(vq, "Failed to write used");
> + return -EFAULT;
> + }
> + /* Make sure buffer is written before we update index. */
> + smp_wmb();
> + if (put_user(vq->last_used_idx + count, &vq->used->idx)) {
> + vq_err(vq, "Failed to increment used idx");
> + return -EFAULT;
> + }
> + if (unlikely(vq->log_used))
> + vhost_log_used(vq, used);
> + vq->last_used_idx += count;
> + return 0;
> +}
> +
> +/* After we've used one of their buffers, we tell them about it. We'll then
> + * want to notify the guest, using eventfd. */
> +int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
> + unsigned count)
> +{
> + int start, n, r;
> +
> + start = vq->last_used_idx % vq->num;
> + n = vq->num - start;
> + if (n < count) {
> + r = __vhost_add_used_n(vq, heads, n);
> + if (r < 0)
> + return r;
> + heads += n;
> + count -= n;
> + }
> + return __vhost_add_used_n(vq, heads, count);
> +}
> +
> /* This actually signals the guest, using eventfd. */
> void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> {
> @@ -1062,6 +1172,15 @@ void vhost_add_used_and_signal(struct vh
> vhost_signal(dev, vq);
> }
>
> +/* multi-buffer version of vhost_add_used_and_signal */
> +void vhost_add_used_and_signal_n(struct vhost_dev *dev,
> + struct vhost_virtqueue *vq,
> + struct vring_used_elem *heads, unsigned count)
> +{
> + vhost_add_used_n(vq, heads, count);
> + vhost_signal(dev, vq);
> +}
> +
> /* OK, now we need to know about added descriptors. */
> bool vhost_enable_notify(struct vhost_virtqueue *vq)
> {
> @@ -1086,7 +1205,7 @@ bool vhost_enable_notify(struct vhost_vi
> return false;
> }
>
> - return avail_idx != vq->last_avail_idx;
> + return avail_idx != vq->avail_idx;
> }
>
> /* We don't need to be notified again. */
> diff -ruNp net-next-v0/drivers/vhost/vhost.h net-next-v7/drivers/vhost/vhost.h
> --- net-next-v0/drivers/vhost/vhost.h 2010-04-24 21:37:41.000000000 -0700
> +++ net-next-v7/drivers/vhost/vhost.h 2010-04-26 10:35:25.000000000 -0700
> @@ -84,7 +84,9 @@ struct vhost_virtqueue {
> struct iovec indirect[VHOST_NET_MAX_SG];
> struct iovec iov[VHOST_NET_MAX_SG];
> struct iovec hdr[VHOST_NET_MAX_SG];
> - size_t hdr_size;
> + size_t vhost_hlen;
> + size_t sock_hlen;
> + struct vring_used_elem heads[VHOST_NET_MAX_SG];
> /* We use a kind of RCU to access private pointer.
> * All readers access it from workqueue, which makes it possible to
> * flush the workqueue instead of synchronize_rcu. Therefore readers do
> @@ -120,16 +122,23 @@ long vhost_dev_ioctl(struct vhost_dev *,
> int vhost_vq_access_ok(struct vhost_virtqueue *vq);
> int vhost_log_access_ok(struct vhost_dev *);
>
> -unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
> +int vhost_get_desc_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
> + int datalen, unsigned int *iovcount, struct vhost_log *log,
> + unsigned int *log_num);
> +unsigned vhost_get_desc(struct vhost_dev *, struct vhost_virtqueue *,
> struct iovec iov[], unsigned int iov_count,
> unsigned int *out_num, unsigned int *in_num,
> struct vhost_log *log, unsigned int *log_num);
> -void vhost_discard_vq_desc(struct vhost_virtqueue *);
> +void vhost_discard_desc(struct vhost_virtqueue *, int);
>
> int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
> -void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> +int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
> + unsigned count);
> void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
> - unsigned int head, int len);
> + unsigned int id, int len);
> +void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
> + struct vring_used_elem *heads, unsigned count);
> +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> void vhost_disable_notify(struct vhost_virtqueue *);
> bool vhost_enable_notify(struct vhost_virtqueue *);
>
> @@ -149,7 +158,8 @@ enum {
> VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
> (1 << VIRTIO_RING_F_INDIRECT_DESC) |
> (1 << VHOST_F_LOG_ALL) |
> - (1 << VHOST_NET_F_VIRTIO_NET_HDR),
> + (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
> + (1 << VIRTIO_NET_F_MRG_RXBUF),
> };
>
> static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-04-28 20:57 [PATCHv7] add mergeable buffers support to vhost_net David L Stevens
` (3 preceding siblings ...)
2010-05-03 22:48 ` Michael S. Tsirkin
@ 2010-05-10 16:43 ` Michael S. Tsirkin
2010-05-10 17:09 ` David Stevens
2010-05-10 17:31 ` Michael S. Tsirkin
5 siblings, 1 reply; 14+ messages in thread
From: Michael S. Tsirkin @ 2010-05-10 16:43 UTC (permalink / raw)
To: David L Stevens; +Cc: netdev, kvm, virtualization
On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> @@ -218,18 +248,19 @@ static void handle_rx(struct vhost_net *
> use_mm(net->dev.mm);
> mutex_lock(&vq->mutex);
> vhost_disable_notify(vq);
> - hdr_size = vq->hdr_size;
> + vhost_hlen = vq->vhost_hlen;
>
> vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> vq->log : NULL;
>
> - for (;;) {
> - head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> - ARRAY_SIZE(vq->iov),
> - &out, &in,
> - vq_log, &log);
> + while ((datalen = vhost_head_len(vq, sock->sk))) {
> + headcount = vhost_get_desc_n(vq, vq->heads,
> + datalen + vhost_hlen,
> + &in, vq_log, &log);
> + if (headcount < 0)
> + break;
> /* OK, now we need to know about added descriptors. */
> - if (head == vq->num) {
> + if (!headcount) {
> if (unlikely(vhost_enable_notify(vq))) {
> /* They have slipped one in as we were
> * doing that: check again. */
> @@ -241,46 +272,53 @@ static void handle_rx(struct vhost_net *
> break;
> }
> /* We don't need to be notified again. */
> - if (out) {
> - vq_err(vq, "Unexpected descriptor format for RX: "
> - "out %d, int %d\n",
> - out, in);
> - break;
> - }
> - /* Skip header. TODO: support TSO/mergeable rx buffers. */
> - s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> + if (vhost_hlen)
> + /* Skip header. TODO: support TSO. */
> + s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
> + else
> + s = copy_iovec_hdr(vq->iov, vq->hdr, vq->sock_hlen, in);
> msg.msg_iovlen = in;
> len = iov_length(vq->iov, in);
> /* Sanity check */
> if (!len) {
> vq_err(vq, "Unexpected header len for RX: "
> "%zd expected %zd\n",
> - iov_length(vq->hdr, s), hdr_size);
> + iov_length(vq->hdr, s), vhost_hlen);
> break;
> }
> err = sock->ops->recvmsg(NULL, sock, &msg,
> len, MSG_DONTWAIT | MSG_TRUNC);
> /* TODO: Check specific error and bomb out unless EAGAIN? */
> if (err < 0) {
> - vhost_discard_vq_desc(vq);
> + vhost_discard_desc(vq, headcount);
> break;
> }
> - /* TODO: Should check and handle checksum. */
> - if (err > len) {
> - pr_err("Discarded truncated rx packet: "
> - " len %d > %zd\n", err, len);
> - vhost_discard_vq_desc(vq);
> + if (err != datalen) {
> + pr_err("Discarded rx packet: "
> + " len %d, expected %zd\n", err, datalen);
> + vhost_discard_desc(vq, headcount);
> continue;
> }
> len = err;
> - err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
> - if (err) {
> - vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
> - vq->iov->iov_base, err);
> + if (vhost_hlen &&
> + memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
> + vhost_hlen)) {
> + vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
> + vq->iov->iov_base);
> break;
> }
> - len += hdr_size;
> - vhost_add_used_and_signal(&net->dev, vq, head, len);
> + /* TODO: Should check and handle checksum. */
> + if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
> + memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
> + offsetof(typeof(hdr), num_buffers),
> + sizeof(hdr.num_buffers))) {
> + vq_err(vq, "Failed num_buffers write");
> + vhost_discard_desc(vq, headcount);
> + break;
> + }
> + len += vhost_hlen;
> + vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> + headcount);
> if (unlikely(vq_log))
> vhost_log_write(vq, vq_log, log, len);
> total_len += len;
OK I think I see the bug here: vhost_add_used_and_signal_n
does not get the actual length, it gets the iovec length from vhost.
Guest virtio uses this as packet length, with bad results.
So I have applied the follows and it seems to have fixed the problem:
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c16db02..9d7496d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -219,7 +219,7 @@ static int peek_head_len(struct vhost_virtqueue *vq, struct sock *sk)
/* This is a multi-buffer version of vhost_get_desc, that works if
* vq has read descriptors only.
* @vq - the relevant virtqueue
- * @datalen - data length we'll be reading
+ * @datalen - data length we'll be reading. must be > 0
* @iovcount - returned count of io vectors we fill
* @log - vhost log
* @log_num - log offset
@@ -236,9 +236,10 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
int seg = 0;
int headcount = 0;
unsigned d;
+ size_t len;
int r, nlogs = 0;
- while (datalen > 0) {
+ for (;;) {
if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
r = -ENOBUFS;
goto err;
@@ -260,16 +261,20 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
nlogs += *log_num;
log += *log_num;
}
+ len = iov_length(vq->iov + seg, in);
+ seg += in;
heads[headcount].id = d;
- heads[headcount].len = iov_length(vq->iov + seg, in);
- datalen -= heads[headcount].len;
+ if (datalen <= len)
+ break;
+ heads[headcount].len = len;
++headcount;
- seg += in;
+ datalen -= len;
}
+ heads[headcount].len = datalen;
*iovcount = seg;
if (unlikely(log))
*log_num = nlogs;
- return headcount;
+ return headcount + 1;
err:
vhost_discard_desc(vq, headcount);
return r;
--
MST
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-05-10 16:43 ` Michael S. Tsirkin
@ 2010-05-10 17:09 ` David Stevens
2010-05-10 17:25 ` Michael S. Tsirkin
0 siblings, 1 reply; 14+ messages in thread
From: David Stevens @ 2010-05-10 17:09 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm, kvm-owner, netdev, virtualization
Since "datalen" carries the difference and will be negative by that amount
from the original loop, what about just adding something like:
}
if (headcount)
heads[headcount-1].len += datalen;
[and really, headcount > 0 since datalen > 0, so just:
heads[headcount-1].len += datalen;
+-DLS
kvm-owner@vger.kernel.org wrote on 05/10/2010 09:43:03 AM:
> On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> > @@ -218,18 +248,19 @@ static void handle_rx(struct vhost_net *
> > use_mm(net->dev.mm);
> > mutex_lock(&vq->mutex);
> > vhost_disable_notify(vq);
> > - hdr_size = vq->hdr_size;
> > + vhost_hlen = vq->vhost_hlen;
> >
> > vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> > vq->log : NULL;
> >
> > - for (;;) {
> > - head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > - ARRAY_SIZE(vq->iov),
> > - &out, &in,
> > - vq_log, &log);
> > + while ((datalen = vhost_head_len(vq, sock->sk))) {
> > + headcount = vhost_get_desc_n(vq, vq->heads,
> > + datalen + vhost_hlen,
> > + &in, vq_log, &log);
> > + if (headcount < 0)
> > + break;
> > /* OK, now we need to know about added descriptors. */
> > - if (head == vq->num) {
> > + if (!headcount) {
> > if (unlikely(vhost_enable_notify(vq))) {
> > /* They have slipped one in as we were
> > * doing that: check again. */
> > @@ -241,46 +272,53 @@ static void handle_rx(struct vhost_net *
> > break;
> > }
> > /* We don't need to be notified again. */
> > - if (out) {
> > - vq_err(vq, "Unexpected descriptor format for RX: "
> > - "out %d, int %d\n",
> > - out, in);
> > - break;
> > - }
> > - /* Skip header. TODO: support TSO/mergeable rx buffers. */
> > - s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> > + if (vhost_hlen)
> > + /* Skip header. TODO: support TSO. */
> > + s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
> > + else
> > + s = copy_iovec_hdr(vq->iov, vq->hdr, vq->sock_hlen, in);
> > msg.msg_iovlen = in;
> > len = iov_length(vq->iov, in);
> > /* Sanity check */
> > if (!len) {
> > vq_err(vq, "Unexpected header len for RX: "
> > "%zd expected %zd\n",
> > - iov_length(vq->hdr, s), hdr_size);
> > + iov_length(vq->hdr, s), vhost_hlen);
> > break;
> > }
> > err = sock->ops->recvmsg(NULL, sock, &msg,
> > len, MSG_DONTWAIT | MSG_TRUNC);
> > /* TODO: Check specific error and bomb out unless EAGAIN? */
> > if (err < 0) {
> > - vhost_discard_vq_desc(vq);
> > + vhost_discard_desc(vq, headcount);
> > break;
> > }
> > - /* TODO: Should check and handle checksum. */
> > - if (err > len) {
> > - pr_err("Discarded truncated rx packet: "
> > - " len %d > %zd\n", err, len);
> > - vhost_discard_vq_desc(vq);
> > + if (err != datalen) {
> > + pr_err("Discarded rx packet: "
> > + " len %d, expected %zd\n", err, datalen);
> > + vhost_discard_desc(vq, headcount);
> > continue;
> > }
> > len = err;
> > - err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
> > - if (err) {
> > - vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
> > - vq->iov->iov_base, err);
> > + if (vhost_hlen &&
> > + memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
> > + vhost_hlen)) {
> > + vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
> > + vq->iov->iov_base);
> > break;
> > }
> > - len += hdr_size;
> > - vhost_add_used_and_signal(&net->dev, vq, head, len);
> > + /* TODO: Should check and handle checksum. */
> > + if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
> > + memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
> > + offsetof(typeof(hdr), num_buffers),
> > + sizeof(hdr.num_buffers))) {
> > + vq_err(vq, "Failed num_buffers write");
> > + vhost_discard_desc(vq, headcount);
> > + break;
> > + }
> > + len += vhost_hlen;
> > + vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> > + headcount);
> > if (unlikely(vq_log))
> > vhost_log_write(vq, vq_log, log, len);
> > total_len += len;
>
> OK I think I see the bug here: vhost_add_used_and_signal_n
> does not get the actual length, it gets the iovec length from vhost.
> Guest virtio uses this as packet length, with bad results.
>
> So I have applied the follows and it seems to have fixed the problem:
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index c16db02..9d7496d 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -219,7 +219,7 @@ static int peek_head_len(struct vhost_virtqueue *vq,
> struct sock *sk)
> /* This is a multi-buffer version of vhost_get_desc, that works if
> * vq has read descriptors only.
> * @vq - the relevant virtqueue
> - * @datalen - data length we'll be reading
> + * @datalen - data length we'll be reading. must be > 0
> * @iovcount - returned count of io vectors we fill
> * @log - vhost log
> * @log_num - log offset
> @@ -236,9 +236,10 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> int seg = 0;
> int headcount = 0;
> unsigned d;
> + size_t len;
> int r, nlogs = 0;
>
> - while (datalen > 0) {
> + for (;;) {
> if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
> r = -ENOBUFS;
> goto err;
> @@ -260,16 +261,20 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> nlogs += *log_num;
> log += *log_num;
> }
> + len = iov_length(vq->iov + seg, in);
> + seg += in;
> heads[headcount].id = d;
> - heads[headcount].len = iov_length(vq->iov + seg, in);
> - datalen -= heads[headcount].len;
> + if (datalen <= len)
> + break;
> + heads[headcount].len = len;
> ++headcount;
> - seg += in;
> + datalen -= len;
> }
> + heads[headcount].len = datalen;
> *iovcount = seg;
> if (unlikely(log))
> *log_num = nlogs;
> - return headcount;
> + return headcount + 1;
> err:
> vhost_discard_desc(vq, headcount);
> return r;
>
> --
> MST
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-05-10 17:09 ` David Stevens
@ 2010-05-10 17:25 ` Michael S. Tsirkin
2010-05-10 17:46 ` David Stevens
0 siblings, 1 reply; 14+ messages in thread
From: Michael S. Tsirkin @ 2010-05-10 17:25 UTC (permalink / raw)
To: David Stevens; +Cc: kvm, kvm-owner, netdev, virtualization
On Mon, May 10, 2010 at 10:09:03AM -0700, David Stevens wrote:
> Since "datalen" carries the difference and will be negative by that amount
> from the original loop, what about just adding something like:
>
> }
> if (headcount)
> heads[headcount-1].len += datalen;
> [and really, headcount >0 since datalen > 0, so just:
>
> heads[headcount-1].len += datalen;
>
> +-DLS
This works too, just does more checks and comparisons.
I am still surprised that you were unable to reproduce the problem.
>
> kvm-owner@vger.kernel.org wrote on 05/10/2010 09:43:03 AM:
>
> > On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> > > @@ -218,18 +248,19 @@ static void handle_rx(struct vhost_net *
> > > use_mm(net->dev.mm);
> > > mutex_lock(&vq->mutex);
> > > vhost_disable_notify(vq);
> > > - hdr_size = vq->hdr_size;
> > > + vhost_hlen = vq->vhost_hlen;
> > >
> > > vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> > > vq->log : NULL;
> > >
> > > - for (;;) {
> > > - head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > > - ARRAY_SIZE(vq->iov),
> > > - &out, &in,
> > > - vq_log, &log);
> > > + while ((datalen = vhost_head_len(vq, sock->sk))) {
> > > + headcount = vhost_get_desc_n(vq, vq->heads,
> > > + datalen + vhost_hlen,
> > > + &in, vq_log, &log);
> > > + if (headcount < 0)
> > > + break;
> > > /* OK, now we need to know about added descriptors. */
> > > - if (head == vq->num) {
> > > + if (!headcount) {
> > > if (unlikely(vhost_enable_notify(vq))) {
> > > /* They have slipped one in as we were
> > > * doing that: check again. */
> > > @@ -241,46 +272,53 @@ static void handle_rx(struct vhost_net *
> > > break;
> > > }
> > > /* We don't need to be notified again. */
> > > - if (out) {
> > > - vq_err(vq, "Unexpected descriptor format for RX: "
> > > - "out %d, int %d\n",
> > > - out, in);
> > > - break;
> > > - }
> > > - /* Skip header. TODO: support TSO/mergeable rx buffers. */
> > > - s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> > > + if (vhost_hlen)
> > > + /* Skip header. TODO: support TSO. */
> > > + s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
> > > + else
> > > + s = copy_iovec_hdr(vq->iov, vq->hdr, vq->sock_hlen, in);
> > > msg.msg_iovlen = in;
> > > len = iov_length(vq->iov, in);
> > > /* Sanity check */
> > > if (!len) {
> > > vq_err(vq, "Unexpected header len for RX: "
> > > "%zd expected %zd\n",
> > > - iov_length(vq->hdr, s), hdr_size);
> > > + iov_length(vq->hdr, s), vhost_hlen);
> > > break;
> > > }
> > > err = sock->ops->recvmsg(NULL, sock, &msg,
> > > len, MSG_DONTWAIT | MSG_TRUNC);
> > > /* TODO: Check specific error and bomb out unless EAGAIN? */
> > > if (err < 0) {
> > > - vhost_discard_vq_desc(vq);
> > > + vhost_discard_desc(vq, headcount);
> > > break;
> > > }
> > > - /* TODO: Should check and handle checksum. */
> > > - if (err > len) {
> > > - pr_err("Discarded truncated rx packet: "
> > > - " len %d > %zd\n", err, len);
> > > - vhost_discard_vq_desc(vq);
> > > + if (err != datalen) {
> > > + pr_err("Discarded rx packet: "
> > > + " len %d, expected %zd\n", err, datalen);
> > > + vhost_discard_desc(vq, headcount);
> > > continue;
> > > }
> > > len = err;
> > > - err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
> > > - if (err) {
> > > - vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
> > > - vq->iov->iov_base, err);
> > > + if (vhost_hlen &&
> > > + memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
> > > + vhost_hlen)) {
> > > + vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
> > > + vq->iov->iov_base);
> > > break;
> > > }
> > > - len += hdr_size;
> > > - vhost_add_used_and_signal(&net->dev, vq, head, len);
> > > + /* TODO: Should check and handle checksum. */
> > > + if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
> > > + memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
> > > + offsetof(typeof(hdr), num_buffers),
> > > + sizeof(hdr.num_buffers))) {
> > > + vq_err(vq, "Failed num_buffers write");
> > > + vhost_discard_desc(vq, headcount);
> > > + break;
> > > + }
> > > + len += vhost_hlen;
> > > + vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> > > + headcount);
> > > if (unlikely(vq_log))
> > > vhost_log_write(vq, vq_log, log, len);
> > > total_len += len;
> >
> > OK I think I see the bug here: vhost_add_used_and_signal_n
> > does not get the actual length, it gets the iovec length from vhost.
> > Guest virtio uses this as packet length, with bad results.
> >
> > So I have applied the following and it seems to have fixed the problem:
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index c16db02..9d7496d 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -219,7 +219,7 @@ static int peek_head_len(struct vhost_virtqueue *vq,
>
> > struct sock *sk)
> > /* This is a multi-buffer version of vhost_get_desc, that works if
> > * vq has read descriptors only.
> > * @vq - the relevant virtqueue
> > - * @datalen - data length we'll be reading
> > + * @datalen - data length we'll be reading. must be > 0
> > * @iovcount - returned count of io vectors we fill
> > * @log - vhost log
> > * @log_num - log offset
> > @@ -236,9 +236,10 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> > int seg = 0;
> > int headcount = 0;
> > unsigned d;
> > + size_t len;
> > int r, nlogs = 0;
> >
> > - while (datalen > 0) {
> > + for (;;) {
> > if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
> > r = -ENOBUFS;
> > goto err;
> > @@ -260,16 +261,20 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> > nlogs += *log_num;
> > log += *log_num;
> > }
> > + len = iov_length(vq->iov + seg, in);
> > + seg += in;
> > heads[headcount].id = d;
> > - heads[headcount].len = iov_length(vq->iov + seg, in);
> > - datalen -= heads[headcount].len;
> > + if (datalen <= len)
> > + break;
> > + heads[headcount].len = len;
> > ++headcount;
> > - seg += in;
> > + datalen -= len;
> > }
> > + heads[headcount].len = datalen;
> > *iovcount = seg;
> > if (unlikely(log))
> > *log_num = nlogs;
> > - return headcount;
> > + return headcount + 1;
> > err:
> > vhost_discard_desc(vq, headcount);
> > return r;
> >
> > --
> > MST
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-05-10 17:25 ` Michael S. Tsirkin
@ 2010-05-10 17:46 ` David Stevens
0 siblings, 0 replies; 14+ messages in thread
From: David Stevens @ 2010-05-10 17:46 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: kvm, kvm-owner, netdev, netdev-owner, virtualization
netdev-owner@vger.kernel.org wrote on 05/10/2010 10:25:57 AM:
> On Mon, May 10, 2010 at 10:09:03AM -0700, David Stevens wrote:
> > Since "datalen" carries the difference and will be negative by that
amount
> > from the original loop, what about just adding something like:
> >
> > }
> > if (headcount)
> > heads[headcount-1].len += datalen;
> > [and really, headcount >0 since datalen > 0, so just:
> >
> > heads[headcount-1].len += datalen;
> >
> > +-DLS
>
> This works too, just does more checks and comparisons.
> I am still surprised that you were unable to reproduce the problem.
>
I'm sure it happened, and probably had a performance
penalty on my systems too, but not as much as yours.
I didn't see any obvious performance difference running
with the patch, though; not sure why. I'll instrument to
see how often it's happening, I think.
But fixed now, good catch!
+-DLS
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCHv7] add mergeable buffers support to vhost_net
2010-04-28 20:57 [PATCHv7] add mergeable buffers support to vhost_net David L Stevens
` (4 preceding siblings ...)
2010-05-10 16:43 ` Michael S. Tsirkin
@ 2010-05-10 17:31 ` Michael S. Tsirkin
5 siblings, 0 replies; 14+ messages in thread
From: Michael S. Tsirkin @ 2010-05-10 17:31 UTC (permalink / raw)
To: David L Stevens; +Cc: netdev, kvm, virtualization
On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> @@ -218,18 +248,19 @@ static void handle_rx(struct vhost_net *
> use_mm(net->dev.mm);
> mutex_lock(&vq->mutex);
> vhost_disable_notify(vq);
> - hdr_size = vq->hdr_size;
> + vhost_hlen = vq->vhost_hlen;
>
> vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> vq->log : NULL;
>
> - for (;;) {
> - head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> - ARRAY_SIZE(vq->iov),
> - &out, &in,
> - vq_log, &log);
> + while ((datalen = vhost_head_len(vq, sock->sk))) {
> + headcount = vhost_get_desc_n(vq, vq->heads,
> + datalen + vhost_hlen,
> + &in, vq_log, &log);
> + if (headcount < 0)
> + break;
> /* OK, now we need to know about added descriptors. */
> - if (head == vq->num) {
> + if (!headcount) {
> if (unlikely(vhost_enable_notify(vq))) {
> /* They have slipped one in as we were
> * doing that: check again. */
So I think this breaks handling for a failure mode where
we get an skb that is larger than the max packet guest
can get. The right thing to do in this case is to
drop the skb, we currently do this by passing
truncate flag to recvmsg.
In particular, with mergeable buffers off, if we get an skb
that does not fit in a single packet, this code will
spread it over multiple buffers.
You should be able to reproduce this fairly easily
by disabling both indirect buffers and mergeable buffers
on qemu command line. With current code TCP still
works by falling back on small packets. I think
with your code it will get stuck forever once
we get an skb that is too large for us to handle.
--
MST
^ permalink raw reply [flat|nested] 14+ messages in thread