netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC][ PATCH 3/3] vhost-net: Add mergeable RX buffer support to vhost-net
@ 2010-03-03  0:20 David Stevens
  2010-03-07 16:26 ` Michael S. Tsirkin
  0 siblings, 1 reply; 10+ messages in thread
From: David Stevens @ 2010-03-03  0:20 UTC (permalink / raw)
  To: mst, rusty; +Cc: netdev, kvm, virtualization

[-- Attachment #1: Type: text/plain, Size: 7770 bytes --]

This patch glues them all together and makes sure we
notify whenever we don't have enough buffers to receive
a max-sized packet, and adds the feature bit.

Signed-off-by: David L Stevens <dlstevens@us.ibm.com>

diff -ruN net-next-p2/drivers/vhost/net.c net-next-p3/drivers/vhost/net.c
--- net-next-p2/drivers/vhost/net.c     2010-03-02 13:01:34.000000000 -0800
+++ net-next-p3/drivers/vhost/net.c     2010-03-02 15:25:15.000000000 -0800
@@ -54,26 +54,6 @@
        enum vhost_net_poll_state tx_poll_state;
 };
 
-/* Pop first len bytes from iovec. Return number of segments used. */
-static int move_iovec_hdr(struct iovec *from, struct iovec *to,
-                         size_t len, int iov_count)
-{
-       int seg = 0;
-       size_t size;
-       while (len && seg < iov_count) {
-               size = min(from->iov_len, len);
-               to->iov_base = from->iov_base;
-               to->iov_len = size;
-               from->iov_len -= size;
-               from->iov_base += size;
-               len -= size;
-               ++from;
-               ++to;
-               ++seg;
-       }
-       return seg;
-}
-
 /* Caller must have TX VQ lock */
 static void tx_poll_stop(struct vhost_net *net)
 {
@@ -97,7 +77,7 @@
 static void handle_tx(struct vhost_net *net)
 {
        struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
-       unsigned out, in, s;
+       unsigned out, in;
        struct iovec head;
        struct msghdr msg = {
                .msg_name = NULL,
@@ -110,6 +90,7 @@
        size_t len, total_len = 0;
        int err, wmem;
        struct socket *sock = rcu_dereference(vq->private_data);
+
        if (!sock)
                return;
 
@@ -166,11 +147,11 @@
                /* Skip header. TODO: support TSO. */
                msg.msg_iovlen = out;
                head.iov_len = len = iov_length(vq->iov, out);
+
                /* Sanity check */
                if (!len) {
                        vq_err(vq, "Unexpected header len for TX: "
-                              "%zd expected %zd\n",
-                              len, vq->guest_hlen);
+                              "%zd expected %zd\n", len, vq->guest_hlen);
                        break;
                }
                /* TODO: Check specific error and bomb out unless ENOBUFS? */
@@ -214,7 +195,7 @@
 static void handle_rx(struct vhost_net *net)
 {
        struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
-       unsigned in, log, s;
+       unsigned in, log;
        struct vhost_log *vq_log;
        struct msghdr msg = {
                .msg_name = NULL,
@@ -245,30 +226,36 @@
                if (!headcount) {
                        vhost_enable_notify(vq);
                        break;
-               }
+               } else if (vq->maxheadcount < headcount)
+                       vq->maxheadcount = headcount;
                /* Skip header. TODO: support TSO/mergeable rx buffers. */
                msg.msg_iovlen = in;
                len = iov_length(vq->iov, in);
-
                /* Sanity check */
                if (!len) {
                        vq_err(vq, "Unexpected header len for RX: "
-                              "%zd expected %zd\n",
-                              len, vq->guest_hlen);
+                              "%zd expected %zd\n", len, vq->guest_hlen);
                        break;
                }
                err = sock->ops->recvmsg(NULL, sock, &msg,
                                         len, MSG_DONTWAIT | MSG_TRUNC);
-               /* TODO: Check specific error and bomb out unless EAGAIN? */
                if (err < 0) {
-                       vhost_discard(vq, 1);
+                       vhost_discard(vq, headcount);
                        break;
                }
                /* TODO: Should check and handle checksum. */
+               if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
+                       struct virtio_net_hdr_mrg_rxbuf *vhdr =
+                               (struct virtio_net_hdr_mrg_rxbuf *)
+                               vq->iov[0].iov_base;
+                       /* add num_bufs */
+                       vq->iov[0].iov_len = vq->guest_hlen;
+                       vhdr->num_buffers = headcount;
+               }
                if (err > len) {
                        pr_err("Discarded truncated rx packet: "
                               " len %d > %zd\n", err, len);
-                       vhost_discard(vq, 1);
+                       vhost_discard(vq, headcount);
                        continue;
                }
                len = err;
@@ -573,8 +560,6 @@
 
 static int vhost_net_set_features(struct vhost_net *n, u64 features)
 {
-       size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
-               sizeof(struct virtio_net_hdr) : 0;
        int i;
        mutex_lock(&n->dev.mutex);
        if ((features & (1 << VHOST_F_LOG_ALL)) &&
diff -ruN net-next-p2/drivers/vhost/vhost.c net-next-p3/drivers/vhost/vhost.c
--- net-next-p2/drivers/vhost/vhost.c   2010-03-02 12:53:02.000000000 -0800
+++ net-next-p3/drivers/vhost/vhost.c   2010-03-02 15:24:50.000000000 -0800
@@ -115,6 +115,7 @@
        vq->log_addr = -1ull;
        vq->guest_hlen = 0;
        vq->sock_hlen = 0;
+       vq->maxheadcount = 0;
        vq->private_data = NULL;
        vq->log_base = NULL;
        vq->error_ctx = NULL;
@@ -410,6 +411,7 @@
                vq->last_avail_idx = s.num;
                /* Forget the cached index value. */
                vq->avail_idx = vq->last_avail_idx;
+               vq->maxheadcount = 0;
                break;
        case VHOST_GET_VRING_BASE:
                s.index = idx;
@@ -1114,10 +1116,23 @@
        return 0;
 }
 
+int vhost_available(struct vhost_virtqueue *vq)
+{
+       int avail;
+
+       if (!vq->maxheadcount)  /* haven't got any yet */
+               return 1;
+       avail = vq->avail_idx - vq->last_avail_idx;
+       if (avail < 0)
+               avail += 0x10000; /* wrapped */
+       return avail;
+}
+
 /* This actually signals the guest, using eventfd. */
 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
        __u16 flags = 0;
+
        if (get_user(flags, &vq->avail->flags)) {
                vq_err(vq, "Failed to get flags");
                return;
@@ -1125,7 +1140,7 @@
 
        /* If they don't want an interrupt, don't signal, unless empty. */
        if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
-           (vq->avail_idx != vq->last_avail_idx ||
+           (vhost_available(vq) > vq->maxheadcount ||
             !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY)))
                return;
 
diff -ruN net-next-p2/drivers/vhost/vhost.h net-next-p3/drivers/vhost/vhost.h
--- net-next-p2/drivers/vhost/vhost.h   2010-03-02 13:02:03.000000000 -0800
+++ net-next-p3/drivers/vhost/vhost.h   2010-03-02 14:29:44.000000000 -0800
@@ -85,6 +85,7 @@
        struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr */
        struct iovec heads[VHOST_NET_MAX_SG];
        size_t guest_hlen, sock_hlen;
+       int maxheadcount;
        /* We use a kind of RCU to access private pointer.
         * All readers access it from workqueue, which makes it possible to
         * flush the workqueue instead of synchronize_rcu. Therefore readers do
@@ -151,7 +152,8 @@
        VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
                         (1 << VIRTIO_RING_F_INDIRECT_DESC) |
                         (1 << VHOST_F_LOG_ALL) |
-                        (1 << VHOST_NET_F_VIRTIO_NET_HDR),
+                        (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
+                        (1 << VIRTIO_NET_F_MRG_RXBUF),
 };
 
 static inline int vhost_has_feature(struct vhost_dev *dev, int bit)


[-- Attachment #2: MRXB3.patch --]
[-- Type: application/octet-stream, Size: 5852 bytes --]

diff -ruN net-next-p2/drivers/vhost/net.c net-next-p3/drivers/vhost/net.c
--- net-next-p2/drivers/vhost/net.c	2010-03-02 13:01:34.000000000 -0800
+++ net-next-p3/drivers/vhost/net.c	2010-03-02 15:25:15.000000000 -0800
@@ -54,26 +54,6 @@
 	enum vhost_net_poll_state tx_poll_state;
 };
 
-/* Pop first len bytes from iovec. Return number of segments used. */
-static int move_iovec_hdr(struct iovec *from, struct iovec *to,
-			  size_t len, int iov_count)
-{
-	int seg = 0;
-	size_t size;
-	while (len && seg < iov_count) {
-		size = min(from->iov_len, len);
-		to->iov_base = from->iov_base;
-		to->iov_len = size;
-		from->iov_len -= size;
-		from->iov_base += size;
-		len -= size;
-		++from;
-		++to;
-		++seg;
-	}
-	return seg;
-}
-
 /* Caller must have TX VQ lock */
 static void tx_poll_stop(struct vhost_net *net)
 {
@@ -97,7 +77,7 @@
 static void handle_tx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
-	unsigned out, in, s;
+	unsigned out, in;
 	struct iovec head;
 	struct msghdr msg = {
 		.msg_name = NULL,
@@ -110,6 +90,7 @@
 	size_t len, total_len = 0;
 	int err, wmem;
 	struct socket *sock = rcu_dereference(vq->private_data);
+
 	if (!sock)
 		return;
 
@@ -166,11 +147,11 @@
 		/* Skip header. TODO: support TSO. */
 		msg.msg_iovlen = out;
 		head.iov_len = len = iov_length(vq->iov, out);
+
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for TX: "
-			       "%zd expected %zd\n",
-			       len, vq->guest_hlen);
+			       "%zd expected %zd\n", len, vq->guest_hlen);
 			break;
 		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
@@ -214,7 +195,7 @@
 static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
-	unsigned in, log, s;
+	unsigned in, log;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
 		.msg_name = NULL,
@@ -245,30 +226,36 @@
 		if (!headcount) {
 			vhost_enable_notify(vq);
 			break;
-		}
+		} else if (vq->maxheadcount < headcount)
+			vq->maxheadcount = headcount;
 		/* Skip header. TODO: support TSO/mergeable rx buffers. */
 		msg.msg_iovlen = in;
 		len = iov_length(vq->iov, in);
-
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for RX: "
-			       "%zd expected %zd\n",
-			       len, vq->guest_hlen);
+			       "%zd expected %zd\n", len, vq->guest_hlen);
 			break;
 		}
 		err = sock->ops->recvmsg(NULL, sock, &msg,
 					 len, MSG_DONTWAIT | MSG_TRUNC);
-		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
-			vhost_discard(vq, 1);
+			vhost_discard(vq, headcount);
 			break;
 		}
 		/* TODO: Should check and handle checksum. */
+		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
+			struct virtio_net_hdr_mrg_rxbuf *vhdr =
+				(struct virtio_net_hdr_mrg_rxbuf *)
+				vq->iov[0].iov_base;
+			/* add num_bufs */
+			vq->iov[0].iov_len = vq->guest_hlen;
+			vhdr->num_buffers = headcount;
+		}
 		if (err > len) {
 			pr_err("Discarded truncated rx packet: "
 			       " len %d > %zd\n", err, len);
-			vhost_discard(vq, 1);
+			vhost_discard(vq, headcount);
 			continue;
 		}
 		len = err;
@@ -573,8 +560,6 @@
 
 static int vhost_net_set_features(struct vhost_net *n, u64 features)
 {
-	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
-		sizeof(struct virtio_net_hdr) : 0;
 	int i;
 	mutex_lock(&n->dev.mutex);
 	if ((features & (1 << VHOST_F_LOG_ALL)) &&
diff -ruN net-next-p2/drivers/vhost/vhost.c net-next-p3/drivers/vhost/vhost.c
--- net-next-p2/drivers/vhost/vhost.c	2010-03-02 12:53:02.000000000 -0800
+++ net-next-p3/drivers/vhost/vhost.c	2010-03-02 15:24:50.000000000 -0800
@@ -115,6 +115,7 @@
 	vq->log_addr = -1ull;
 	vq->guest_hlen = 0;
 	vq->sock_hlen = 0;
+	vq->maxheadcount = 0;
 	vq->private_data = NULL;
 	vq->log_base = NULL;
 	vq->error_ctx = NULL;
@@ -410,6 +411,7 @@
 		vq->last_avail_idx = s.num;
 		/* Forget the cached index value. */
 		vq->avail_idx = vq->last_avail_idx;
+		vq->maxheadcount = 0;
 		break;
 	case VHOST_GET_VRING_BASE:
 		s.index = idx;
@@ -1114,10 +1116,23 @@
 	return 0;
 }
 
+int vhost_available(struct vhost_virtqueue *vq)
+{
+	int avail;
+
+	if (!vq->maxheadcount)	/* haven't got any yet */
+		return 1;
+	avail = vq->avail_idx - vq->last_avail_idx;
+	if (avail < 0)
+		avail += 0x10000; /* wrapped */
+	return avail;
+}
+
 /* This actually signals the guest, using eventfd. */
 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
 	__u16 flags = 0;
+
 	if (get_user(flags, &vq->avail->flags)) {
 		vq_err(vq, "Failed to get flags");
 		return;
@@ -1125,7 +1140,7 @@
 
 	/* If they don't want an interrupt, don't signal, unless empty. */
 	if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
-	    (vq->avail_idx != vq->last_avail_idx ||
+	    (vhost_available(vq) > vq->maxheadcount ||
 	     !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY)))
 		return;
 
diff -ruN net-next-p2/drivers/vhost/vhost.h net-next-p3/drivers/vhost/vhost.h
--- net-next-p2/drivers/vhost/vhost.h	2010-03-02 13:02:03.000000000 -0800
+++ net-next-p3/drivers/vhost/vhost.h	2010-03-02 14:29:44.000000000 -0800
@@ -85,6 +85,7 @@
 	struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr */
 	struct iovec heads[VHOST_NET_MAX_SG];
 	size_t guest_hlen, sock_hlen;
+	int maxheadcount;
 	/* We use a kind of RCU to access private pointer.
 	 * All readers access it from workqueue, which makes it possible to
 	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
@@ -151,7 +152,8 @@
 	VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
 			 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
 			 (1 << VHOST_F_LOG_ALL) |
-			 (1 << VHOST_NET_F_VIRTIO_NET_HDR),
+			 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
+			 (1 << VIRTIO_NET_F_MRG_RXBUF),
 };
 
 static inline int vhost_has_feature(struct vhost_dev *dev, int bit)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC][ PATCH 3/3] vhost-net: Add mergeable RX buffer support to vhost-net
  2010-03-03  0:20 [RFC][ PATCH 3/3] vhost-net: Add mergeable RX buffer support to vhost-net David Stevens
@ 2010-03-07 16:26 ` Michael S. Tsirkin
  2010-03-08  2:06   ` David Stevens
  0 siblings, 1 reply; 10+ messages in thread
From: Michael S. Tsirkin @ 2010-03-07 16:26 UTC (permalink / raw)
  To: David Stevens; +Cc: rusty, netdev, kvm, virtualization

On Tue, Mar 02, 2010 at 05:20:34PM -0700, David Stevens wrote:
> This patch glues them all together and makes sure we
> notify whenever we don't have enough buffers to receive
> a max-sized packet, and adds the feature bit.
> 
> Signed-off-by: David L Stevens <dlstevens@us.ibm.com>

Maybe split this up?

> diff -ruN net-next-p2/drivers/vhost/net.c net-next-p3/drivers/vhost/net.c
> --- net-next-p2/drivers/vhost/net.c     2010-03-02 13:01:34.000000000 
> -0800
> +++ net-next-p3/drivers/vhost/net.c     2010-03-02 15:25:15.000000000 
> -0800
> @@ -54,26 +54,6 @@
>         enum vhost_net_poll_state tx_poll_state;
>  };
>  
> -/* Pop first len bytes from iovec. Return number of segments used. */
> -static int move_iovec_hdr(struct iovec *from, struct iovec *to,
> -                         size_t len, int iov_count)
> -{
> -       int seg = 0;
> -       size_t size;
> -       while (len && seg < iov_count) {
> -               size = min(from->iov_len, len);
> -               to->iov_base = from->iov_base;
> -               to->iov_len = size;
> -               from->iov_len -= size;
> -               from->iov_base += size;
> -               len -= size;
> -               ++from;
> -               ++to;
> -               ++seg;
> -       }
> -       return seg;
> -}
> -
>  /* Caller must have TX VQ lock */
>  static void tx_poll_stop(struct vhost_net *net)
>  {
> @@ -97,7 +77,7 @@
>  static void handle_tx(struct vhost_net *net)
>  {
>         struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> -       unsigned out, in, s;
> +       unsigned out, in;
>         struct iovec head;
>         struct msghdr msg = {
>                 .msg_name = NULL,
> @@ -110,6 +90,7 @@
>         size_t len, total_len = 0;
>         int err, wmem;
>         struct socket *sock = rcu_dereference(vq->private_data);
> +

I tend not to add empty lines if line below it is already short.

>         if (!sock)
>                 return;
>  
> @@ -166,11 +147,11 @@
>                 /* Skip header. TODO: support TSO. */
>                 msg.msg_iovlen = out;
>                 head.iov_len = len = iov_length(vq->iov, out);
> +

I tend not to add empty lines if line below it is a comment.

>                 /* Sanity check */
>                 if (!len) {
>                         vq_err(vq, "Unexpected header len for TX: "
> -                              "%zd expected %zd\n",
> -                              len, vq->guest_hlen);
> +                              "%zd expected %zd\n", len, vq->guest_hlen);
>                         break;
>                 }
>                 /* TODO: Check specific error and bomb out unless ENOBUFS? 
> */
> @@ -214,7 +195,7 @@
>  static void handle_rx(struct vhost_net *net)
>  {
>         struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> -       unsigned in, log, s;
> +       unsigned in, log;
>         struct vhost_log *vq_log;
>         struct msghdr msg = {
>                 .msg_name = NULL,
> @@ -245,30 +226,36 @@
>                 if (!headcount) {
>                         vhost_enable_notify(vq);
>                         break;
> -               }
> +               } else if (vq->maxheadcount < headcount)
> +                       vq->maxheadcount = headcount;
>                 /* Skip header. TODO: support TSO/mergeable rx buffers. */
>                 msg.msg_iovlen = in;
>                 len = iov_length(vq->iov, in);
> -
>                 /* Sanity check */
>                 if (!len) {
>                         vq_err(vq, "Unexpected header len for RX: "
> -                              "%zd expected %zd\n",
> -                              len, vq->guest_hlen);
> +                              "%zd expected %zd\n", len, vq->guest_hlen);
>                         break;
>                 }
>                 err = sock->ops->recvmsg(NULL, sock, &msg,
>                                          len, MSG_DONTWAIT | MSG_TRUNC);
> -               /* TODO: Check specific error and bomb out unless EAGAIN? 
> */
>                 if (err < 0) {
> -                       vhost_discard(vq, 1);
> +                       vhost_discard(vq, headcount);
>                         break;
>                 }
>                 /* TODO: Should check and handle checksum. */
> +               if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) 
> {
> +                       struct virtio_net_hdr_mrg_rxbuf *vhdr =
> +                               (struct virtio_net_hdr_mrg_rxbuf *)
> +                               vq->iov[0].iov_base;
> +                       /* add num_bufs */
> +                       vq->iov[0].iov_len = vq->guest_hlen;
> +                       vhdr->num_buffers = headcount;

I don't understand this. iov_base is a userspace pointer, isn't it.
How can you assign values to it like that?
Rusty also commented earlier that it's not a good idea to assume
specific layout, such as first chunk being large enough to
include virtio_net_hdr_mrg_rxbuf.

I think we need to use memcpy to/from iovec etc.

> +               }
>                 if (err > len) {
>                         pr_err("Discarded truncated rx packet: "
>                                " len %d > %zd\n", err, len);
> -                       vhost_discard(vq, 1);
> +                       vhost_discard(vq, headcount);
>                         continue;
>                 }
>                 len = err;
> @@ -573,8 +560,6 @@
>  
>  static int vhost_net_set_features(struct vhost_net *n, u64 features)
>  {
> -       size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
> -               sizeof(struct virtio_net_hdr) : 0;
>         int i;
>         mutex_lock(&n->dev.mutex);
>         if ((features & (1 << VHOST_F_LOG_ALL)) &&
> diff -ruN net-next-p2/drivers/vhost/vhost.c 
> net-next-p3/drivers/vhost/vhost.c
> --- net-next-p2/drivers/vhost/vhost.c   2010-03-02 12:53:02.000000000 
> -0800
> +++ net-next-p3/drivers/vhost/vhost.c   2010-03-02 15:24:50.000000000 
> -0800
> @@ -115,6 +115,7 @@
>         vq->log_addr = -1ull;
>         vq->guest_hlen = 0;
>         vq->sock_hlen = 0;
> +       vq->maxheadcount = 0;
>         vq->private_data = NULL;
>         vq->log_base = NULL;
>         vq->error_ctx = NULL;
> @@ -410,6 +411,7 @@
>                 vq->last_avail_idx = s.num;
>                 /* Forget the cached index value. */
>                 vq->avail_idx = vq->last_avail_idx;
> +               vq->maxheadcount = 0;
>                 break;
>         case VHOST_GET_VRING_BASE:
>                 s.index = idx;
> @@ -1114,10 +1116,23 @@
>         return 0;
>  }
>  
> +int vhost_available(struct vhost_virtqueue *vq)
> +{
> +       int avail;
> +
> +       if (!vq->maxheadcount)  /* haven't got any yet */
> +               return 1;
> +       avail = vq->avail_idx - vq->last_avail_idx;
> +       if (avail < 0)
> +               avail += 0x10000; /* wrapped */
> +       return avail;
> +}
> +
>  /* This actually signals the guest, using eventfd. */
>  void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
>  {
>         __u16 flags = 0;
> +

I tend not to add empty lines if a line above it is already short.

>         if (get_user(flags, &vq->avail->flags)) {
>                 vq_err(vq, "Failed to get flags");
>                 return;
> @@ -1125,7 +1140,7 @@
>  
>         /* If they don't want an interrupt, don't signal, unless empty. */
>         if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
> -           (vq->avail_idx != vq->last_avail_idx ||
> +           (vhost_available(vq) > vq->maxheadcount ||

I don't understand this change. It seems to make
code not match the comments.

>              !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY)))
>                 return;
>  
> diff -ruN net-next-p2/drivers/vhost/vhost.h 
> net-next-p3/drivers/vhost/vhost.h
> --- net-next-p2/drivers/vhost/vhost.h   2010-03-02 13:02:03.000000000 
> -0800
> +++ net-next-p3/drivers/vhost/vhost.h   2010-03-02 14:29:44.000000000 
> -0800
> @@ -85,6 +85,7 @@
>         struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr */
>         struct iovec heads[VHOST_NET_MAX_SG];
>         size_t guest_hlen, sock_hlen;
> +       int maxheadcount;

I don't completely understand what does this field does.
It seems to be only set on rx? Maybe name should reflect this?

>         /* We use a kind of RCU to access private pointer.
>          * All readers access it from workqueue, which makes it possible 
> to
>          * flush the workqueue instead of synchronize_rcu. Therefore 
> readers do
> @@ -151,7 +152,8 @@
>         VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
>                          (1 << VIRTIO_RING_F_INDIRECT_DESC) |
>                          (1 << VHOST_F_LOG_ALL) |
> -                        (1 << VHOST_NET_F_VIRTIO_NET_HDR),
> +                        (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
> +                        (1 << VIRTIO_NET_F_MRG_RXBUF),
>  };
>  
>  static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
> 



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC][ PATCH 3/3] vhost-net: Add mergeable RX buffer support to vhost-net
  2010-03-07 16:26 ` Michael S. Tsirkin
@ 2010-03-08  2:06   ` David Stevens
  2010-03-08  8:07     ` Michael S. Tsirkin
  0 siblings, 1 reply; 10+ messages in thread
From: David Stevens @ 2010-03-08  2:06 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm, netdev, rusty, virtualization

"Michael S. Tsirkin" <mst@redhat.com> wrote on 03/07/2010 08:26:33 AM:

> On Tue, Mar 02, 2010 at 05:20:34PM -0700, David Stevens wrote:
> > This patch glues them all together and makes sure we
> > notify whenever we don't have enough buffers to receive
> > a max-sized packet, and adds the feature bit.
> > 
> > Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
> 
> Maybe split this up?

        I can. I was looking mostly at size (and this is the smallest
of the bunch). But the feature requires all of them together, of course.
This last one is just "everything left over" from the other two.

> > @@ -110,6 +90,7 @@
> >         size_t len, total_len = 0;
> >         int err, wmem;
> >         struct socket *sock = rcu_dereference(vq->private_data);
> > +
> 
> I tend not to add empty lines if line below it is already short.

        This leaves no blank line between the declarations and the start
of code. It's habit for me-- not sure of kernel coding standards address
that or not, but I don't think I've seen it anywhere else.

> 
> >         if (!sock)
> >                 return;
> > 
> > @@ -166,11 +147,11 @@
> >                 /* Skip header. TODO: support TSO. */
> >                 msg.msg_iovlen = out;
> >                 head.iov_len = len = iov_length(vq->iov, out);
> > +
> 
> I tend not to add empty lines if line below it is a comment.

        I added this to separate the logical "skip header" block from
the next, unrelated piece. Not important to me, though.

> 
> >                 /* Sanity check */
> >                 if (!len) {
> >                         vq_err(vq, "Unexpected header len for TX: "
> > -                              "%zd expected %zd\n",
> > -                              len, vq->guest_hlen);
> > +                              "%zd expected %zd\n", len, 
vq->guest_hlen);
> >                         break;
> >                 }
> >                 /* TODO: Check specific error and bomb out unless 
ENOBUFS? 
> > */


> >                 /* TODO: Should check and handle checksum. */
> > +               if (vhost_has_feature(&net->dev, 
VIRTIO_NET_F_MRG_RXBUF)) 
> > {
> > +                       struct virtio_net_hdr_mrg_rxbuf *vhdr =
> > +                               (struct virtio_net_hdr_mrg_rxbuf *)
> > +                               vq->iov[0].iov_base;
> > +                       /* add num_bufs */
> > +                       vq->iov[0].iov_len = vq->guest_hlen;
> > +                       vhdr->num_buffers = headcount;
> 
> I don't understand this. iov_base is a userspace pointer, isn't it.
> How can you assign values to it like that?
> Rusty also commented earlier that it's not a good idea to assume
> specific layout, such as first chunk being large enough to
> include virtio_net_hdr_mrg_rxbuf.
> 
> I think we need to use memcpy to/from iovec etc.

        I guess you mean put_user() or copy_to_user(); yes, I suppose
it could be paged since we read it.
        The code doesn't assume that it'll fit so much as arranged for
it to fit. We allocate guest_hlen bytes in the buffer, but set the
iovec to the (smaller) sock_hlen; do the read, then this code adds
back the 2 bytes in the middle that we didn't read into (where
num_buffers goes). But the allocator does require that guest_hlen
will fit in a single buffer (and reports error if it doesn't). The
alternative is significantly more complicated, and only fails if
the guest doesn't give us at least the buffer size the guest header
requires (a truly lame guest). I'm not sure it's worth a lot of
complexity in vhost to support the guest giving us <12 byte buffers;
those guests don't exist now and maybe they never should?


> >  /* This actually signals the guest, using eventfd. */
> >  void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> >  {
> >         __u16 flags = 0;
> > +
> 
> I tend not to add empty lines if a line above it is already short.

        Again, separating declarations from code-- never seen different
in any other kernel code.

> 
> >         if (get_user(flags, &vq->avail->flags)) {
> >                 vq_err(vq, "Failed to get flags");
> >                 return;
> > @@ -1125,7 +1140,7 @@
> > 
> >         /* If they don't want an interrupt, don't signal, unless 
empty. */
> >         if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
> > -           (vq->avail_idx != vq->last_avail_idx ||
> > +           (vhost_available(vq) > vq->maxheadcount ||
> 
> I don't understand this change. It seems to make
> code not match the comments.

        It redefines "empty". Without mergeable buffers, we can empty
the ring down to nothing before we require notification. With
mergeable buffers, if the packet requires, say, 3 buffers, and we
have only 2 left, we are empty and require notification and new
buffers to read anything. In both cases, we notify when we can't
read another packet without more buffers.
        I can think about changing the comment to reflect this more
clearly.

> 
> >              !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY)))
> >                 return;
> > 
> > diff -ruN net-next-p2/drivers/vhost/vhost.h 
> > net-next-p3/drivers/vhost/vhost.h
> > --- net-next-p2/drivers/vhost/vhost.h   2010-03-02 13:02:03.000000000 
> > -0800
> > +++ net-next-p3/drivers/vhost/vhost.h   2010-03-02 14:29:44.000000000 
> > -0800
> > @@ -85,6 +85,7 @@
> >         struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr 
*/
> >         struct iovec heads[VHOST_NET_MAX_SG];
> >         size_t guest_hlen, sock_hlen;
> > +       int maxheadcount;
> 
> I don't completely understand what does this field does.
> It seems to be only set on rx? Maybe name should reflect this?

        This is a way for me to dynamically guess how many heads I need 
for a
max-sized packet for whatever the MTU/GRO settings are without waiting to
detect the need for more buffers until a read fails. Without mergeable 
buffers,
we can always fit a max-sized packet in 1 head, but with, we need more 
than
one head to do the read.

I didn't want to hard-code 64K (which it usually is, but not always), and
I didn't want to wait until a read fails every time the ring is near full.
I played with re-enabling notify on 1/4 available or some such, but that 
delays
reads unnecessarily, so I came up with this method: use maxheadcount to 
track
the biggest packet we've ever seen and always make sure we have at least 
that
many available for the next read. If it increases, we may fail the read, 
which'll
notify, but this allows us to notify before we try and fail in normal 
operation,
while still not doing a notify on every read.

                                                                +-DLS


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC][ PATCH 3/3] vhost-net: Add mergeable RX buffer support to vhost-net
  2010-03-08  2:06   ` David Stevens
@ 2010-03-08  8:07     ` Michael S. Tsirkin
  2010-03-31  1:23       ` [PATCH v2] Add Mergeable RX buffer feature to vhost_net David Stevens
  0 siblings, 1 reply; 10+ messages in thread
From: Michael S. Tsirkin @ 2010-03-08  8:07 UTC (permalink / raw)
  To: David Stevens; +Cc: kvm, netdev, rusty, virtualization

On Sun, Mar 07, 2010 at 06:06:51PM -0800, David Stevens wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 03/07/2010 08:26:33 AM:
> 
> > On Tue, Mar 02, 2010 at 05:20:34PM -0700, David Stevens wrote:
> > > This patch glues them all together and makes sure we
> > > notify whenever we don't have enough buffers to receive
> > > a max-sized packet, and adds the feature bit.
> > > 
> > > Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
> > 
> > Maybe split this up?
> 
>         I can. I was looking mostly at size (and this is the smallest
> of the bunch). But the feature requires all of them together, of course.
> This last one is just "everything left over" from the other two.
> 
> > > @@ -110,6 +90,7 @@
> > >         size_t len, total_len = 0;
> > >         int err, wmem;
> > >         struct socket *sock = rcu_dereference(vq->private_data);
> > > +
> > 
> > I tend not to add empty lines if line below it is already short.
> 
>         This leaves no blank line between the declarations and the start
> of code. It's habit for me-- not sure whether kernel coding standards address
> that or not, but I don't think I've seen it anywhere else.
> 
> > 
> > >         if (!sock)
> > >                 return;
> > > 
> > > @@ -166,11 +147,11 @@
> > >                 /* Skip header. TODO: support TSO. */
> > >                 msg.msg_iovlen = out;
> > >                 head.iov_len = len = iov_length(vq->iov, out);
> > > +
> > 
> > I tend not to add empty lines if line below it is a comment.
> 
>         I added this to separate the logical "skip header" block from
> the next, unrelated piece. Not important to me, though.
> 
> > 
> > >                 /* Sanity check */
> > >                 if (!len) {
> > >                         vq_err(vq, "Unexpected header len for TX: "
> > > -                              "%zd expected %zd\n",
> > > -                              len, vq->guest_hlen);
> > > +                              "%zd expected %zd\n", len, 
> vq->guest_hlen);
> > >                         break;
> > >                 }
> > >                 /* TODO: Check specific error and bomb out unless 
> ENOBUFS? 
> > > */
> 
> 
> > >                 /* TODO: Should check and handle checksum. */
> > > +               if (vhost_has_feature(&net->dev, 
> VIRTIO_NET_F_MRG_RXBUF)) 
> > > {
> > > +                       struct virtio_net_hdr_mrg_rxbuf *vhdr =
> > > +                               (struct virtio_net_hdr_mrg_rxbuf *)
> > > +                               vq->iov[0].iov_base;
> > > +                       /* add num_bufs */
> > > +                       vq->iov[0].iov_len = vq->guest_hlen;
> > > +                       vhdr->num_buffers = headcount;
> > 
> > I don't understand this. iov_base is a userspace pointer, isn't it.
> > How can you assign values to it like that?
> > Rusty also commented earlier that it's not a good idea to assume
> > specific layout, such as first chunk being large enough to
> > include virtio_net_hdr_mrg_rxbuf.
> > 
> > I think we need to use memcpy to/from iovec etc.
> 
>         I guess you mean put_user() or copy_to_user(); yes, I suppose
> it could be paged since we read it.
>         The code doesn't assume that it'll fit so much as arranges for
> it to fit. We allocate guest_hlen bytes in the buffer, but set the
> iovec to the (smaller) sock_hlen; do the read, then this code adds
> back the 2 bytes in the middle that we didn't read into (where
> num_buffers goes). But the allocator does require that guest_hlen
> will fit in a single buffer (and reports error if it doesn't). The
> alternative is significantly more complicated,

I'm not sure why. Can't we just call memcpy_from_iovec
and then read the structure as usual?

> and only fails if
> the guest doesn't give us at least the buffer size the guest header
> requires (a truly lame guest). I'm not sure it's worth a lot of
> complexity in vhost to support the guest giving us <12 byte buffers;
> those guests don't exist now and maybe they never should?
> 
> 
> > >  /* This actually signals the guest, using eventfd. */
> > >  void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> > >  {
> > >         __u16 flags = 0;
> > > +
> > 
> > I tend not to add empty lines if a line above it is already short.
> 
>         Again, separating declarations from code-- never seen different
> in any other kernel code.
> 
> > 
> > >         if (get_user(flags, &vq->avail->flags)) {
> > >                 vq_err(vq, "Failed to get flags");
> > >                 return;
> > > @@ -1125,7 +1140,7 @@
> > > 
> > >         /* If they don't want an interrupt, don't signal, unless 
> empty. */
> > >         if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
> > > -           (vq->avail_idx != vq->last_avail_idx ||
> > > +           (vhost_available(vq) > vq->maxheadcount ||
> > 
> > I don't understand this change. It seems to make
> > code not match the comments.
> 
>         It redefines "empty". Without mergeable buffers, we can empty
> the ring down to nothing before we require notification. With
> mergeable buffers, if the packet requires, say, 3 buffers, and we
> have only 2 left, we are empty and require notification and new
> buffers to read anything. In both cases, we notify when we can't
> read another packet without more buffers.

I don't really see how you can find this out from just the number of
heads. A head can address a buffer of any size. Need to think
about this code some more.

>         I can think about changing the comment to reflect this more
> clearly.

Also, this is all for rx only, so names should reflect this I think.

> > 
> > >              !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY)))
> > >                 return;
> > > 
> > > diff -ruN net-next-p2/drivers/vhost/vhost.h 
> > > net-next-p3/drivers/vhost/vhost.h
> > > --- net-next-p2/drivers/vhost/vhost.h   2010-03-02 13:02:03.000000000 
> > > -0800
> > > +++ net-next-p3/drivers/vhost/vhost.h   2010-03-02 14:29:44.000000000 
> > > -0800
> > > @@ -85,6 +85,7 @@
> > >         struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr 
> */
> > >         struct iovec heads[VHOST_NET_MAX_SG];
> > >         size_t guest_hlen, sock_hlen;
> > > +       int maxheadcount;
> > 
> > I don't completely understand what this field does.
> > It seems to be only set on rx? Maybe name should reflect this?
> 
>         This is a way for me to dynamically guess how many heads I need 
> for a
> max-sized packet for whatever the MTU/GRO settings are without waiting to
> detect the need for more buffers until a read fails. Without mergeable 
> buffers,
> we can always fit a max-sized packet in 1 head, but with, we need more 
> than
> one head to do the read.
> 
> I didn't want to hard-code 64K (which it usually is, but not always), and
> I didn't want to wait until a read fails every time the ring is near full.
> I played with re-enabling notify on 1/4 available or some such, but that 
> delays
> reads unnecessarily, so I came up with this method: use maxheadcount to 
> track
> the biggest packet we've ever seen and always make sure we have at least 
> that
> many available for the next read. If it increases, we may fail the read, 
> which'll
> notify, but this allows us to notify before we try and fail in normal 
> operation,
> while still not doing a notify on every read.
> 
>                                                                 +-DLS


Hmm, yes. One of the horrors of the mergeable buffer hack.

Not sure all this is worth the complexity though: I don't think this
covers all cases where the size would be < 64K, anyway: you don't get
notified when user disables GRO on physical NIC, for example.  So maybe
just go ahead with hard-coding 64K.

In any case, number of heads does not necessarily tell us much either,
does it?

Would interrupt when we actually don't have room on RX work better? I'll
think about it some more.

-- 
MST


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2] Add Mergeable RX buffer feature to vhost_net
  2010-03-08  8:07     ` Michael S. Tsirkin
@ 2010-03-31  1:23       ` David Stevens
  2010-03-31 12:02         ` Michael S. Tsirkin
  0 siblings, 1 reply; 10+ messages in thread
From: David Stevens @ 2010-03-31  1:23 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm, kvm-owner, netdev, rusty, virtualization

[-- Attachment #1: Type: text/plain, Size: 22372 bytes --]

This patch adds support for the Mergeable Receive Buffers feature to
vhost_net.

Changes:
        1) generalize descriptor allocation functions to allow multiple
                descriptors per packet
        2) add socket "peek" to know datalen at buffer allocation time
        3) change notification to enable a multi-buffer max packet, rather
                than the single-buffer run until completely empty

Changes from previous revision:
1) incorporate review comments from Michael Tsirkin
2) assume use of TUNSETVNETHDRSZ ioctl by qemu, which simplifies vnet 
header
        processing
3) fixed notification code to only affect the receive side

Signed-Off-By: David L Stevens <dlstevens@us.ibm.com>

[in-line for review, attached for applying w/o whitespace mangling]

diff -ruNp net-next-p0/drivers/vhost/net.c net-next-p3/drivers/vhost/net.c
--- net-next-p0/drivers/vhost/net.c     2010-03-22 12:04:38.000000000 
-0700
+++ net-next-p3/drivers/vhost/net.c     2010-03-30 12:50:57.000000000 
-0700
@@ -54,26 +54,6 @@ struct vhost_net {
        enum vhost_net_poll_state tx_poll_state;
 };
 
-/* Pop first len bytes from iovec. Return number of segments used. */
-static int move_iovec_hdr(struct iovec *from, struct iovec *to,
-                         size_t len, int iov_count)
-{
-       int seg = 0;
-       size_t size;
-       while (len && seg < iov_count) {
-               size = min(from->iov_len, len);
-               to->iov_base = from->iov_base;
-               to->iov_len = size;
-               from->iov_len -= size;
-               from->iov_base += size;
-               len -= size;
-               ++from;
-               ++to;
-               ++seg;
-       }
-       return seg;
-}
-
 /* Caller must have TX VQ lock */
 static void tx_poll_stop(struct vhost_net *net)
 {
@@ -97,7 +77,8 @@ static void tx_poll_start(struct vhost_n
 static void handle_tx(struct vhost_net *net)
 {
        struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
-       unsigned head, out, in, s;
+       unsigned out, in;
+       struct iovec head;
        struct msghdr msg = {
                .msg_name = NULL,
                .msg_namelen = 0,
@@ -108,8 +89,8 @@ static void handle_tx(struct vhost_net *
        };
        size_t len, total_len = 0;
        int err, wmem;
-       size_t hdr_size;
        struct socket *sock = rcu_dereference(vq->private_data);
+
        if (!sock)
                return;
 
@@ -127,22 +108,19 @@ static void handle_tx(struct vhost_net *
 
        if (wmem < sock->sk->sk_sndbuf / 2)
                tx_poll_stop(net);
-       hdr_size = vq->hdr_size;
 
        for (;;) {
-               head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-                                        ARRAY_SIZE(vq->iov),
-                                        &out, &in,
-                                        NULL, NULL);
+               head.iov_base = (void *)vhost_get_vq_desc(&net->dev, vq,
+                       vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, 
NULL);
                /* Nothing new?  Wait for eventfd to tell us they 
refilled. */
-               if (head == vq->num) {
+               if (head.iov_base == (void *)vq->num) {
                        wmem = atomic_read(&sock->sk->sk_wmem_alloc);
                        if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
                                tx_poll_start(net, sock);
                                set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
                                break;
                        }
-                       if (unlikely(vhost_enable_notify(vq))) {
+                       if (unlikely(vhost_enable_notify(vq, 0))) {
                                vhost_disable_notify(vq);
                                continue;
                        }
@@ -154,27 +132,30 @@ static void handle_tx(struct vhost_net *
                        break;
                }
                /* Skip header. TODO: support TSO. */
-               s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
                msg.msg_iovlen = out;
-               len = iov_length(vq->iov, out);
+               head.iov_len = len = iov_length(vq->iov, out);
+
                /* Sanity check */
                if (!len) {
-                       vq_err(vq, "Unexpected header len for TX: "
-                              "%zd expected %zd\n",
-                              iov_length(vq->hdr, s), hdr_size);
+                       vq_err(vq, "Unexpected buffer len for TX: %zd ", 
len);
                        break;
                }
-               /* TODO: Check specific error and bomb out unless ENOBUFS? 
*/
                err = sock->ops->sendmsg(NULL, sock, &msg, len);
                if (unlikely(err < 0)) {
-                       vhost_discard_vq_desc(vq);
-                       tx_poll_start(net, sock);
+                       if (err == -EAGAIN) {
+                               vhost_discard_vq_desc(vq, 1);
+                               tx_poll_start(net, sock);
+                       } else {
+                               vq_err(vq, "sendmsg: errno %d\n", -err);
+                               /* drop packet; do not discard/resend */
+                               vhost_add_used_and_signal(&net->dev, vq, 
&head,
+                                                         1, 0);
+                       }
                        break;
                }
                if (err != len)
-                       pr_err("Truncated TX packet: "
-                              " len %d != %zd\n", err, len);
-               vhost_add_used_and_signal(&net->dev, vq, head, 0);
+                       pr_err("Truncated TX packet: len %d != %zd\n", 
err, len);
+               vhost_add_used_and_signal(&net->dev, vq, &head, 1, 0);
                total_len += len;
                if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
                        vhost_poll_queue(&vq->poll);
@@ -186,12 +167,25 @@ static void handle_tx(struct vhost_net *
        unuse_mm(net->dev.mm);
 }
 
+static int vhost_head_len(struct sock *sk)
+{
+       struct sk_buff *head;
+       int len = 0;
+
+       lock_sock(sk);
+       head = skb_peek(&sk->sk_receive_queue);
+       if (head)
+               len = head->len;
+       release_sock(sk);
+       return len;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
 {
        struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
-       unsigned head, out, in, log, s;
+       unsigned in, log;
        struct vhost_log *vq_log;
        struct msghdr msg = {
                .msg_name = NULL,
@@ -202,34 +196,25 @@ static void handle_rx(struct vhost_net *
                .msg_flags = MSG_DONTWAIT,
        };
 
-       struct virtio_net_hdr hdr = {
-               .flags = 0,
-               .gso_type = VIRTIO_NET_HDR_GSO_NONE
-       };
-
        size_t len, total_len = 0;
-       int err;
-       size_t hdr_size;
+       int err, headcount, datalen;
        struct socket *sock = rcu_dereference(vq->private_data);
+
        if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
                return;
 
        use_mm(net->dev.mm);
        mutex_lock(&vq->mutex);
        vhost_disable_notify(vq);
-       hdr_size = vq->hdr_size;
 
        vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
                vq->log : NULL;
 
-       for (;;) {
-               head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-                                        ARRAY_SIZE(vq->iov),
-                                        &out, &in,
-                                        vq_log, &log);
+       while ((datalen = vhost_head_len(sock->sk))) {
+               headcount = vhost_get_heads(vq, datalen, &in, vq_log, 
&log);
                /* OK, now we need to know about added descriptors. */
-               if (head == vq->num) {
-                       if (unlikely(vhost_enable_notify(vq))) {
+               if (!headcount) {
+                       if (unlikely(vhost_enable_notify(vq, 1))) {
                                /* They have slipped one in as we were
                                 * doing that: check again. */
                                vhost_disable_notify(vq);
@@ -240,46 +225,42 @@ static void handle_rx(struct vhost_net *
                        break;
                }
                /* We don't need to be notified again. */
-               if (out) {
-                       vq_err(vq, "Unexpected descriptor format for RX: "
-                              "out %d, int %d\n",
-                              out, in);
-                       break;
-               }
-               /* Skip header. TODO: support TSO/mergeable rx buffers. */
-               s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
+               if (vq->rxmaxheadcount < headcount)
+                       vq->rxmaxheadcount = headcount;
+               /* Skip header. TODO: support TSO. */
                msg.msg_iovlen = in;
                len = iov_length(vq->iov, in);
                /* Sanity check */
                if (!len) {
-                       vq_err(vq, "Unexpected header len for RX: "
-                              "%zd expected %zd\n",
-                              iov_length(vq->hdr, s), hdr_size);
+                       vq_err(vq, "Unexpected buffer len for RX: %zd\n", 
len);
                        break;
                }
                err = sock->ops->recvmsg(NULL, sock, &msg,
                                         len, MSG_DONTWAIT | MSG_TRUNC);
-               /* TODO: Check specific error and bomb out unless EAGAIN? 
*/
                if (err < 0) {
-                       vhost_discard_vq_desc(vq);
+                       vhost_discard_vq_desc(vq, headcount);
                        break;
                }
                /* TODO: Should check and handle checksum. */
+               if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) 
{
+                       struct virtio_net_hdr_mrg_rxbuf *vhdr =
+                               (struct virtio_net_hdr_mrg_rxbuf *)
+                               vq->iov[0].iov_base;
+                       /* add num_bufs */
+                       if (put_user(headcount, &vhdr->num_buffers)) {
+                               vq_err(vq, "Failed to write num_buffers");
+                               vhost_discard_vq_desc(vq, headcount);
+                               break;
+                       }
+               }
                if (err > len) {
                        pr_err("Discarded truncated rx packet: "
                               " len %d > %zd\n", err, len);
-                       vhost_discard_vq_desc(vq);
+                       vhost_discard_vq_desc(vq, headcount);
                        continue;
                }
                len = err;
-               err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, 
hdr_size);
-               if (err) {
-                       vq_err(vq, "Unable to write vnet_hdr at addr %p: 
%d\n",
-                              vq->iov->iov_base, err);
-                       break;
-               }
-               len += hdr_size;
-               vhost_add_used_and_signal(&net->dev, vq, head, len);
+ vhost_add_used_and_signal(&net->dev,vq,vq->heads,headcount,1);
                if (unlikely(vq_log))
                        vhost_log_write(vq, vq_log, log, len);
                total_len += len;
@@ -560,9 +541,6 @@ done:
 
 static int vhost_net_set_features(struct vhost_net *n, u64 features)
 {
-       size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
-               sizeof(struct virtio_net_hdr) : 0;
-       int i;
        mutex_lock(&n->dev.mutex);
        if ((features & (1 << VHOST_F_LOG_ALL)) &&
            !vhost_log_access_ok(&n->dev)) {
@@ -571,11 +549,6 @@ static int vhost_net_set_features(struct
        }
        n->dev.acked_features = features;
        smp_wmb();
-       for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
-               mutex_lock(&n->vqs[i].mutex);
-               n->vqs[i].hdr_size = hdr_size;
-               mutex_unlock(&n->vqs[i].mutex);
-       }
        vhost_net_flush(n);
        mutex_unlock(&n->dev.mutex);
        return 0;
diff -ruNp net-next-p0/drivers/vhost/vhost.c 
net-next-p3/drivers/vhost/vhost.c
--- net-next-p0/drivers/vhost/vhost.c   2010-03-22 12:04:38.000000000 
-0700
+++ net-next-p3/drivers/vhost/vhost.c   2010-03-29 20:12:42.000000000 
-0700
@@ -113,7 +113,7 @@ static void vhost_vq_reset(struct vhost_
        vq->used_flags = 0;
        vq->log_used = false;
        vq->log_addr = -1ull;
-       vq->hdr_size = 0;
+       vq->rxmaxheadcount = 0;
        vq->private_data = NULL;
        vq->log_base = NULL;
        vq->error_ctx = NULL;
@@ -410,6 +410,7 @@ static long vhost_set_vring(struct vhost
                vq->last_avail_idx = s.num;
                /* Forget the cached index value. */
                vq->avail_idx = vq->last_avail_idx;
+               vq->rxmaxheadcount = 0;
                break;
        case VHOST_GET_VRING_BASE:
                s.index = idx;
@@ -856,6 +857,48 @@ static unsigned get_indirect(struct vhos
        return 0;
 }
 
+/* This is a multi-head version of vhost_get_vq_desc
+ * @vq         - the relevant virtqueue
+ * datalen     - data length we'll be reading
+ * @iovcount   - returned count of io vectors we fill
+ * @log                - vhost log
+ * @log_num    - log offset
+ */
+unsigned vhost_get_heads(struct vhost_virtqueue *vq, int datalen, int 
*iovcount,
+       struct vhost_log *log, unsigned int *log_num)
+{
+       struct iovec *heads = vq->heads;
+       int out, in = 0;
+       int seg = 0;            /* iov index */
+       int hc = 0;             /* head count */
+
+       while (datalen > 0) {
+               if (hc >= VHOST_NET_MAX_SG) {
+                       vhost_discard_vq_desc(vq, hc);
+                       return 0;
+               }
+               heads[hc].iov_base = (void *)vhost_get_vq_desc(vq->dev, 
vq,
+                       vq->iov+seg, ARRAY_SIZE(vq->iov)-seg, &out, &in, 
log,
+                       log_num);
+               if (heads[hc].iov_base == (void *)vq->num) {
+                       vhost_discard_vq_desc(vq, hc);
+                       return 0;
+               }
+               if (out || in <= 0) {
+                       vq_err(vq, "unexpected descriptor format for RX: "
+                               "out %d, in %d\n", out, in);
+                       vhost_discard_vq_desc(vq, hc);
+                       return 0;
+               }
+               heads[hc].iov_len = iov_length(vq->iov+seg, in);
+               datalen -= heads[hc].iov_len;
+               hc++;
+               seg += in;
+       }
+       *iovcount = seg;
+       return hc;
+}
+
 /* This looks in the virtqueue and for the first available buffer, and 
converts
  * it to an iovec for convenient access.  Since descriptors consist of 
some
  * number of output then some number of input descriptors, it's actually 
two
@@ -981,31 +1024,36 @@ unsigned vhost_get_vq_desc(struct vhost_
 }
 
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
+void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
 {
-       vq->last_avail_idx--;
+       vq->last_avail_idx -= n;
 }
 
 /* After we've used one of their buffers, we tell them about it.  We'll 
then
  * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int 
len)
+int vhost_add_used(struct vhost_virtqueue *vq, struct iovec *heads, int 
count)
 {
-       struct vring_used_elem *used;
+       struct vring_used_elem *used = 0;
+       int i;
 
-       /* The virtqueue contains a ring of used buffers.  Get a pointer 
to the
-        * next entry in that used ring. */
-       used = &vq->used->ring[vq->last_used_idx % vq->num];
-       if (put_user(head, &used->id)) {
-               vq_err(vq, "Failed to write used id");
-               return -EFAULT;
-       }
-       if (put_user(len, &used->len)) {
-               vq_err(vq, "Failed to write used len");
-               return -EFAULT;
+       if (count <= 0)
+               return -EINVAL;
+
+       for (i = 0; i < count; ++i) {
+               used = &vq->used->ring[vq->last_used_idx % vq->num];
+               if (put_user((unsigned)heads[i].iov_base, &used->id)) {
+                       vq_err(vq, "Failed to write used id");
+                       return -EFAULT;
+               }
+               if (put_user(heads[i].iov_len, &used->len)) {
+                       vq_err(vq, "Failed to write used len");
+                       return -EFAULT;
+               }
+               vq->last_used_idx++;
        }
        /* Make sure buffer is written before we update index. */
        smp_wmb();
-       if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
+       if (put_user(vq->last_used_idx, &vq->used->idx)) {
                vq_err(vq, "Failed to increment used idx");
                return -EFAULT;
        }
@@ -1023,22 +1071,35 @@ int vhost_add_used(struct vhost_virtqueu
                if (vq->log_ctx)
                        eventfd_signal(vq->log_ctx, 1);
        }
-       vq->last_used_idx++;
        return 0;
 }
 
+int vhost_available(struct vhost_virtqueue *vq)
+{
+       int avail;
+
+       if (!vq->rxmaxheadcount)        /* haven't got any yet */
+               return 1;
+       avail = vq->avail_idx - vq->last_avail_idx;
+       if (avail < 0)
+               avail += 0x10000; /* wrapped */
+       return avail;
+}
+
 /* This actually signals the guest, using eventfd. */
-void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq, bool 
recvr)
 {
        __u16 flags = 0;
+
        if (get_user(flags, &vq->avail->flags)) {
                vq_err(vq, "Failed to get flags");
                return;
        }
 
-       /* If they don't want an interrupt, don't signal, unless empty. */
+       /* If they don't want an interrupt, don't signal, unless
+        * empty or receiver can't get a max-sized packet. */
        if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
-           (vq->avail_idx != vq->last_avail_idx ||
+           (!recvr || vhost_available(vq) >= vq->rxmaxheadcount ||
             !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY)))
                return;
 
@@ -1050,14 +1111,14 @@ void vhost_signal(struct vhost_dev *dev,
 /* And here's the combo meal deal.  Supersize me! */
 void vhost_add_used_and_signal(struct vhost_dev *dev,
                               struct vhost_virtqueue *vq,
-                              unsigned int head, int len)
+                              struct iovec *heads, int count, bool recvr)
 {
-       vhost_add_used(vq, head, len);
-       vhost_signal(dev, vq);
+       vhost_add_used(vq, heads, count);
+       vhost_signal(dev, vq, recvr);
 }
 
 /* OK, now we need to know about added descriptors. */
-bool vhost_enable_notify(struct vhost_virtqueue *vq)
+bool vhost_enable_notify(struct vhost_virtqueue *vq, bool recvr)
 {
        u16 avail_idx;
        int r;
@@ -1080,6 +1141,8 @@ bool vhost_enable_notify(struct vhost_vi
                return false;
        }
 
+       if (recvr && vq->rxmaxheadcount)
+               return (avail_idx - vq->last_avail_idx) >= 
vq->rxmaxheadcount;
        return avail_idx != vq->last_avail_idx;
 }
 
diff -ruNp net-next-p0/drivers/vhost/vhost.h 
net-next-p3/drivers/vhost/vhost.h
--- net-next-p0/drivers/vhost/vhost.h   2010-03-22 12:04:38.000000000 
-0700
+++ net-next-p3/drivers/vhost/vhost.h   2010-03-29 20:07:17.000000000 
-0700
@@ -82,9 +82,9 @@ struct vhost_virtqueue {
        u64 log_addr;
 
        struct iovec indirect[VHOST_NET_MAX_SG];
-       struct iovec iov[VHOST_NET_MAX_SG];
-       struct iovec hdr[VHOST_NET_MAX_SG];
-       size_t hdr_size;
+       struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr */
+       struct iovec heads[VHOST_NET_MAX_SG];
+       int rxmaxheadcount;
        /* We use a kind of RCU to access private pointer.
         * All readers access it from workqueue, which makes it possible 
to
         * flush the workqueue instead of synchronize_rcu. Therefore 
readers do
@@ -120,18 +120,20 @@ long vhost_dev_ioctl(struct vhost_dev *,
 int vhost_vq_access_ok(struct vhost_virtqueue *vq);
 int vhost_log_access_ok(struct vhost_dev *);
 
+unsigned vhost_get_heads(struct vhost_virtqueue *, int datalen, int 
*iovcount,
+                       struct vhost_log *log, unsigned int *log_num);
 unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
                           struct iovec iov[], unsigned int iov_count,
                           unsigned int *out_num, unsigned int *in_num,
                           struct vhost_log *log, unsigned int *log_num);
-void vhost_discard_vq_desc(struct vhost_virtqueue *);
+void vhost_discard_vq_desc(struct vhost_virtqueue *, int);
 
-int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
+int vhost_add_used(struct vhost_virtqueue *, struct iovec *heads, int 
count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue 
*,
-                              unsigned int head, int len);
+                              struct iovec *heads, int count, bool 
recvr);
+void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *, bool);
 void vhost_disable_notify(struct vhost_virtqueue *);
-bool vhost_enable_notify(struct vhost_virtqueue *);
+bool vhost_enable_notify(struct vhost_virtqueue *, bool);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
                    unsigned int log_num, u64 len);
@@ -149,7 +151,8 @@ enum {
        VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
                         (1 << VIRTIO_RING_F_INDIRECT_DESC) |
                         (1 << VHOST_F_LOG_ALL) |
-                        (1 << VHOST_NET_F_VIRTIO_NET_HDR),
+                        (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
+                        (1 << VIRTIO_NET_F_MRG_RXBUF),
 };
 
 static inline int vhost_has_feature(struct vhost_dev *dev, int bit)


[-- Attachment #2: MRXB-3.patch --]
[-- Type: application/octet-stream, Size: 16655 bytes --]

diff -ruNp net-next-p0/drivers/vhost/net.c net-next-p3/drivers/vhost/net.c
--- net-next-p0/drivers/vhost/net.c	2010-03-22 12:04:38.000000000 -0700
+++ net-next-p3/drivers/vhost/net.c	2010-03-30 12:50:57.000000000 -0700
@@ -54,26 +54,6 @@ struct vhost_net {
 	enum vhost_net_poll_state tx_poll_state;
 };
 
-/* Pop first len bytes from iovec. Return number of segments used. */
-static int move_iovec_hdr(struct iovec *from, struct iovec *to,
-			  size_t len, int iov_count)
-{
-	int seg = 0;
-	size_t size;
-	while (len && seg < iov_count) {
-		size = min(from->iov_len, len);
-		to->iov_base = from->iov_base;
-		to->iov_len = size;
-		from->iov_len -= size;
-		from->iov_base += size;
-		len -= size;
-		++from;
-		++to;
-		++seg;
-	}
-	return seg;
-}
-
 /* Caller must have TX VQ lock */
 static void tx_poll_stop(struct vhost_net *net)
 {
@@ -97,7 +77,8 @@ static void tx_poll_start(struct vhost_n
 static void handle_tx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
-	unsigned head, out, in, s;
+	unsigned out, in;
+	struct iovec head;
 	struct msghdr msg = {
 		.msg_name = NULL,
 		.msg_namelen = 0,
@@ -108,8 +89,8 @@ static void handle_tx(struct vhost_net *
 	};
 	size_t len, total_len = 0;
 	int err, wmem;
-	size_t hdr_size;
 	struct socket *sock = rcu_dereference(vq->private_data);
+
 	if (!sock)
 		return;
 
@@ -127,22 +108,19 @@ static void handle_tx(struct vhost_net *
 
 	if (wmem < sock->sk->sk_sndbuf / 2)
 		tx_poll_stop(net);
-	hdr_size = vq->hdr_size;
 
 	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 NULL, NULL);
+		head.iov_base = (void *)vhost_get_vq_desc(&net->dev, vq,
+			vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, NULL);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		if (head.iov_base == (void *)vq->num) {
 			wmem = atomic_read(&sock->sk->sk_wmem_alloc);
 			if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
 				tx_poll_start(net, sock);
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
 			}
-			if (unlikely(vhost_enable_notify(vq))) {
+			if (unlikely(vhost_enable_notify(vq, 0))) {
 				vhost_disable_notify(vq);
 				continue;
 			}
@@ -154,27 +132,30 @@ static void handle_tx(struct vhost_net *
 			break;
 		}
 		/* Skip header. TODO: support TSO. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
 		msg.msg_iovlen = out;
-		len = iov_length(vq->iov, out);
+		head.iov_len = len = iov_length(vq->iov, out);
+
 		/* Sanity check */
 		if (!len) {
-			vq_err(vq, "Unexpected header len for TX: "
-			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
+			vq_err(vq, "Unexpected buffer len for TX: %zd ", len);
 			break;
 		}
-		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq);
-			tx_poll_start(net, sock);
+			if (err == -EAGAIN) {
+				vhost_discard_vq_desc(vq, 1);
+				tx_poll_start(net, sock);
+			} else {
+				vq_err(vq, "sendmsg: errno %d\n", -err);
+				/* drop packet; do not discard/resend */
+				vhost_add_used_and_signal(&net->dev, vq, &head,
+							  1, 0);
+			}
 			break;
 		}
 		if (err != len)
-			pr_err("Truncated TX packet: "
-			       " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+			pr_err("Truncated TX packet: len %d != %zd\n", err, len);
+		vhost_add_used_and_signal(&net->dev, vq, &head, 1, 0);
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
@@ -186,12 +167,25 @@ static void handle_tx(struct vhost_net *
 	unuse_mm(net->dev.mm);
 }
 
+static int vhost_head_len(struct sock *sk)
+{
+	struct sk_buff *head;
+	int len = 0;
+
+	lock_sock(sk);
+	head = skb_peek(&sk->sk_receive_queue);
+	if (head)
+		len = head->len;
+	release_sock(sk);
+	return len;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
-	unsigned head, out, in, log, s;
+	unsigned in, log;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
 		.msg_name = NULL,
@@ -202,34 +196,25 @@ static void handle_rx(struct vhost_net *
 		.msg_flags = MSG_DONTWAIT,
 	};
 
-	struct virtio_net_hdr hdr = {
-		.flags = 0,
-		.gso_type = VIRTIO_NET_HDR_GSO_NONE
-	};
-
 	size_t len, total_len = 0;
-	int err;
-	size_t hdr_size;
+	int err, headcount, datalen;
 	struct socket *sock = rcu_dereference(vq->private_data);
+
 	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
 		return;
 
 	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
-	hdr_size = vq->hdr_size;
 
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
 
-	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 vq_log, &log);
+	while ((datalen = vhost_head_len(sock->sk))) {
+		headcount = vhost_get_heads(vq, datalen, &in, vq_log, &log);
 		/* OK, now we need to know about added descriptors. */
-		if (head == vq->num) {
-			if (unlikely(vhost_enable_notify(vq))) {
+		if (!headcount) {
+			if (unlikely(vhost_enable_notify(vq, 1))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */
 				vhost_disable_notify(vq);
@@ -240,46 +225,42 @@ static void handle_rx(struct vhost_net *
 			break;
 		}
 		/* We don't need to be notified again. */
-		if (out) {
-			vq_err(vq, "Unexpected descriptor format for RX: "
-			       "out %d, int %d\n",
-			       out, in);
-			break;
-		}
-		/* Skip header. TODO: support TSO/mergeable rx buffers. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
+		if (vq->rxmaxheadcount < headcount)
+			vq->rxmaxheadcount = headcount;
+		/* Skip header. TODO: support TSO. */
 		msg.msg_iovlen = in;
 		len = iov_length(vq->iov, in);
 		/* Sanity check */
 		if (!len) {
-			vq_err(vq, "Unexpected header len for RX: "
-			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
+			vq_err(vq, "Unexpected buffer len for RX: %zd\n", len);
 			break;
 		}
 		err = sock->ops->recvmsg(NULL, sock, &msg,
 					 len, MSG_DONTWAIT | MSG_TRUNC);
-		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_vq_desc(vq, headcount);
 			break;
 		}
 		/* TODO: Should check and handle checksum. */
+		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
+			struct virtio_net_hdr_mrg_rxbuf *vhdr =
+				(struct virtio_net_hdr_mrg_rxbuf *)
+				vq->iov[0].iov_base;
+			/* add num_bufs */
+			if (put_user(headcount, &vhdr->num_buffers)) {
+				vq_err(vq, "Failed to write num_buffers");
+				vhost_discard_vq_desc(vq, headcount);
+				break;
+			}
+		}
 		if (err > len) {
 			pr_err("Discarded truncated rx packet: "
 			       " len %d > %zd\n", err, len);
-			vhost_discard_vq_desc(vq);
+			vhost_discard_vq_desc(vq, headcount);
 			continue;
 		}
 		len = err;
-		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
-		if (err) {
-			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
-			       vq->iov->iov_base, err);
-			break;
-		}
-		len += hdr_size;
-		vhost_add_used_and_signal(&net->dev, vq, head, len);
+		vhost_add_used_and_signal(&net->dev,vq,vq->heads,headcount,1);
 		if (unlikely(vq_log))
 			vhost_log_write(vq, vq_log, log, len);
 		total_len += len;
@@ -560,9 +541,6 @@ done:
 
 static int vhost_net_set_features(struct vhost_net *n, u64 features)
 {
-	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
-		sizeof(struct virtio_net_hdr) : 0;
-	int i;
 	mutex_lock(&n->dev.mutex);
 	if ((features & (1 << VHOST_F_LOG_ALL)) &&
 	    !vhost_log_access_ok(&n->dev)) {
@@ -571,11 +549,6 @@ static int vhost_net_set_features(struct
 	}
 	n->dev.acked_features = features;
 	smp_wmb();
-	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
-		mutex_lock(&n->vqs[i].mutex);
-		n->vqs[i].hdr_size = hdr_size;
-		mutex_unlock(&n->vqs[i].mutex);
-	}
 	vhost_net_flush(n);
 	mutex_unlock(&n->dev.mutex);
 	return 0;
diff -ruNp net-next-p0/drivers/vhost/vhost.c net-next-p3/drivers/vhost/vhost.c
--- net-next-p0/drivers/vhost/vhost.c	2010-03-22 12:04:38.000000000 -0700
+++ net-next-p3/drivers/vhost/vhost.c	2010-03-29 20:12:42.000000000 -0700
@@ -113,7 +113,7 @@ static void vhost_vq_reset(struct vhost_
 	vq->used_flags = 0;
 	vq->log_used = false;
 	vq->log_addr = -1ull;
-	vq->hdr_size = 0;
+	vq->rxmaxheadcount = 0;
 	vq->private_data = NULL;
 	vq->log_base = NULL;
 	vq->error_ctx = NULL;
@@ -410,6 +410,7 @@ static long vhost_set_vring(struct vhost
 		vq->last_avail_idx = s.num;
 		/* Forget the cached index value. */
 		vq->avail_idx = vq->last_avail_idx;
+		vq->rxmaxheadcount = 0;
 		break;
 	case VHOST_GET_VRING_BASE:
 		s.index = idx;
@@ -856,6 +857,48 @@ static unsigned get_indirect(struct vhos
 	return 0;
 }
 
+/* This is a multi-head version of vhost_get_vq_desc
+ * @vq		- the relevant virtqueue
+ * datalen	- data length we'll be reading
+ * @iovcount	- returned count of io vectors we fill
+ * @log		- vhost log
+ * @log_num	- log offset
+ */
+unsigned vhost_get_heads(struct vhost_virtqueue *vq, int datalen, int *iovcount,
+	struct vhost_log *log, unsigned int *log_num)
+{
+	struct iovec *heads = vq->heads;
+	int out, in = 0;
+	int seg = 0;		/* iov index */
+	int hc = 0;		/* head count */
+
+	while (datalen > 0) {
+		if (hc >= VHOST_NET_MAX_SG) {
+			vhost_discard_vq_desc(vq, hc);
+			return 0;
+		}
+		heads[hc].iov_base = (void *)vhost_get_vq_desc(vq->dev, vq,
+			vq->iov+seg, ARRAY_SIZE(vq->iov)-seg, &out, &in, log,
+			log_num);
+		if (heads[hc].iov_base == (void *)vq->num) {
+			vhost_discard_vq_desc(vq, hc);
+			return 0;
+		}
+		if (out || in <= 0) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+				"out %d, in %d\n", out, in);
+			vhost_discard_vq_desc(vq, hc);
+			return 0;
+		}
+		heads[hc].iov_len = iov_length(vq->iov+seg, in);
+		datalen -= heads[hc].iov_len;
+		hc++;
+		seg += in;
+	}
+	*iovcount = seg;
+	return hc;
+}
+
 /* This looks in the virtqueue and for the first available buffer, and converts
  * it to an iovec for convenient access.  Since descriptors consist of some
  * number of output then some number of input descriptors, it's actually two
@@ -981,31 +1024,36 @@ unsigned vhost_get_vq_desc(struct vhost_
 }
 
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
+void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
 {
-	vq->last_avail_idx--;
+	vq->last_avail_idx -= n;
 }
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
+int vhost_add_used(struct vhost_virtqueue *vq, struct iovec *heads, int count)
 {
-	struct vring_used_elem *used;
+	struct vring_used_elem *used = 0;
+	int i;
 
-	/* The virtqueue contains a ring of used buffers.  Get a pointer to the
-	 * next entry in that used ring. */
-	used = &vq->used->ring[vq->last_used_idx % vq->num];
-	if (put_user(head, &used->id)) {
-		vq_err(vq, "Failed to write used id");
-		return -EFAULT;
-	}
-	if (put_user(len, &used->len)) {
-		vq_err(vq, "Failed to write used len");
-		return -EFAULT;
+	if (count <= 0)
+		return -EINVAL;
+
+	for (i = 0; i < count; ++i) {
+		used = &vq->used->ring[vq->last_used_idx % vq->num];
+		if (put_user((unsigned)heads[i].iov_base, &used->id)) {
+			vq_err(vq, "Failed to write used id");
+			return -EFAULT;
+		}
+		if (put_user(heads[i].iov_len, &used->len)) {
+			vq_err(vq, "Failed to write used len");
+			return -EFAULT;
+		}
+		vq->last_used_idx++;
 	}
 	/* Make sure buffer is written before we update index. */
 	smp_wmb();
-	if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
+	if (put_user(vq->last_used_idx, &vq->used->idx)) {
 		vq_err(vq, "Failed to increment used idx");
 		return -EFAULT;
 	}
@@ -1023,22 +1071,35 @@ int vhost_add_used(struct vhost_virtqueu
 		if (vq->log_ctx)
 			eventfd_signal(vq->log_ctx, 1);
 	}
-	vq->last_used_idx++;
 	return 0;
 }
 
+int vhost_available(struct vhost_virtqueue *vq)
+{
+	int avail;
+
+	if (!vq->rxmaxheadcount)	/* haven't got any yet */
+		return 1;
+	avail = vq->avail_idx - vq->last_avail_idx;
+	if (avail < 0)
+		avail += 0x10000; /* wrapped */
+	return avail;
+}
+
 /* This actually signals the guest, using eventfd. */
-void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq, bool recvr)
 {
 	__u16 flags = 0;
+
 	if (get_user(flags, &vq->avail->flags)) {
 		vq_err(vq, "Failed to get flags");
 		return;
 	}
 
-	/* If they don't want an interrupt, don't signal, unless empty. */
+	/* If they don't want an interrupt, don't signal, unless
+	 * empty or receiver can't get a max-sized packet. */
 	if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
-	    (vq->avail_idx != vq->last_avail_idx ||
+	    (!recvr || vhost_available(vq) >= vq->rxmaxheadcount ||
 	     !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY)))
 		return;
 
@@ -1050,14 +1111,14 @@ void vhost_signal(struct vhost_dev *dev,
 /* And here's the combo meal deal.  Supersize me! */
 void vhost_add_used_and_signal(struct vhost_dev *dev,
 			       struct vhost_virtqueue *vq,
-			       unsigned int head, int len)
+			       struct iovec *heads, int count, bool recvr)
 {
-	vhost_add_used(vq, head, len);
-	vhost_signal(dev, vq);
+	vhost_add_used(vq, heads, count);
+	vhost_signal(dev, vq, recvr);
 }
 
 /* OK, now we need to know about added descriptors. */
-bool vhost_enable_notify(struct vhost_virtqueue *vq)
+bool vhost_enable_notify(struct vhost_virtqueue *vq, bool recvr)
 {
 	u16 avail_idx;
 	int r;
@@ -1080,6 +1141,8 @@ bool vhost_enable_notify(struct vhost_vi
 		return false;
 	}
 
+	if (recvr && vq->rxmaxheadcount)
+		return (avail_idx - vq->last_avail_idx) >= vq->rxmaxheadcount;
 	return avail_idx != vq->last_avail_idx;
 }
 
diff -ruNp net-next-p0/drivers/vhost/vhost.h net-next-p3/drivers/vhost/vhost.h
--- net-next-p0/drivers/vhost/vhost.h	2010-03-22 12:04:38.000000000 -0700
+++ net-next-p3/drivers/vhost/vhost.h	2010-03-29 20:07:17.000000000 -0700
@@ -82,9 +82,9 @@ struct vhost_virtqueue {
 	u64 log_addr;
 
 	struct iovec indirect[VHOST_NET_MAX_SG];
-	struct iovec iov[VHOST_NET_MAX_SG];
-	struct iovec hdr[VHOST_NET_MAX_SG];
-	size_t hdr_size;
+	struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr */
+	struct iovec heads[VHOST_NET_MAX_SG];
+	int rxmaxheadcount;
 	/* We use a kind of RCU to access private pointer.
 	 * All readers access it from workqueue, which makes it possible to
 	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
@@ -120,18 +120,20 @@ long vhost_dev_ioctl(struct vhost_dev *,
 int vhost_vq_access_ok(struct vhost_virtqueue *vq);
 int vhost_log_access_ok(struct vhost_dev *);
 
+unsigned vhost_get_heads(struct vhost_virtqueue *, int datalen, int *iovcount,
+			struct vhost_log *log, unsigned int *log_num);
 unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
 			   struct iovec iov[], unsigned int iov_count,
 			   unsigned int *out_num, unsigned int *in_num,
 			   struct vhost_log *log, unsigned int *log_num);
-void vhost_discard_vq_desc(struct vhost_virtqueue *);
+void vhost_discard_vq_desc(struct vhost_virtqueue *, int);
 
-int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
+int vhost_add_used(struct vhost_virtqueue *, struct iovec *heads, int count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       unsigned int head, int len);
+			       struct iovec *heads, int count, bool recvr);
+void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *, bool);
 void vhost_disable_notify(struct vhost_virtqueue *);
-bool vhost_enable_notify(struct vhost_virtqueue *);
+bool vhost_enable_notify(struct vhost_virtqueue *, bool);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
@@ -149,7 +151,8 @@ enum {
 	VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
 			 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
 			 (1 << VHOST_F_LOG_ALL) |
-			 (1 << VHOST_NET_F_VIRTIO_NET_HDR),
+			 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
+			 (1 << VIRTIO_NET_F_MRG_RXBUF),
 };
 
 static inline int vhost_has_feature(struct vhost_dev *dev, int bit)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] Add Mergeable RX buffer feature to vhost_net
  2010-03-31  1:23       ` [PATCH v2] Add Mergeable RX buffer feature to vhost_net David Stevens
@ 2010-03-31 12:02         ` Michael S. Tsirkin
  2010-03-31 22:04           ` David Stevens
  0 siblings, 1 reply; 10+ messages in thread
From: Michael S. Tsirkin @ 2010-03-31 12:02 UTC (permalink / raw)
  To: David Stevens; +Cc: kvm, kvm-owner, netdev, rusty, virtualization

On Tue, Mar 30, 2010 at 07:23:48PM -0600, David Stevens wrote:
> This patch adds support for the Mergeable Receive Buffers feature to
> vhost_net.
> 
> Changes:
>         1) generalize descriptor allocation functions to allow multiple
>                 descriptors per packet
>         2) add socket "peek" to know datalen at buffer allocation time
>         3) change notification to enable a multi-buffer max packet, rather
>                 than the single-buffer run until completely empty
> 
> Changes from previous revision:
> 1) incorporate review comments from Michael Tsirkin
> 2) assume use of TUNSETVNETHDRSZ ioctl by qemu, which simplifies vnet 
> header
>         processing
> 3) fixed notification code to only affect the receive side
> 
> Signed-Off-By: David L Stevens <dlstevens@us.ibm.com>
> 
> [in-line for review, attached for applying w/o whitespace mangling]

attached patch seems to be whitespace damaged as well.
Does the origin pass checkpatch.pl for you?

> diff -ruNp net-next-p0/drivers/vhost/net.c net-next-p3/drivers/vhost/net.c
> --- net-next-p0/drivers/vhost/net.c     2010-03-22 12:04:38.000000000 
> -0700
> +++ net-next-p3/drivers/vhost/net.c     2010-03-30 12:50:57.000000000 
> -0700
> @@ -54,26 +54,6 @@ struct vhost_net {
>         enum vhost_net_poll_state tx_poll_state;
>  };
>  
> -/* Pop first len bytes from iovec. Return number of segments used. */
> -static int move_iovec_hdr(struct iovec *from, struct iovec *to,
> -                         size_t len, int iov_count)
> -{
> -       int seg = 0;
> -       size_t size;
> -       while (len && seg < iov_count) {
> -               size = min(from->iov_len, len);
> -               to->iov_base = from->iov_base;
> -               to->iov_len = size;
> -               from->iov_len -= size;
> -               from->iov_base += size;
> -               len -= size;
> -               ++from;
> -               ++to;
> -               ++seg;
> -       }
> -       return seg;
> -}
> -
>  /* Caller must have TX VQ lock */
>  static void tx_poll_stop(struct vhost_net *net)
>  {
> @@ -97,7 +77,8 @@ static void tx_poll_start(struct vhost_n
>  static void handle_tx(struct vhost_net *net)
>  {
>         struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> -       unsigned head, out, in, s;
> +       unsigned out, in;
> +       struct iovec head;
>         struct msghdr msg = {
>                 .msg_name = NULL,
>                 .msg_namelen = 0,
> @@ -108,8 +89,8 @@ static void handle_tx(struct vhost_net *
>         };
>         size_t len, total_len = 0;
>         int err, wmem;
> -       size_t hdr_size;
>         struct socket *sock = rcu_dereference(vq->private_data);
> +
>         if (!sock)
>                 return;
>  
> @@ -127,22 +108,19 @@ static void handle_tx(struct vhost_net *
>  
>         if (wmem < sock->sk->sk_sndbuf / 2)
>                 tx_poll_stop(net);
> -       hdr_size = vq->hdr_size;
>  
>         for (;;) {
> -               head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> -                                        ARRAY_SIZE(vq->iov),
> -                                        &out, &in,
> -                                        NULL, NULL);
> +               head.iov_base = (void *)vhost_get_vq_desc(&net->dev, vq,
> +                       vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, 
> NULL);

I find this casting confusing.
Is it really expensive to add an array of heads so that
we do not need to cast?


>                 /* Nothing new?  Wait for eventfd to tell us they 
> refilled. */
> -               if (head == vq->num) {
> +               if (head.iov_base == (void *)vq->num) {
>                         wmem = atomic_read(&sock->sk->sk_wmem_alloc);
>                         if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
>                                 tx_poll_start(net, sock);
>                                 set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
>                                 break;
>                         }
> -                       if (unlikely(vhost_enable_notify(vq))) {
> +                       if (unlikely(vhost_enable_notify(vq, 0))) {
>                                 vhost_disable_notify(vq);
>                                 continue;
>                         }
> @@ -154,27 +132,30 @@ static void handle_tx(struct vhost_net *
>                         break;
>                 }
>                 /* Skip header. TODO: support TSO. */
> -               s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
>                 msg.msg_iovlen = out;
> -               len = iov_length(vq->iov, out);
> +               head.iov_len = len = iov_length(vq->iov, out);
> +
>                 /* Sanity check */
>                 if (!len) {
> -                       vq_err(vq, "Unexpected header len for TX: "
> -                              "%zd expected %zd\n",
> -                              iov_length(vq->hdr, s), hdr_size);
> +                       vq_err(vq, "Unexpected buffer len for TX: %zd ", 
> len);
>                         break;
>                 }
> -               /* TODO: Check specific error and bomb out unless ENOBUFS? 
> */
>                 err = sock->ops->sendmsg(NULL, sock, &msg, len);
>                 if (unlikely(err < 0)) {
> -                       vhost_discard_vq_desc(vq);
> -                       tx_poll_start(net, sock);
> +                       if (err == -EAGAIN) {
> +                               vhost_discard_vq_desc(vq, 1);
> +                               tx_poll_start(net, sock);
> +                       } else {
> +                               vq_err(vq, "sendmsg: errno %d\n", -err);
> +                               /* drop packet; do not discard/resend */
> +                               vhost_add_used_and_signal(&net->dev, vq, 
> &head,
> +                                                         1, 0);
> +                       }
>                         break;
>                 }
>                 if (err != len)
> -                       pr_err("Truncated TX packet: "
> -                              " len %d != %zd\n", err, len);
> -               vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +                       pr_err("Truncated TX packet: len %d != %zd\n", 
> err, len);
> +               vhost_add_used_and_signal(&net->dev, vq, &head, 1, 0);
>                 total_len += len;
>                 if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>                         vhost_poll_queue(&vq->poll);
> @@ -186,12 +167,25 @@ static void handle_tx(struct vhost_net *
>         unuse_mm(net->dev.mm);
>  }
>  
> +static int vhost_head_len(struct sock *sk)
> +{
> +       struct sk_buff *head;
> +       int len = 0;
> +
> +       lock_sock(sk);
> +       head = skb_peek(&sk->sk_receive_queue);
> +       if (head)
> +               len = head->len;
> +       release_sock(sk);
> +       return len;
> +}
> +
>  /* Expects to be always run from workqueue - which acts as
>   * read-size critical section for our kind of RCU. */
>  static void handle_rx(struct vhost_net *net)
>  {
>         struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> -       unsigned head, out, in, log, s;
> +       unsigned in, log;
>         struct vhost_log *vq_log;
>         struct msghdr msg = {
>                 .msg_name = NULL,
> @@ -202,34 +196,25 @@ static void handle_rx(struct vhost_net *
>                 .msg_flags = MSG_DONTWAIT,
>         };
>  
> -       struct virtio_net_hdr hdr = {
> -               .flags = 0,
> -               .gso_type = VIRTIO_NET_HDR_GSO_NONE
> -       };
> -
>         size_t len, total_len = 0;
> -       int err;
> -       size_t hdr_size;
> +       int err, headcount, datalen;
>         struct socket *sock = rcu_dereference(vq->private_data);
> +
>         if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
>                 return;
>  
>         use_mm(net->dev.mm);
>         mutex_lock(&vq->mutex);
>         vhost_disable_notify(vq);
> -       hdr_size = vq->hdr_size;
>  
>         vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
>                 vq->log : NULL;
>  
> -       for (;;) {
> -               head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> -                                        ARRAY_SIZE(vq->iov),
> -                                        &out, &in,
> -                                        vq_log, &log);
> +       while ((datalen = vhost_head_len(sock->sk))) {
> +               headcount = vhost_get_heads(vq, datalen, &in, vq_log, 
> &log);
>                 /* OK, now we need to know about added descriptors. */
> -               if (head == vq->num) {
> -                       if (unlikely(vhost_enable_notify(vq))) {
> +               if (!headcount) {
> +                       if (unlikely(vhost_enable_notify(vq, 1))) {
>                                 /* They have slipped one in as we were
>                                  * doing that: check again. */
>                                 vhost_disable_notify(vq);
> @@ -240,46 +225,42 @@ static void handle_rx(struct vhost_net *
>                         break;
>                 }
>                 /* We don't need to be notified again. */
> -               if (out) {
> -                       vq_err(vq, "Unexpected descriptor format for RX: "
> -                              "out %d, int %d\n",
> -                              out, in);
> -                       break;
> -               }
> -               /* Skip header. TODO: support TSO/mergeable rx buffers. */
> -               s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> +               if (vq->rxmaxheadcount < headcount)
> +                       vq->rxmaxheadcount = headcount;

This seems the only place where we set the rxmaxheadcount
value. Maybe it can be moved out of vhost.c to net.c?
If vhost library needs this it can get it as function
parameter.

> +               /* Skip header. TODO: support TSO. */

You seem to have removed the code that skips the header.
Won't this break non-GSO backends such as raw?

>                 msg.msg_iovlen = in;
>                 len = iov_length(vq->iov, in);
>                 /* Sanity check */
>                 if (!len) {
> -                       vq_err(vq, "Unexpected header len for RX: "
> -                              "%zd expected %zd\n",
> -                              iov_length(vq->hdr, s), hdr_size);
> +                       vq_err(vq, "Unexpected buffer len for RX: %zd\n", 
> len);
>                         break;
>                 }
>                 err = sock->ops->recvmsg(NULL, sock, &msg,
>                                          len, MSG_DONTWAIT | MSG_TRUNC);
> -               /* TODO: Check specific error and bomb out unless EAGAIN? 
> */

Do you think it's not a good idea?

>                 if (err < 0) {
> -                       vhost_discard_vq_desc(vq);
> +                       vhost_discard_vq_desc(vq, headcount);
>                         break;
>                 }

I think we should detect and discard truncated messages,
since len might not be reliable if userspace pulls
a packet from under us. Also, if new packet is
shorter than the old one, there's no truncation but
headcount is wrong.

So simplest fix IMO would be to compare err with expected len.
If there's a difference, we hit the race and so
we would discard the packet.


>                 /* TODO: Should check and handle checksum. */
> +               if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) 
> {
> +                       struct virtio_net_hdr_mrg_rxbuf *vhdr =
> +                               (struct virtio_net_hdr_mrg_rxbuf *)
> +                               vq->iov[0].iov_base;
> +                       /* add num_bufs */
> +                       if (put_user(headcount, &vhdr->num_buffers)) {
> +                               vq_err(vq, "Failed to write num_buffers");
> +                               vhost_discard_vq_desc(vq, headcount);

Let's do memcpy_to_iovecend etc so that we do not assume layout.
This is also why we need move_iovec: sendmsg might modify the iovec.
It would also be nice not to corrupt memory and
get a reasonable error if buffer size
that we get is smaller than expected header size.

> +                               break;
> +                       }
> +               }
>                 if (err > len) {
>                         pr_err("Discarded truncated rx packet: "
>                                " len %d > %zd\n", err, len);
> -                       vhost_discard_vq_desc(vq);
> +                       vhost_discard_vq_desc(vq, headcount);
>                         continue;
>                 }
>                 len = err;
> -               err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, 
> hdr_size);
> -               if (err) {
> -                       vq_err(vq, "Unable to write vnet_hdr at addr %p: 
> %d\n",
> -                              vq->iov->iov_base, err);
> -                       break;
> -               }
> -               len += hdr_size;

This seems to break non-GSO backends as well.

> -               vhost_add_used_and_signal(&net->dev, vq, head, len);
> + vhost_add_used_and_signal(&net->dev,vq,vq->heads,headcount,1);
>                 if (unlikely(vq_log))
>                         vhost_log_write(vq, vq_log, log, len);
>                 total_len += len;
> @@ -560,9 +541,6 @@ done:
>  
>  static int vhost_net_set_features(struct vhost_net *n, u64 features)
>  {
> -       size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
> -               sizeof(struct virtio_net_hdr) : 0;
> -       int i;
>         mutex_lock(&n->dev.mutex);
>         if ((features & (1 << VHOST_F_LOG_ALL)) &&
>             !vhost_log_access_ok(&n->dev)) {
> @@ -571,11 +549,6 @@ static int vhost_net_set_features(struct
>         }
>         n->dev.acked_features = features;
>         smp_wmb();
> -       for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
> -               mutex_lock(&n->vqs[i].mutex);
> -               n->vqs[i].hdr_size = hdr_size;
> -               mutex_unlock(&n->vqs[i].mutex);

I expect the above is a leftover from the previous version
which calculated header size in kernel?

> -       }
>         vhost_net_flush(n);
>         mutex_unlock(&n->dev.mutex);
>         return 0;
> diff -ruNp net-next-p0/drivers/vhost/vhost.c 
> net-next-p3/drivers/vhost/vhost.c
> --- net-next-p0/drivers/vhost/vhost.c   2010-03-22 12:04:38.000000000 
> -0700
> +++ net-next-p3/drivers/vhost/vhost.c   2010-03-29 20:12:42.000000000 
> -0700
> @@ -113,7 +113,7 @@ static void vhost_vq_reset(struct vhost_
>         vq->used_flags = 0;
>         vq->log_used = false;
>         vq->log_addr = -1ull;
> -       vq->hdr_size = 0;
> +       vq->rxmaxheadcount = 0;
>         vq->private_data = NULL;
>         vq->log_base = NULL;
>         vq->error_ctx = NULL;
> @@ -410,6 +410,7 @@ static long vhost_set_vring(struct vhost
>                 vq->last_avail_idx = s.num;
>                 /* Forget the cached index value. */
>                 vq->avail_idx = vq->last_avail_idx;
> +               vq->rxmaxheadcount = 0;
>                 break;
>         case VHOST_GET_VRING_BASE:
>                 s.index = idx;
> @@ -856,6 +857,48 @@ static unsigned get_indirect(struct vhos
>         return 0;
>  }
>  
> +/* This is a multi-head version of vhost_get_vq_desc

How about:
version of vhost_get_vq_desc that returns multiple
descriptors

> + * @vq         - the relevant virtqueue
> + * datalen     - data length we'll be reading
> + * @iovcount   - returned count of io vectors we fill
> + * @log                - vhost log
> + * @log_num    - log offset

return value?
Also - why unsigned?

> + */
> +unsigned vhost_get_heads(struct vhost_virtqueue *vq, int datalen, int 

Would vhost_get_vq_desc_multiple be a better name?

> *iovcount,
> +       struct vhost_log *log, unsigned int *log_num)
> +{
> +       struct iovec *heads = vq->heads;

I think it's better to pass in heads array than take it from vq->heads.

> +       int out, in = 0;

Why is in initialized here?

> +       int seg = 0;            /* iov index */
> +       int hc = 0;             /* head count */
> +
> +       while (datalen > 0) {

Can't this simply call vhost_get_vq_desc in a loop somehow,
or use a common function that both this and vhost_get_vq_desc call?

> +               if (hc >= VHOST_NET_MAX_SG) {
> +                       vhost_discard_vq_desc(vq, hc);
> +                       return 0;
> +               }
> +               heads[hc].iov_base = (void *)vhost_get_vq_desc(vq->dev, 
> vq,
> +                       vq->iov+seg, ARRAY_SIZE(vq->iov)-seg, &out, &in, 
> log,
> +                       log_num);
> +               if (heads[hc].iov_base == (void *)vq->num) {
> +                       vhost_discard_vq_desc(vq, hc);
> +                       return 0;
> +               }
> +               if (out || in <= 0) {
> +                       vq_err(vq, "unexpected descriptor format for RX: "
> +                               "out %d, in %d\n", out, in);
> +                       vhost_discard_vq_desc(vq, hc);

goto err above might help simplify cleanup.

> +                       return 0;
> +               }
> +               heads[hc].iov_len = iov_length(vq->iov+seg, in);
> +               datalen -= heads[hc].iov_len;
> +               hc++;
> +               seg += in;
> +       }
> +       *iovcount = seg;
> +       return hc;
> +}
> +
>  /* This looks in the virtqueue and for the first available buffer, and 
> converts
>   * it to an iovec for convenient access.  Since descriptors consist of 
> some
>   * number of output then some number of input descriptors, it's actually 
> two
> @@ -981,31 +1024,36 @@ unsigned vhost_get_vq_desc(struct vhost_
>  }
>  
>  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> -void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
> +void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
>  {
> -       vq->last_avail_idx--;
> +       vq->last_avail_idx -= n;
>  }
>  
>  /* After we've used one of their buffers, we tell them about it.  We'll 
> then
>   * want to notify the guest, using eventfd. */
> -int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int 
> len)
> +int vhost_add_used(struct vhost_virtqueue *vq, struct iovec *heads, int 
> count)

count is always 0 for send, right?
I think it is better to have two APIs here as well:
vhost_add_used
vhost_add_used_multiple
we can use functions to avoid code duplication.

>  {
> -       struct vring_used_elem *used;
> +       struct vring_used_elem *used = 0;

why is used initialized here?

> +       int i;
>  
> -       /* The virtqueue contains a ring of used buffers.  Get a pointer 
> to the
> -        * next entry in that used ring. */
> -       used = &vq->used->ring[vq->last_used_idx % vq->num];
> -       if (put_user(head, &used->id)) {
> -               vq_err(vq, "Failed to write used id");
> -               return -EFAULT;
> -       }
> -       if (put_user(len, &used->len)) {
> -               vq_err(vq, "Failed to write used len");
> -               return -EFAULT;
> +       if (count <= 0)
> +               return -EINVAL;
> +
> +       for (i = 0; i < count; ++i) {
> +               used = &vq->used->ring[vq->last_used_idx % vq->num];
> +               if (put_user((unsigned)heads[i].iov_base, &used->id)) {
> +                       vq_err(vq, "Failed to write used id");
> +                       return -EFAULT;
> +               }
> +               if (put_user(heads[i].iov_len, &used->len)) {
> +                       vq_err(vq, "Failed to write used len");
> +                       return -EFAULT;
> +               }
> +               vq->last_used_idx++;

I think we should update last_used_idx on success only,
at the end. Simply use last_used_idx + count
instead of last_used_idx + 1.

>         }
>         /* Make sure buffer is written before we update index. */
>         smp_wmb();
> -       if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
> +       if (put_user(vq->last_used_idx, &vq->used->idx)) {
>                 vq_err(vq, "Failed to increment used idx");
>                 return -EFAULT;
>         }
> @@ -1023,22 +1071,35 @@ int vhost_add_used(struct vhost_virtqueu
>                 if (vq->log_ctx)
>                         eventfd_signal(vq->log_ctx, 1);
>         }
> -       vq->last_used_idx++;
>         return 0;
>  }
>  
> +int vhost_available(struct vhost_virtqueue *vq)

since this function is non-static, please document what it does.

> +{
> +       int avail;
> +
> +       if (!vq->rxmaxheadcount)        /* haven't got any yet */
> +               return 1;

This seems to make net-specific assumptions.
How about moving this check out to net.c?

> +       avail = vq->avail_idx - vq->last_avail_idx;
> +       if (avail < 0)
> +               avail += 0x10000; /* wrapped */

A branch that is likely non-taken. Also, rxmaxheadcount
is really unlikely to get as large as half the ring.
So this just wastes cycles?

> +       return avail;
> +}
> +
>  /* This actually signals the guest, using eventfd. */
> -void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> +void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq, bool 
> recvr)

This one is not elegant. receive is net.c concept, let's not
put it in vhost.c. Pass in headcount if you must.

>  {
>         __u16 flags = 0;
> +
>         if (get_user(flags, &vq->avail->flags)) {
>                 vq_err(vq, "Failed to get flags");
>                 return;
>         }
>  
> -       /* If they don't want an interrupt, don't signal, unless empty. */
> +       /* If they don't want an interrupt, don't signal, unless
> +        * empty or receiver can't get a max-sized packet. */
>         if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
> -           (vq->avail_idx != vq->last_avail_idx ||
> +           (!recvr || vhost_available(vq) >= vq->rxmaxheadcount ||

Is the above really worth the complexity?
Guests can't rely on this kind of fuzzy logic, can they?
Do you see this helping performance at all?

>              !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY)))
>                 return;
>  
> @@ -1050,14 +1111,14 @@ void vhost_signal(struct vhost_dev *dev,
>  /* And here's the combo meal deal.  Supersize me! */
>  void vhost_add_used_and_signal(struct vhost_dev *dev,
>                                struct vhost_virtqueue *vq,
> -                              unsigned int head, int len)
> +                              struct iovec *heads, int count, bool recvr)
>  {
> -       vhost_add_used(vq, head, len);
> -       vhost_signal(dev, vq);
> +       vhost_add_used(vq, heads, count);
> +       vhost_signal(dev, vq, recvr);
>  }
>  
>  /* OK, now we need to know about added descriptors. */
> -bool vhost_enable_notify(struct vhost_virtqueue *vq)
> +bool vhost_enable_notify(struct vhost_virtqueue *vq, bool recvr)
>  {
>         u16 avail_idx;
>         int r;
> @@ -1080,6 +1141,8 @@ bool vhost_enable_notify(struct vhost_vi
>                 return false;
>         }
>  
> +       if (recvr && vq->rxmaxheadcount)
> +               return (avail_idx - vq->last_avail_idx) >= 
> vq->rxmaxheadcount;

The fuzzy logic behind rxmaxheadcount might be a good
optimization, but I am not comfortable using it
for correctness. Maybe vhost_enable_notify should get
the last head so we can redo poll when another one
is added.

>         return avail_idx != vq->last_avail_idx;
>  }
>  
> diff -ruNp net-next-p0/drivers/vhost/vhost.h 
> net-next-p3/drivers/vhost/vhost.h
> --- net-next-p0/drivers/vhost/vhost.h   2010-03-22 12:04:38.000000000 
> -0700
> +++ net-next-p3/drivers/vhost/vhost.h   2010-03-29 20:07:17.000000000 
> -0700
> @@ -82,9 +82,9 @@ struct vhost_virtqueue {
>         u64 log_addr;
>  
>         struct iovec indirect[VHOST_NET_MAX_SG];
> -       struct iovec iov[VHOST_NET_MAX_SG];
> -       struct iovec hdr[VHOST_NET_MAX_SG];
> -       size_t hdr_size;
> +       struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr */

VHOST_NET_MAX_SG should already include iovec vnet hdr.

> +       struct iovec heads[VHOST_NET_MAX_SG];
> +       int rxmaxheadcount;
>         /* We use a kind of RCU to access private pointer.
>          * All readers access it from workqueue, which makes it possible 
> to
>          * flush the workqueue instead of synchronize_rcu. Therefore 
> readers do
> @@ -120,18 +120,20 @@ long vhost_dev_ioctl(struct vhost_dev *,
>  int vhost_vq_access_ok(struct vhost_virtqueue *vq);
>  int vhost_log_access_ok(struct vhost_dev *);
>  
> +unsigned vhost_get_heads(struct vhost_virtqueue *, int datalen, int 
> *iovcount,
> +                       struct vhost_log *log, unsigned int *log_num);
>  unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
>                            struct iovec iov[], unsigned int iov_count,
>                            unsigned int *out_num, unsigned int *in_num,
>                            struct vhost_log *log, unsigned int *log_num);
> -void vhost_discard_vq_desc(struct vhost_virtqueue *);
> +void vhost_discard_vq_desc(struct vhost_virtqueue *, int);
>  
> -int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
> -void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> +int vhost_add_used(struct vhost_virtqueue *, struct iovec *heads, int 
> count);
>  void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue 
> *,
> -                              unsigned int head, int len);
> +                              struct iovec *heads, int count, bool 
> recvr);
> +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *, bool);
>  void vhost_disable_notify(struct vhost_virtqueue *);
> -bool vhost_enable_notify(struct vhost_virtqueue *);
> +bool vhost_enable_notify(struct vhost_virtqueue *, bool);
>  
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>                     unsigned int log_num, u64 len);
> @@ -149,7 +151,8 @@ enum {
>         VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
>                          (1 << VIRTIO_RING_F_INDIRECT_DESC) |
>                          (1 << VHOST_F_LOG_ALL) |
> -                        (1 << VHOST_NET_F_VIRTIO_NET_HDR),
> +                        (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
> +                        (1 << VIRTIO_NET_F_MRG_RXBUF),
>  };
>  
>  static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
> 



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] Add Mergeable RX buffer feature to vhost_net
  2010-03-31 12:02         ` Michael S. Tsirkin
@ 2010-03-31 22:04           ` David Stevens
  2010-04-01 10:54             ` Michael S. Tsirkin
  0 siblings, 1 reply; 10+ messages in thread
From: David Stevens @ 2010-03-31 22:04 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm, kvm-owner, netdev, rusty, virtualization

"Michael S. Tsirkin" <mst@redhat.com> wrote on 03/31/2010 05:02:28 AM:
> 
> attached patch seems to be whitespace damaged as well.
> Does the origin pass checkpatch.pl for you?

        Yes, but I probably broke it in the transfer -- will be more
careful with the next revision.



> > +               head.iov_base = (void *)vhost_get_vq_desc(&net->dev, 
vq,
> > +                       vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, 

> > NULL);
> 
> I find this casting confusing.
> Is it really expensive to add an array of heads so that
> we do not need to cast?

        It needs the heads and the lengths, which looks a lot
like an iovec. I was trying to resist adding a new
struct XXX { unsigned head; unsigned len; } just for this,
but I could make these parallel arrays, one with head index and
the other with length.

> > +               if (vq->rxmaxheadcount < headcount)
> > +                       vq->rxmaxheadcount = headcount;
> 
> This seems the only place where we set the rxmaxheadcount
> value. Maybe it can be moved out of vhost.c to net.c?
> If vhost library needs this it can get it as function
> parameter.

        I can move it to vhost_get_heads(), sure.
> 
> > +               /* Skip header. TODO: support TSO. */
> 
> You seem to have removed the code that skips the header.
> Won't this break non-GSO backends such as raw?

        From our prior discussion, what I had in mind was that
we'd remove all special-header processing by using the ioctl
you added for TUN and later, an equivalent ioctl for raw (when
supported in qemu-kvm). Qemu would arrange headers with tap
(and later raw) to get what the guest expects, and vhost then
just passes all data as-is between the socket and the ring.
        That not only removes the different-header-len code
for mergeable buffers, but also eliminates making a copy of the
header for every packet as before. Vhost has no need to do
anything with the header, or even know its length. It also
doesn't have to care what the type of the backend is-- raw or
tap.
        I think that simplifies everything, but it does mean that
raw socket support requires a header ioctl for the different
vnethdr sizes if the guest wants a vnethdr with and without
mergeable buffers. Actually, I thought this was your idea and
the point of the TUNSETVNETHDRSZ ioctl, but I guess you had
something else in mind?

> > -               /* TODO: Check specific error and bomb out unless 
EAGAIN? 
> > */
> 
> Do you think it's not a good idea?

        EAGAIN is not possible after the change, because we don't
even enter the loop unless we have an skb on the read queue; the
other cases bomb out, so I figured the comment for future work is
now done. :-)

> 
> >                 if (err < 0) {
> > -                       vhost_discard_vq_desc(vq);
> > +                       vhost_discard_vq_desc(vq, headcount);
> >                         break;
> >                 }
> 
> I think we should detect and discard truncated messages,
> since len might not be reliable if userspace pulls
> a packet from under us. Also, if new packet is
> shorter than the old one, there's no truncation but
> headcount is wrong.
> 
> So simplest fix IMO would be to compare err with expected len.
> If there's a difference, we hit the race and so
> we would discard the packet.

Sure.

> 
> 
> >                 /* TODO: Should check and handle checksum. */
> > +               if (vhost_has_feature(&net->dev, 
VIRTIO_NET_F_MRG_RXBUF)) 
> > {
> > +                       struct virtio_net_hdr_mrg_rxbuf *vhdr =
> > +                               (struct virtio_net_hdr_mrg_rxbuf *)
> > +                               vq->iov[0].iov_base;
> > +                       /* add num_bufs */
> > +                       if (put_user(headcount, &vhdr->num_buffers)) {
> > +                               vq_err(vq, "Failed to write 
num_buffers");
> > +                               vhost_discard_vq_desc(vq, headcount);
> 
> Let's do memcpy_to_iovecend etc so that we do not assume layout.
> This is also why we need move_iovec: sendmsg might modify the iovec.
> It would also be nice not to corrupt memory and
> get a reasonable error if buffer size
> that we get is smaller than expected header size.

        I wanted to avoid the header copy here, with the reasonable 
restriction
that the guest gives you a buffer at least big enough for the vnet_hdr. A
length check alone (on iov[0].iov_len) could enforce that without copying
headers back and forth to support silly cases like 8-byte buffers or
num_buffers spanning multiple iovecs, and I think paying the price of the
copy on every packet to support guests that broken isn't worth it.
        So, my preference here would be to add:

        if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
                struct virtio_net_mrg_rxbuf *vhdr...

                if (vq->iov[0].iov_len < sizeof(*vhdr)) {
                        vq_err(vq, "tiny buffers (len %d < %d) are not 
supported",
                                vq->iov[0].iov_len, sizeof(*vhdr));
                        break;
                }
                /* add num_bufs */
                ...

Does that work for you?

> > -               if (err) {
> > -                       vq_err(vq, "Unable to write vnet_hdr at addr 
%p: 
> > %d\n",
> > -                              vq->iov->iov_base, err);
> > -                       break;
> > -               }
> > -               len += hdr_size;
> 
> This seems to break non-GSO backends as well.

I don't have any to test with w/ current qemu-kvm, but again, I thought
your plan was to remove special header processing from vhost and let
the socket end do it via ioctl. Wouldn't a similar ioctl for raw
sockets when supported by qemu do that?


> >         }
> >         n->dev.acked_features = features;
> >         smp_wmb();
> > -       for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
> > -               mutex_lock(&n->vqs[i].mutex);
> > -               n->vqs[i].hdr_size = hdr_size;
> > -               mutex_unlock(&n->vqs[i].mutex);
> 
> I expect the above is a leftover from the previous version
> which calculated header size in kernel?

        Right. With the ioctl, and assuming raw gets a similar
one, we don't need to know anything about the header size in
vhost, except that qemu needs to make sure mergeable rxbufs is
only set when the socket has the larger vnet_hdr that includes
the num_bufs field.


> > @@ -410,6 +410,7 @@ static long vhost_set_vring(struct vhost
> >                 vq->last_avail_idx = s.num;
> >                 /* Forget the cached index value. */
> >                 vq->avail_idx = vq->last_avail_idx;
> > +               vq->rxmaxheadcount = 0;
> >                 break;
> >         case VHOST_GET_VRING_BASE:
> >                 s.index = idx;
> > @@ -856,6 +857,48 @@ static unsigned get_indirect(struct vhos
> >         return 0;
> >  }
> > 
> > +/* This is a multi-head version of vhost_get_vq_desc
> 
> How about:
> version of vhost_get_vq_desc that returns multiple
> descriptors

OK.

> 
> > + * @vq         - the relevant virtqueue
> > + * datalen     - data length we'll be reading
> > + * @iovcount   - returned count of io vectors we fill
> > + * @log                - vhost log
> > + * @log_num    - log offset
> 
> return value?
> Also - why unsigned?

        Returned value is the number of heads, which is
0 or greater.
> 
> > + */
> > +unsigned vhost_get_heads(struct vhost_virtqueue *vq, int datalen, int 

> 
> Would vhost_get_vq_desc_multiple be a better name?

        26 chars for the function name with indentation and
argument list might not be pretty on the calls. What about
changing vhost_get_desc() and vhost_get_desc_n() [make them
both shorter]? ("_n" indicating >= 1 instead of just one).

> > +       struct vhost_log *log, unsigned int *log_num)
> > +{
> > +       struct iovec *heads = vq->heads;
> 
> I think it's better to pass in heads array than take it from vq->heads.

        I'd actually prefer to go the other way, and remove the
log stuff, for example, since they are accessible from vq which
we have already. Adding heads make it look less similar to the
arg list for vhost_get_vq_desc(), but I'm more interested in
getting the feature than particular arg lists, so your call.

> 
> > +       int out, in = 0;
> 
> Why is in initialized here?

Needed in the previous version, but not here.

> 
> > +       int seg = 0;            /* iov index */
> > +       int hc = 0;             /* head count */
> > +
> > +       while (datalen > 0) {
> 
> Can't this simply call vhost_get_vq_desc in a loop somehow,
> or use a common function that both this and vhost_get_vq_desc call?

It is calling vhost_get_vq_desc() in this loop; that's how it
gets the individual heads and iov entries. The rest of it is just
massaging the offsets so vhost_get_vq_desc puts them all in the
right places and tracks the heads and lengths correctly for add_used.

> 
> > +               if (hc >= VHOST_NET_MAX_SG) {
> > +                       vhost_discard_vq_desc(vq, hc);
> > +                       return 0;
> > +               }
> > +               heads[hc].iov_base = (void 
*)vhost_get_vq_desc(vq->dev, 
> > vq,
> > +                       vq->iov+seg, ARRAY_SIZE(vq->iov)-seg, &out, 
&in, 
> > log,
> > +                       log_num);
> > +               if (heads[hc].iov_base == (void *)vq->num) {
> > +                       vhost_discard_vq_desc(vq, hc);
> > +                       return 0;
> > +               }
> > +               if (out || in <= 0) {
> > +                       vq_err(vq, "unexpected descriptor format for 
RX: "
> > +                               "out %d, in %d\n", out, in);
> > +                       vhost_discard_vq_desc(vq, hc);
> 
> goto err above might help simplify cleanup.

I generally would rather see the error cleanup explicitly where the
error detection and message are, for code readability, but I can change
it to a goto here if you think that's clearer.

> > -int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int 

> > len)
> > +int vhost_add_used(struct vhost_virtqueue *vq, struct iovec *heads, 
int 
> > count)
> 
> count is always 0 for send, right?
> I think it is better to have two APIs here as well:
> vhost_add_used
> vhost_add_used_multiple
> we can use functions to avoid code duplication.
> 
> >  {
> > -       struct vring_used_elem *used;
> > +       struct vring_used_elem *used = 0;
> 
> why is used initialized here?

        This is to remove a compiler warning that "used" may
be used uninitialized, for a case that can't happen (count <= 0).


> > +               vq->last_used_idx++;
> 
> I think we should update last_used_idx on success only,
> at the end. Simply use last_used_idx + count
> instead of last_used_idx + 1.

        OK.

> > @@ -1023,22 +1071,35 @@ int vhost_add_used(struct vhost_virtqueu
> >                 if (vq->log_ctx)
> >                         eventfd_signal(vq->log_ctx, 1);
> >         }
> > -       vq->last_used_idx++;
> >         return 0;
> >  }
> > 
> > +int vhost_available(struct vhost_virtqueue *vq)
> 
> since this function is non-static, please document what it does.

        OK.

> > +{
> > +       int avail;
> > +
> > +       if (!vq->rxmaxheadcount)        /* haven't got any yet */
> > +               return 1;
> 
> This seems to make net-specific assumptions.
> How about moving this check out to net.c?

        I can, but its only user is vhost_signal() in this file.
 
> > +       avail = vq->avail_idx - vq->last_avail_idx;
> > +       if (avail < 0)
> > +               avail += 0x10000; /* wrapped */
> 
> A branch that is likely non-taken. Also, rxmaxheadcount
> is really unlikely to get as large as half the ring.
> So this just wastes cycles?

        The indexes are free-running counters, so if
avail_idx has overflowed but last_avail_idx has not
then the result will be (incorrectly) negative. It
has nothing to do with how many buffers are in use--
consider avail_idx = 2 and last_avail_idx = 65534,
then avail = will be -65532, but we want it to be 4.

> 
> > +       return avail;
> > +}
> > +
> >  /* This actually signals the guest, using eventfd. */
> > -void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> > +void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq, 
bool 
> > recvr)
> 
> This one is not elegant. receive is net.c concept, let's not
> put it in vhost.c. Pass in headcount if you must.

        Mainly what I was after here is to not affect signals
for the transmit side. I thought about splitting it into a
separate function entirely for the transmit, but came up with
this, instead. I can't tell if it's a receiver from the vq,
but I don't want to override NOTIFY_ON_EMPTY or the headcount
check (which has nothing to do with transmitters) for the
transmitter at all.
        The vq doesn't tell you if it is a receiver or not,
so I either need a flag as argument to tell, or I need to
split into a transmit vhost_signal and a receive vhost_signal.
        I can do the available check before calling vhost_signal,
but then I need to remove the NOTIFY check entirely so it'll
still notify for the receiver case when available is not
enough but still not 0.
        So, I think the options are:
1) a flag
2) separate transmit or receive signal functions
3) move NOTIFY_ON_EMPTY out of vhost_signal and
        check that or available (for recvr) before
        calling vhost_signal().

> 
> >  {
> >         __u16 flags = 0;
> > +
> >         if (get_user(flags, &vq->avail->flags)) {
> >                 vq_err(vq, "Failed to get flags");
> >                 return;
> >         }
> > 
> > -       /* If they don't want an interrupt, don't signal, unless 
empty. */
> > +       /* If they don't want an interrupt, don't signal, unless
> > +        * empty or receiver can't get a max-sized packet. */
> >         if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
> > -           (vq->avail_idx != vq->last_avail_idx ||
> > +           (!recvr || vhost_available(vq) >= vq->rxmaxheadcount ||
> 
> Is the above really worth the complexity?
> Guests can't rely on this kind of fuzzy logic, can they?
> Do you see this helping performance at all?

        It definitely hurts performance to allocate max-sized buffers
and then free up the excess (the translate_desc is expensive). It is
a heuristic, but it's one that keeps the notifies flowing without
having to fail on a packet to reenable them.
        The notify code has to change because there is no recovery
if we need 5 buffers and have only 3, while notification is
disabled unless we go all the way to 0. We can't honor NOTIFY_ON_EMPTY
in that case because it will never proceed. We can remove the
NOTIFY_ON_EMPTY check in vhost_signal and do it in the caller; then allow
receiver to fail a buffer allocation and signal unconditionally,
but we cannot wait for it to be 0 even if NOTIFY_ON_EMPTY is set--
that's a deadlock.


> >  /* OK, now we need to know about added descriptors. */
> > -bool vhost_enable_notify(struct vhost_virtqueue *vq)
> > +bool vhost_enable_notify(struct vhost_virtqueue *vq, bool recvr)
> >  {
> >         u16 avail_idx;
> >         int r;
> > @@ -1080,6 +1141,8 @@ bool vhost_enable_notify(struct vhost_vi
> >                 return false;
> >         }
> > 
> > +       if (recvr && vq->rxmaxheadcount)
> > +               return (avail_idx - vq->last_avail_idx) >= 
> > vq->rxmaxheadcount;
> 
> The fuzzy logic behind rxmaxheadcount might be a good
> optimization, but I am not comfortable using it
> for correctness. Maybe vhost_enable_notify should get
> the last head so we can redo poll when another one
> is added.

        I tried requiring at least 64K here always, but as
I said before, the cost of doing the translate_desc and
then throwing those away for small packets killed performance.
I also considered using a highwater mark on the ring, but
potentially differing buffer sizes makes that less useful,
and a small ring may only have enough space for 1 max-sized
packet. 
        Maybe we should just remove NOTIFY_ON_EMPTY if we
fail a packet allocation; I can try that out.
        Another alternative I considered was splitting out
the translate_desc stuff in the hopes that we could find
out how many buffers we need in a less-expensive way; then
we could go for max-sized and free the excess without too
much overhead, maybe.
        Anyway, I'll look harder at working around this.

> >         struct iovec indirect[VHOST_NET_MAX_SG];
> > -       struct iovec iov[VHOST_NET_MAX_SG];
> > -       struct iovec hdr[VHOST_NET_MAX_SG];
> > -       size_t hdr_size;
> > +       struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr 
*/
> 
> VHOST_NET_MAX_SG should already include iovec vnet hdr.

        That's actually an artifact from the previous patch. I
was arranging the iovec to have just the header needed by the
guest and this simplified the bounds checking. It can be
removed if we leave header size management to the socket side,
but if vhost is going to be doing vnet_hdr manipulation for
raw sockets, then it'll depend on what that new code needs
to do to manage the header, I suppose.

                                                +-DLS


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] Add Mergeable RX buffer feature to vhost_net
  2010-03-31 22:04           ` David Stevens
@ 2010-04-01 10:54             ` Michael S. Tsirkin
  2010-04-01 18:22               ` David Stevens
  0 siblings, 1 reply; 10+ messages in thread
From: Michael S. Tsirkin @ 2010-04-01 10:54 UTC (permalink / raw)
  To: David Stevens; +Cc: netdev, kvm-owner, kvm, virtualization

On Wed, Mar 31, 2010 at 03:04:43PM -0700, David Stevens wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 03/31/2010 05:02:28 AM:
> > 
> > attached patch seems to be whitespace damaged as well.
> > Does the origin pass checkpatch.pl for you?
> 
>         Yes, but I probably broke it in the transfer -- will be more
> careful with the next revision.
> 
> 
> 
> > > +               head.iov_base = (void *)vhost_get_vq_desc(&net->dev, 
> vq,
> > > +                       vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, 
> 
> > > NULL);
> > 
> > I find this casting confusing.
> > Is it really expensive to add an array of heads so that
> > we do not need to cast?
> 
>         It needs the heads and the lengths, which looks a lot
> like an iovec. I was trying to resist adding a new
> struct XXX { unsigned head; unsigned len; } just for this,
> but I could make these parallel arrays, one with head index and
> the other with length.
> 
> > > +               if (vq->rxmaxheadcount < headcount)
> > > +                       vq->rxmaxheadcount = headcount;
> > 
> > This seems the only place where we set the rxmaxheadcount
> > value. Maybe it can be moved out of vhost.c to net.c?
> > If vhost library needs this it can get it as function
> > parameter.
> 
>         I can move it to vhost_get_heads(), sure.
> > 
> > > +               /* Skip header. TODO: support TSO. */
> > 
> > You seem to have removed the code that skips the header.
> > Won't this break non-GSO backends such as raw?
> 
>         From our prior discussion, what I had in mind was that
> we'd remove all special-header processing by using the ioctl
> you added for TUN and later, an equivalent ioctl for raw (when
> supported in qemu-kvm). Qemu would arrange headers with tap
> (and later raw) to get what the guest expects, and vhost then
> just passes all data as-is between the socket and the ring.
>         That not only removes the different-header-len code
> for mergeable buffers, but also eliminates making a copy of the
> header for every packet as before.

Yes but please note that in practice if this is what we do,
then vhost header size is 0 and then there is no actual copy.

> Vhost has no need to do
> anything with the header, or even know its length. It also
> doesn't have to care what the type of the backend is-- raw or
> tap.
>         I think that simplifies everything, but it does mean that
> raw socket support requires a header ioctl for the different
> vnethdr sizes if the guest wants a vnethdr with and without
> mergeable buffers. Actually, I thought this was your idea and
> the point of the TUNSETVNETHDRSZ ioctl, but I guess you had
> something else in mind?

Yes. However, we also have an ioctl that sets vhost header size, and it
allows supporting simple backends which do not support an (equivalent
of) TUNSETVNETHDRSZ. We have two of these now: packet and macvtap.
So I thought that unless there are gains in code simplicity and/or
performance we can keep supporting these just as well.

> > > -               /* TODO: Check specific error and bomb out unless 
> EAGAIN? 
> > > */
> > 
> > Do you think it's not a good idea?
> 
>         EAGAIN is not possible after the change, because we don't
> even enter the loop unless we have an skb on the read queue; the
> other cases bomb out, so I figured the comment for future work is
> now done. :-)

Guest could be buggy so we'll get EFAULT.
If skb is taken off the rx queue (as below), we might get EAGAIN.

> > 
> > >                 if (err < 0) {
> > > -                       vhost_discard_vq_desc(vq);
> > > +                       vhost_discard_vq_desc(vq, headcount);
> > >                         break;
> > >                 }
> > 
> > I think we should detect and discard truncated messages,
> > since len might not be reliable if userspace pulls
> > a packet from under us. Also, if new packet is
> > shorter than the old one, there's no truncation but
> > headcount is wrong.
> > 
> > So simplest fix IMO would be to compare err with expected len.
> > If there's a difference, we hit the race and so
> > we would discard the packet.
> 
> Sure.
> 
> > 
> > 
> > >                 /* TODO: Should check and handle checksum. */
> > > +               if (vhost_has_feature(&net->dev, 
> VIRTIO_NET_F_MRG_RXBUF)) 
> > > {
> > > +                       struct virtio_net_hdr_mrg_rxbuf *vhdr =
> > > +                               (struct virtio_net_hdr_mrg_rxbuf *)
> > > +                               vq->iov[0].iov_base;
> > > +                       /* add num_bufs */
> > > +                       if (put_user(headcount, &vhdr->num_buffers)) {
> > > +                               vq_err(vq, "Failed to write 
> num_buffers");
> > > +                               vhost_discard_vq_desc(vq, headcount);
> > 
> > Let's do memcpy_to_iovecend etc so that we do not assume layout.
> > This is also why we need move_iovec: sendmsg might modify the iovec.
> > It would also be nice not to corrupt memory and
> > get a reasonable error if buffer size
> > that we get is smaller than expected header size.
> 
>         I wanted to avoid the header copy here, with the reasonable 
> restriction
> that the guest gives you a buffer at least big enough for the vnet_hdr.
> A
> length check alone (on iov[0].iov_len) could enforce that without copying
> headers back and forth to support silly cases like 8-byte buffers or
> num_buffers spanning multiple iovecs, and I think paying the price of the
> copy on every packet to support guests that broken isn't worth it.

In practice, when the first iovec includes the header what
we will copy is iov[0].iov_len + iov[0].iov_base.

We also still have the problem that sendmsg might modify the iovec,
(for tap it doesn't) so using iov_len after sendmsg isn't legal,
you need to copy it anyway.

>         So, my preference here would be to add:
> 
>         if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
>                 struct virtio_net_mrg_rxbuf *vhdr...
> 
>                 if (vq->iov[0].iov_len < sizeof(*vhdr)) {
>                         vq_err(vq, "tiny buffers (len %d < %d) are not 
> supported",
>                                 vq->iov[0].iov_len, sizeof(*vhdr));
>                         break;
>                 }
>                 /* add num_bufs */
>                 ...
> 
> Does that work for you?

Seems more code, doesn't it? Do you really believe this is a performance gain?
It looks a bit like premature optimization ...

> > > -               if (err) {
> > > -                       vq_err(vq, "Unable to write vnet_hdr at addr 
> %p: 
> > > %d\n",
> > > -                              vq->iov->iov_base, err);
> > > -                       break;
> > > -               }
> > > -               len += hdr_size;
> > 
> > This seems to break non-GSO backends as well.
> 
> I don't have any to test with w/ current qemu-kvm,

I can dig up the demo, but there's also the macvtap backend. Try using that.

> but again, I thought
> your plan was to remove special header processing from vhost and let
> the socket end do it via ioctl. Wouldn't a similar ioctl for raw
> sockets when supported by qemu do that?

I just don't want to require that backends support GSO/vnet header,
so vhost needs an ability to skip the header.

> > >         }
> > >         n->dev.acked_features = features;
> > >         smp_wmb();
> > > -       for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
> > > -               mutex_lock(&n->vqs[i].mutex);
> > > -               n->vqs[i].hdr_size = hdr_size;
> > > -               mutex_unlock(&n->vqs[i].mutex);
> > 
> > I expect the above is a leftover from the previous version
> > which calculated header size in kernel?
> 
>         Right. With the ioctl, and assuming raw gets a similar
> one, we don't need to know anything about the header size in
> vhost, except that qemu needs to make sure mergeable rxbufs is
> only set when the socket has the larger vnet_hdr that includes
> the num_bufs field.
> 

Note that vhost doesn't really *know* about header, per se,
it just skips the first X bytes in iovec.

> > > @@ -410,6 +410,7 @@ static long vhost_set_vring(struct vhost
> > >                 vq->last_avail_idx = s.num;
> > >                 /* Forget the cached index value. */
> > >                 vq->avail_idx = vq->last_avail_idx;
> > > +               vq->rxmaxheadcount = 0;
> > >                 break;
> > >         case VHOST_GET_VRING_BASE:
> > >                 s.index = idx;
> > > @@ -856,6 +857,48 @@ static unsigned get_indirect(struct vhos
> > >         return 0;
> > >  }
> > > 
> > > +/* This is a multi-head version of vhost_get_vq_desc
> > 
> > How about:
> > version of vhost_get_vq_desc that returns multiple
> > descriptors
> 
> OK.
> 
> > 
> > > + * @vq         - the relevant virtqueue
> > > + * datalen     - data length we'll be reading
> > > + * @iovcount   - returned count of io vectors we fill
> > > + * @log                - vhost log
> > > + * @log_num    - log offset
> > 
> > return value?
> > Also - why unsigned?
> 
>         Returned value is the number of heads, which is
> 0 or greater.

The advantage of using int is that you'll be able to return
an error code. Good for debugging.

> > 
> > > + */
> > > +unsigned vhost_get_heads(struct vhost_virtqueue *vq, int datalen, int 
> 
> > 
> > Would vhost_get_vq_desc_multiple be a better name?
> 
>         26 chars for the function name with indentation and
> argument list might not be pretty on the calls. What about
> changing vhost_get_desc() and vhost_get_desc_n() [make them
> both shorter]? ("_n" indicating >= 1 instead of just one).

OK.

> > > +       struct vhost_log *log, unsigned int *log_num)
> > > +{
> > > +       struct iovec *heads = vq->heads;
> > 
> > I think it's better to pass in heads array than take it from vq->heads.
> 
>         I'd actually prefer to go the other way, and remove the
> log stuff, for example, since they are accessible from vq which
> we have already.

The main reason is that long term, we might want to
detach the arrays from vq - they are there simply to
avoid sticking too much data on stack - and the
fixed size becomes limiting for vhost block.
The advantage of passing them would be that
vhost won't have to change.

> Adding heads make it look less similar to the
> arg list for vhost_get_vq_desc(), but I'm more interested in
> getting the feature than particular arg lists, so your call.

Well, vhost_get_desc_n is just like vhost_get_desc but returns
multiple heads. In C you can return a single int by value
but many ints must be returned in an array, so I don't
think it is that dissimilar.

> > 
> > > +       int out, in = 0;
> > 
> > Why is in initialized here?
> 
> Needed in the previous version, but not here.
> 
> > 
> > > +       int seg = 0;            /* iov index */
> > > +       int hc = 0;             /* head count */
> > > +
> > > +       while (datalen > 0) {
> > 
> > Can't this simply call vhost_get_vq_desc in a loop somehow,
> > or use a common function that both this and vhost_get_vq_desc call?
> 
> It is calling vhost_get_vq_desc() in this loop; that's how it
> gets the individual heads and iov entries. The rest of it is just
> massaging the offsets so vhost_get_vq_desc puts them all in the
> right places and tracks the heads and lengths correctly for add_used.

Sorry, will have to re-read.

> > 
> > > +               if (hc >= VHOST_NET_MAX_SG) {
> > > +                       vhost_discard_vq_desc(vq, hc);
> > > +                       return 0;
> > > +               }
> > > +               heads[hc].iov_base = (void 
> *)vhost_get_vq_desc(vq->dev, 
> > > vq,
> > > +                       vq->iov+seg, ARRAY_SIZE(vq->iov)-seg, &out, 
> &in, 
> > > log,
> > > +                       log_num);
> > > +               if (heads[hc].iov_base == (void *)vq->num) {
> > > +                       vhost_discard_vq_desc(vq, hc);
> > > +                       return 0;
> > > +               }
> > > +               if (out || in <= 0) {
> > > +                       vq_err(vq, "unexpected descriptor format for 
> RX: "
> > > +                               "out %d, in %d\n", out, in);
> > > +                       vhost_discard_vq_desc(vq, hc);
> > 
> > goto err above might help simplify cleanup.
> 
> I generally would rather see the error cleanup explicitly where the
> error detection and message are, for code readability, but I can change
> it to a goto here if you think that's clearer.

This is the idiom I use all over this code.

> > > -int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int 
> 
> > > len)
> > > +int vhost_add_used(struct vhost_virtqueue *vq, struct iovec *heads, 
> int 
> > > count)
> > 
> > count is always 0 for send, right?
> > I think it is better to have two APIs here as well:
> > vhost_add_used
> > vhost_add_used_multiple
> > we can use functions to avoid code duplication.
> > 
> > >  {
> > > -       struct vring_used_elem *used;
> > > +       struct vring_used_elem *used = 0;
> > 
> > why is used initialized here?
> 
>         This is to remove a compiler warning that "used" may
> be used uninitialized, for a case that can't happen (count <= 0).
> 

I think there's some macro in kernel to shut the compiler up.
Or just ignore the warning. Let's not add extra initializers
just to shut up compiler, I think this might hide real bugs.

> > > +               vq->last_used_idx++;
> > 
> > I think we should update last_used_idx on success only,
> > at the end. Simply use last_used_idx + count
> > instead of last_used_idx + 1.
> 
>         OK.
> 
> > > @@ -1023,22 +1071,35 @@ int vhost_add_used(struct vhost_virtqueu
> > >                 if (vq->log_ctx)
> > >                         eventfd_signal(vq->log_ctx, 1);
> > >         }
> > > -       vq->last_used_idx++;
> > >         return 0;
> > >  }
> > > 
> > > +int vhost_available(struct vhost_virtqueue *vq)
> > 
> > since this function is non-static, please document what it does.
> 
>         OK.
> 
> > > +{
> > > +       int avail;
> > > +
> > > +       if (!vq->rxmaxheadcount)        /* haven't got any yet */
> > > +               return 1;
> > 
> > This seems to make net-specific assumptions.
> > How about moving this check out to net.c?
> 
>         I can, but its only user is vhost_signal() in this file.

So why isn't it static?

> > > +       avail = vq->avail_idx - vq->last_avail_idx;
> > > +       if (avail < 0)
> > > +               avail += 0x10000; /* wrapped */
> > 
> > A branch that is likely non-taken. Also, rxmaxheadcount
> > is really unlikely to get as large as half the ring.
> > So this just wastes cycles?
> 
>         The indexes are free-running counters, so if
> avail_idx has overflowed but last_avail_idx has not
> then the result will be (incorrectly) negative. It
> has nothing to do with how many buffers are in use--
> consider avail_idx = 2 and last_avail_idx = 65534,
> then avail = will be -65532, but we want it to be 4.

Right. Sorry about the noise.

> > 
> > > +       return avail;
> > > +}
> > > +
> > >  /* This actually signals the guest, using eventfd. */
> > > -void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> > > +void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq, 
> bool 
> > > recvr)
> > 
> > This one is not elegant. receive is net.c concept, let's not
> > put it in vhost.c. Pass in headcount if you must.
> 
>         Mainly what I was after here is to not affect signal's
> for the transmit side. I thought about splitting it into a
> separate function entirely for the transmit, but came up with
> this, instead. I can't tell if it's a receiver from the vq,
> but I don't want to override NOTIFY_ON_EMPTY or the headcount
> check (which has nothing to do with transmitters) for the
> transmitter at all.
>         The vq doesn't tell you if it is a receiver or not,
> so I either need a flag as argument to tell, or I need to
> split into a transmit vhost_signal and a receive vhost_signal.

You see, receive is really a net concept. virtio has in and out,
and a single descriptor can do both.

>         I can do the available check before calling vhost_signal,
> but then I need to remove the NOTIFY check entirely so it'll
> still notify for the receiver case when available is not
> enough but still not 0.

My question is, guests can not rely on the 'available is not
enough but still not 0' to work because it relies on a heuristic
to figure out how much is 'not enough'. So why do we need it at all.

>         So, I think the options are:
> 1) a flag
> 2) separate transmit or receive signal functions
> 3) move NOTIFY_ON_EMPTY out of vhost_signal and
>         check that or available (for recvr) before
>         calling vhost_signal().

There are ordering rules that make 3 impossible.
Let's try to rethink the concept of redefining NOTIFY_ON_EMPTY.
Is it really necessary?

> > 
> > >  {
> > >         __u16 flags = 0;
> > > +
> > >         if (get_user(flags, &vq->avail->flags)) {
> > >                 vq_err(vq, "Failed to get flags");
> > >                 return;
> > >         }
> > > 
> > > -       /* If they don't want an interrupt, don't signal, unless 
> empty. */
> > > +       /* If they don't want an interrupt, don't signal, unless
> > > +        * empty or receiver can't get a max-sized packet. */
> > >         if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
> > > -           (vq->avail_idx != vq->last_avail_idx ||
> > > +           (!recvr || vhost_available(vq) >= vq->rxmaxheadcount ||
> > 
> > Is the above really worth the complexity?
> > Guests can't rely on this kind of fuzzy logic, can they?
> > Do you see this helping performance at all?
> 
>         It definitely hurts performance to allocate max-sized buffers
> and then free up the excess (the translate_desc is expensive).

Well, we do not need to rerun translate_desc, we can keep
the results around, however ...

> It is
> a heuristic, but it's one that keeps the notifies flowing without
> having to fail on a packet to reenable them.

I don't understand completely. We looked into the skb, so
we know the required length?

>         The notify code has to change because there is no recovery
> if we need 5 buffers and have only 3, while notification is
> disabled unless we go all the way to 0. We can't honor NOTIFY_ON_EMPTY
> in that case because it will never proceed. We can remove the
> NOTIFY_ON_EMPTY check in vhost_signal and do it in the caller; then allow
> receiver to fail a buffer allocation and signal unconditionally,
> but we cannot wait for it to be 0 even if NOTIFY_ON_EMPTY is set--
> that's a deadlock.
> 

Which guest kernel or spec are you looking at? As far as I know
F_NOTIFY_ON_EMPTY was only used for TX at some point.
I am not aware of this bit having the meaning you assign it.
I suspect that what you outline above simply means guest
can not combine F_NOTIFY_ON_EMPTY with mergeable buffers for
virtio net RX VQ.

> > >  /* OK, now we need to know about added descriptors. */
> > > -bool vhost_enable_notify(struct vhost_virtqueue *vq)
> > > +bool vhost_enable_notify(struct vhost_virtqueue *vq, bool recvr)
> > >  {
> > >         u16 avail_idx;
> > >         int r;
> > > @@ -1080,6 +1141,8 @@ bool vhost_enable_notify(struct vhost_vi
> > >                 return false;
> > >         }
> > > 
> > > +       if (recvr && vq->rxmaxheadcount)
> > > +               return (avail_idx - vq->last_avail_idx) >= 
> > > vq->rxmaxheadcount;
> > 
> > The fuzzy logic behind rxmaxheadcount might be a good
> > optimization, but I am not comfortable using it
> > for correctness. Maybe vhost_enable_notify should get
> > the last head so we can redo poll when another one
> > is added.
> 
>         I tried requiring at least 64K here always, but as
> I said before, the cost of doing the translate_desc and
> then throwing those away for small packets killed performance.
> I also considered using a highwater mark on the ring, but
> potentially differing buffer sizes makes that less useful,
> and a small ring may only have enough space for 1 max-sized
> packet. 
>         Maybe we should just remove NOTIFY_ON_EMPTY if we
> fail a packet allocation; I can try that out.
>         Another alternative I considered was splitting out
> the translate_desc stuff in the hopes that we could find
> out how many buffers we need in a less-expensive way; then
> we could go for max-sized and free the excess without too
> much overhead, maybe.
>         Anyway, I'll look harder at working around this.

I am kind of confused. We did a peek and have the skb length,
why do we need the maxheads?

Hmm, the iovec consumed is not just skb length, it is skb length + vnet
header. Is this it? Maybe we can just export this from socket somehow?

> > >         struct iovec indirect[VHOST_NET_MAX_SG];
> > > -       struct iovec iov[VHOST_NET_MAX_SG];
> > > -       struct iovec hdr[VHOST_NET_MAX_SG];
> > > -       size_t hdr_size;
> > > +       struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr 
> */
> > 
> > VHOST_NET_MAX_SG should already include iovec vnet hdr.
> 
>         That's actually an artifact from the previous patch. I
> was arranging the iovec to have just the header needed by the
> guest and this simplified the bounds checking. It can be
> removed if we leave header size management to the socket side,
> but if vhost is going to be doing vnet_hdr manipulation for
> raw sockets, then it'll depend on what that new code needs
> to do to manage the header, I suppose.
> 
>                                                 +-DLS

Note it doesn't manipulate the header, just skips it.

-- 
MST

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] Add Mergeable RX buffer feature to vhost_net
  2010-04-01 10:54             ` Michael S. Tsirkin
@ 2010-04-01 18:22               ` David Stevens
  2010-04-04  8:55                 ` Michael S. Tsirkin
  0 siblings, 1 reply; 10+ messages in thread
From: David Stevens @ 2010-04-01 18:22 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm, kvm-owner, netdev, rusty, virtualization

kvm-owner@vger.kernel.org wrote on 04/01/2010 03:54:15 AM:

> On Wed, Mar 31, 2010 at 03:04:43PM -0700, David Stevens wrote:

> > 
> > > > +               head.iov_base = (void 
*)vhost_get_vq_desc(&net->dev, 
> > vq,
> > > > +                       vq->iov, ARRAY_SIZE(vq->iov), &out, &in, 
NULL, 
> > 
> > > > NULL);
> > > 
> > > I find this casting confusing.
> > > Is it really expensive to add an array of heads so that
> > > we do not need to cast?
> > 
> >         It needs the heads and the lengths, which looks a lot
> > like an iovec. I was trying to resist adding a new
> > struct XXX { unsigned head; unsigned len; } just for this,
> > but I could make these parallel arrays, one with head index and
> > the other with length.

        Michael, on this one, if I add vq->heads as an argument to
vhost_get_heads (aka vhost_get_desc_n), I'd need the length too.
Would you rather this 1) remain an iovec (and a single arg added) but
cast still there, 2) 2 arrays (head and length) and 2 args added, or
3) a new struct type of {unsigned,int} to carry for the heads+len
instead of iovec?
        My preference would be 1). I agree the casts are ugly, but
it is essentially an iovec the way we use it; it's just that the
base isn't a pointer but a descriptor index instead.

> > 
> >         EAGAIN is not possible after the change, because we don't
> > even enter the loop unless we have an skb on the read queue; the
> > other cases bomb out, so I figured the comment for future work is
> > now done. :-)
> 
> Guest could be buggy so we'll get EFAULT.
> If skb is taken off the rx queue (as below), we might get EAGAIN.

        We break on any error. If we get EAGAIN because someone read
on the socket, this code would break the loop, but EAGAIN is a more
serious problem if it changed since we peeked (because it means
someone else is reading the socket).
        But I don't understand -- are you suggesting that the error
handling be different than that, or that the comment is still
relevant? My intention here is to do the "TODO" from the comment
so that it can be removed, by handling all error cases. I think
because of the peek, EAGAIN isn't something to be ignored anymore,
but the effect is the same whether we break out of the loop or
not, since we retry the packet next time around. Essentially, we
ignore every error since we will redo it with the same packet the
next time around. Maybe we should print something here, but since
we'll be retrying the packet that's still on the socket, a permanent
error would spew continuously. Maybe we should shut down entirely
if we get any negative return value here (including EAGAIN, since
that tells us someone messed with the socket when we don't want them
to).
        If you want the comment still there, ok, but I do think EAGAIN
isn't a special case per the comment anymore, and is handled as all
other errors are: by exiting the loop and retrying next time.

                                                                +-DLS


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] Add Mergeable RX buffer feature to vhost_net
  2010-04-01 18:22               ` David Stevens
@ 2010-04-04  8:55                 ` Michael S. Tsirkin
  0 siblings, 0 replies; 10+ messages in thread
From: Michael S. Tsirkin @ 2010-04-04  8:55 UTC (permalink / raw)
  To: David Stevens; +Cc: kvm, kvm-owner, netdev, rusty, virtualization

On Thu, Apr 01, 2010 at 11:22:37AM -0700, David Stevens wrote:
> kvm-owner@vger.kernel.org wrote on 04/01/2010 03:54:15 AM:
> 
> > On Wed, Mar 31, 2010 at 03:04:43PM -0700, David Stevens wrote:
> 
> > > 
> > > > > +               head.iov_base = (void 
> *)vhost_get_vq_desc(&net->dev, 
> > > vq,
> > > > > +                       vq->iov, ARRAY_SIZE(vq->iov), &out, &in, 
> NULL, 
> > > 
> > > > > NULL);
> > > > 
> > > > I find this casting confusing.
> > > > Is it really expensive to add an array of heads so that
> > > > we do not need to cast?
> > > 
> > >         It needs the heads and the lengths, which looks a lot
> > > like an iovec. I was trying to resist adding a new
> > > struct XXX { unsigned head; unsigned len; } just for this,
> > > but I could make these parallel arrays, one with head index and
> > > the other with length.
> 
>         Michael, on this one, if I add vq->heads as an argument to
> vhost_get_heads (aka vhost_get_desc_n), I'd need the length too.
> Would you rather this 1) remain an iovec (and a single arg added) but
> cast still there, 2) 2 arrays (head and length) and 2 args added, or
> 3) a new struct type of {unsigned,int} to carry for the heads+len
> instead of iovec?
>         My preference would be 1). I agree the casts are ugly, but
> it is essentially an iovec the way we use it; it's just that the
> base isn't a pointer but a descriptor index instead.

I prefer 2 or 3. If you prefer 1 strongly, I think we should
add a detailed comment near the iovec, and
a couple of inline wrappers to store/get data in the iovec.

> > > 
> > >         EAGAIN is not possible after the change, because we don't
> > > even enter the loop unless we have an skb on the read queue; the
> > > other cases bomb out, so I figured the comment for future work is
> > > now done. :-)
> > 
> > Guest could be buggy so we'll get EFAULT.
> > If skb is taken off the rx queue (as below), we might get EAGAIN.
> 
>         We break on any error. If we get EAGAIN because someone read
> on the socket, this code would break the loop, but EAGAIN is a more
> serious problem if it changed since we peeked (because it means
> someone else is reading the socket).
>         But I don't understand -- are you suggesting that the error
> handling be different than that, or that the comment is still
> relevant?
> My intention here is to do the "TODO" from the comment
> so that it can be removed, by handling all error cases. I think
> because of the peek, EAGAIN isn't something to be ignored anymore,
> but the effect is the same whether we break out of the loop or
> not, since we retry the packet next time around. Essentially, we
> ignore every error since we will redo it with the same packet the
> next time around. Maybe we should print something here, but since
> we'll be retrying the packet that's still on the socket, a permanent
> error would spew continuously. Maybe we should shut down entirely
> if we get any negative return value here (including EAGAIN, since
> that tells us someone messed with the socket when we don't want them
> to).
>         If you want the comment still there, ok, but I do think EAGAIN
> isn't a special case per the comment anymore, and is handled as all
> other errors are: by exiting the loop and retrying next time.
> 
>                                                                 +-DLS

Yes, I just think some comment should stay, as you say, because
otherwise we simply retry continuously. Maybe we should trigger vq_err.
It needs to be given some thought which I have not given it yet.

Thinking aloud, EAGAIN means someone reads the socket
together with us, I prefer that this condition is made a fatal
error, we should make sure we are polling the socket
so we see packets if more appear.

-- 
MST

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2010-04-04  8:59 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-03-03  0:20 [RFC][ PATCH 3/3] vhost-net: Add mergeable RX buffer support to vhost-net David Stevens
2010-03-07 16:26 ` Michael S. Tsirkin
2010-03-08  2:06   ` David Stevens
2010-03-08  8:07     ` Michael S. Tsirkin
2010-03-31  1:23       ` [PATCH v2] Add Mergeable RX buffer feature to vhost_net David Stevens
2010-03-31 12:02         ` Michael S. Tsirkin
2010-03-31 22:04           ` David Stevens
2010-04-01 10:54             ` Michael S. Tsirkin
2010-04-01 18:22               ` David Stevens
2010-04-04  8:55                 ` Michael S. Tsirkin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).