Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH V3 14/15] vhost: event suppression for packed ring
From: Jason Wang @ 2019-07-17 10:52 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin
In-Reply-To: <20190717105255.63488-1-jasowang@redhat.com>

This patch introduces support for event suppression. This is done by
having two areas: a device area and a driver area. One side can then
try to disable or enable (delayed) notifications from the other side by
using a boolean hint or the event index interface in these areas.

For more information, please refer to the Virtio spec.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 265 +++++++++++++++++++++++++++++++++++++++---
 drivers/vhost/vhost.h |  11 +-
 2 files changed, 255 insertions(+), 21 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a7d24b9d5204..a188e9af3b35 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1524,6 +1524,76 @@ static inline int vhost_put_desc_flags(struct vhost_virtqueue *vq,
 	return vhost_put_user(vq, *flags, &desc->flags, VHOST_ADDR_DESC);
 }
 
+static int vhost_get_driver_off_wrap(struct vhost_virtqueue *vq,
+				     __virtio16 *off_wrap)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vring_packed_desc_event *event =
+	       vhost_get_meta_ptr(vq, VHOST_ADDR_AVAIL);
+	if (likely(event)) {
+		*off_wrap = event->off_wrap;
+		vhost_put_meta_ptr();
+		return 0;
+	}
+#endif
+	return vhost_get_user(vq, *off_wrap,
+			      &vq->driver_event->off_wrap,
+			      VHOST_ADDR_AVAIL);
+}
+
+static int vhost_get_driver_flags(struct vhost_virtqueue *vq,
+				  __virtio16 *driver_flags)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vring_packed_desc_event *event =
+	       vhost_get_meta_ptr(vq, VHOST_ADDR_AVAIL);
+
+	if (likely(event)) {
+		*driver_flags = event->flags;
+		vhost_put_meta_ptr();
+		return 0;
+	}
+#endif
+	return vhost_get_user(vq, *driver_flags, &vq->driver_event->flags,
+			      VHOST_ADDR_AVAIL);
+}
+
+static int vhost_put_device_off_wrap(struct vhost_virtqueue *vq,
+				     __virtio16 *off_wrap)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vring_packed_desc_event *event =
+	       vhost_get_meta_ptr(vq, VHOST_ADDR_USED);
+
+	if (likely(event)) {
+		event->off_wrap = *off_wrap;
+		vhost_put_meta_ptr();
+		return 0;
+	}
+#endif
+	return vhost_put_user(vq, *off_wrap,
+			      &vq->device_event->off_wrap,
+			      VHOST_ADDR_USED);
+}
+
+static int vhost_put_device_flags(struct vhost_virtqueue *vq,
+				  __virtio16 *device_flags)
+{
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vring_packed_desc_event *event =
+	       vhost_get_meta_ptr(vq, VHOST_ADDR_USED);
+
+	if (likely(event)) {
+		event->flags = *device_flags;
+		vhost_put_meta_ptr();
+		return 0;
+	}
+#endif
+	return vhost_put_user(vq, *device_flags,
+			      &vq->device_event->flags,
+			      VHOST_ADDR_USED);
+}
+
 static int vhost_new_umem_range(struct vhost_umem *umem,
 				u64 start, u64 size, u64 end,
 				u64 userspace_addr, int perm)
@@ -1809,10 +1879,15 @@ static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
 			       struct vring_used __user *used)
 {
 	struct vring_packed_desc *packed = (struct vring_packed_desc *)desc;
+	struct vring_packed_desc_event *driver_event =
+		(struct vring_packed_desc_event *)avail;
+	struct vring_packed_desc_event *device_event =
+		(struct vring_packed_desc_event *)used;
 
-	/* TODO: check device area and driver area */
 	return access_ok(packed, num * sizeof(*packed)) &&
-	       access_ok(packed, num * sizeof(*packed));
+	       access_ok(packed, num * sizeof(*packed)) &&
+	       access_ok(driver_event, sizeof(*driver_event)) &&
+	       access_ok(device_event, sizeof(*device_event));
 }
 
 static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
@@ -1904,16 +1979,25 @@ static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
 }
 #endif
 
-int vq_meta_prefetch(struct vhost_virtqueue *vq)
+static int vq_iotlb_prefetch_packed(struct vhost_virtqueue *vq)
 {
-	unsigned int num = vq->num;
+	int num = vq->num;
 
-	if (!vq->iotlb) {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-		vhost_vq_map_prefetch(vq);
-#endif
-		return 1;
-	}
+	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+			       num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+	       iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->desc,
+			       num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+	       iotlb_access_ok(vq, VHOST_ACCESS_RO,
+			       (u64)(uintptr_t)vq->driver_event,
+			       sizeof(*vq->driver_event), VHOST_ADDR_AVAIL) &&
+	       iotlb_access_ok(vq, VHOST_ACCESS_WO,
+			       (u64)(uintptr_t)vq->device_event,
+			       sizeof(*vq->device_event), VHOST_ADDR_USED);
+}
+
+static int vq_iotlb_prefetch_split(struct vhost_virtqueue *vq)
+{
+	unsigned int num = vq->num;
 
 	return iotlb_access_ok(vq, VHOST_ACCESS_RO,
 			       (u64)(uintptr_t)vq->desc,
@@ -1928,6 +2012,21 @@ int vq_meta_prefetch(struct vhost_virtqueue *vq)
 			       vhost_get_used_size(vq, num),
 			       VHOST_ADDR_USED);
 }
+
+int vq_meta_prefetch(struct vhost_virtqueue *vq)
+{
+	if (!vq->iotlb) {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+		vhost_vq_map_prefetch(vq);
+#endif
+		return 1;
+	}
+
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vq_iotlb_prefetch_packed(vq);
+	else
+		return vq_iotlb_prefetch_split(vq);
+}
 EXPORT_SYMBOL_GPL(vq_meta_prefetch);
 
 /* Can we log writes? */
@@ -2620,6 +2719,48 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 	return 0;
 }
 
+static int vhost_update_device_flags(struct vhost_virtqueue *vq,
+				     __virtio16 device_flags)
+{
+	void __user *flags;
+
+	if (vhost_put_device_flags(vq, &device_flags))
+		return -EFAULT;
+	if (unlikely(vq->log_used)) {
+		/* Make sure the flag is seen before log. */
+		smp_wmb();
+		/* Log used flag write. */
+		flags = &vq->device_event->flags;
+		log_write(vq->log_base, vq->log_addr +
+			  (flags - (void __user *)vq->device_event),
+			  sizeof(vq->device_event->flags));
+		if (vq->log_ctx)
+			eventfd_signal(vq->log_ctx, 1);
+	}
+	return 0;
+}
+
+static int vhost_update_device_off_wrap(struct vhost_virtqueue *vq,
+					__virtio16 device_off_wrap)
+{
+	void __user *off_wrap;
+
+	if (vhost_put_device_off_wrap(vq, &device_off_wrap))
+		return -EFAULT;
+	if (unlikely(vq->log_used)) {
+		/* Make sure the flag is seen before log. */
+		smp_wmb();
+		/* Log used flag write. */
+		off_wrap = &vq->device_event->off_wrap;
+		log_write(vq->log_base, vq->log_addr +
+			  (off_wrap - (void __user *)vq->device_event),
+			  sizeof(vq->device_event->off_wrap));
+		if (vq->log_ctx)
+			eventfd_signal(vq->log_ctx, 1);
+	}
+	return 0;
+}
+
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
 {
 	if (vhost_put_avail_event(vq))
@@ -3689,16 +3830,13 @@ int vhost_add_used(struct vhost_virtqueue *vq, struct vhost_used_elem *used,
 }
 EXPORT_SYMBOL_GPL(vhost_add_used);
 
-static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_notify_split(struct vhost_dev *dev,
+			       struct vhost_virtqueue *vq)
 {
 	__u16 old, new;
 	__virtio16 event;
 	bool v;
 
-	/* TODO: check driver area */
-	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
-		return true;
-
 	/* Flush out used index updates. This is paired
 	 * with the barrier that the Guest executes when enabling
 	 * interrupts. */
@@ -3731,6 +3869,62 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	return vring_need_event(vhost16_to_cpu(vq, event), new, old);
 }
 
+static bool vhost_notify_packed(struct vhost_dev *dev,
+				struct vhost_virtqueue *vq)
+{
+	__virtio16 event_off_wrap, event_flags;
+	__u16 old, new, off_wrap;
+	bool v;
+
+	/* Flush out used descriptors updates. This is paired
+	 * with the barrier that the Guest executes when enabling
+	 * interrupts.
+	 */
+	smp_mb();
+
+	if (vhost_get_driver_flags(vq, &event_flags)) {
+		vq_err(vq, "Failed to get driver desc_event_flags");
+		return true;
+	}
+
+	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX))
+		return event_flags !=
+		       cpu_to_vhost16(vq, VRING_PACKED_EVENT_FLAG_DISABLE);
+
+	old = vq->signalled_used;
+	v = vq->signalled_used_valid;
+	new = vq->signalled_used = vq->last_used_idx;
+	vq->signalled_used_valid = true;
+
+	if (event_flags != cpu_to_vhost16(vq, VRING_PACKED_EVENT_FLAG_DESC))
+		return event_flags !=
+		       cpu_to_vhost16(vq, VRING_PACKED_EVENT_FLAG_DISABLE);
+
+	/* Read desc event flags before event_off and event_wrap */
+	smp_rmb();
+
+	if (vhost_get_driver_off_wrap(vq, &event_off_wrap) < 0) {
+		vq_err(vq, "Failed to get driver desc_event_off/wrap");
+		return true;
+	}
+
+	off_wrap = vhost16_to_cpu(vq, event_off_wrap);
+
+	if (unlikely(!v))
+		return true;
+
+	return vhost_vring_packed_need_event(vq, vq->last_used_wrap_counter,
+					     off_wrap, new, old);
+}
+
+static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_notify_packed(dev, vq);
+	else
+		return vhost_notify_split(dev, vq);
+}
+
 /* This actually signals the guest, using eventfd. */
 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
@@ -3836,10 +4030,34 @@ EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
 static bool vhost_enable_notify_packed(struct vhost_virtqueue *vq)
 {
 	struct vring_packed_desc *d = vq->desc_packed + vq->avail_idx;
-	__virtio16 flags;
+	__virtio16 flags = cpu_to_vhost16(vq, VRING_PACKED_EVENT_FLAG_ENABLE);
 	int ret;
 
-	/* TODO: enable notification through device area */
+	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
+		return false;
+	vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
+
+	if (vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
+		__virtio16 off_wrap = cpu_to_vhost16(vq, vq->avail_idx |
+				      vq->avail_wrap_counter << 15);
+
+		ret = vhost_update_device_off_wrap(vq, off_wrap);
+		if (ret) {
+			vq_err(vq, "Failed to write to off warp at %p: %d\n",
+			       &vq->device_event->off_wrap, ret);
+			return false;
+		}
+		/* Make sure off_wrap is wrote before flags */
+		smp_wmb();
+		flags = cpu_to_vhost16(vq, VRING_PACKED_EVENT_FLAG_DESC);
+	}
+
+	ret = vhost_update_device_flags(vq, flags);
+	if (ret) {
+		vq_err(vq, "Failed to enable notification at %p: %d\n",
+			&vq->device_event->flags, ret);
+		return false;
+	}
 
 	/* They could have slipped one in as we were doing that: make
 	 * sure it's written, then check again.
@@ -3901,7 +4119,18 @@ EXPORT_SYMBOL_GPL(vhost_enable_notify);
 
 static void vhost_disable_notify_packed(struct vhost_virtqueue *vq)
 {
-	/* TODO: disable notification through device area */
+	__virtio16 flags;
+	int r;
+
+	if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
+		return;
+	vq->used_flags |= VRING_USED_F_NO_NOTIFY;
+
+	flags = cpu_to_vhost16(vq, VRING_PACKED_EVENT_FLAG_DISABLE);
+	r = vhost_update_device_flags(vq, flags);
+	if (r)
+		vq_err(vq, "Failed to enable notification at %p: %d\n",
+		       &vq->device_event->flags, r);
 }
 
 static void vhost_disable_notify_split(struct vhost_virtqueue *vq)
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 7f3a2dd1b628..bb3f8bb763b9 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -125,9 +125,14 @@ struct vhost_virtqueue {
 		struct vring_desc __user *desc;
 		struct vring_packed_desc __user *desc_packed;
 	};
-	struct vring_avail __user *avail;
-	struct vring_used __user *used;
-
+	union {
+		struct vring_avail __user *avail;
+		struct vring_packed_desc_event __user *driver_event;
+	};
+	union {
+		struct vring_used __user *used;
+		struct vring_packed_desc_event __user *device_event;
+	};
 #if VHOST_ARCH_CAN_ACCEL_UACCESS
 	/* Read by memory accessors, modified by meta data
 	 * prefetching, MMU notifier and vring ioctl().
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 12/15] vhost: vhost_put_user() can accept metadata type
From: Jason Wang @ 2019-07-17 10:52 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin
In-Reply-To: <20190717105255.63488-1-jasowang@redhat.com>

In the past we assumed that the used ring update was the only user of
vhost_put_user(). This may no longer be the case for the upcoming packed
ring, which may update the descriptor ring when marking buffers as used.
So introduce a new type parameter.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6044cdea124f..3fa1adf2cb90 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1178,7 +1178,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 	return __vhost_get_user_slow(vq, addr, size, type);
 }
 
-#define vhost_put_user(vq, x, ptr)		\
+#define vhost_put_user(vq, x, ptr, type)		\
 ({ \
 	int ret = -EFAULT; \
 	if (!vq->iotlb) { \
@@ -1186,7 +1186,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 	} else { \
 		__typeof__(ptr) to = \
 			(__typeof__(ptr)) __vhost_get_user(vq, ptr,	\
-					  sizeof(*ptr), VHOST_ADDR_USED); \
+					  sizeof(*ptr), type); \
 		if (to != NULL) \
 			ret = __put_user(x, to); \
 		else \
@@ -1230,7 +1230,7 @@ static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
 #endif
 
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
-			      vhost_avail_event(vq));
+			      vhost_avail_event(vq), VHOST_ADDR_USED);
 }
 
 static inline int vhost_put_used(struct vhost_virtqueue *vq,
@@ -1267,7 +1267,7 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 #endif
 
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
-			      &vq->used->flags);
+			      &vq->used->flags, VHOST_ADDR_USED);
 }
 
 static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
@@ -1284,7 +1284,7 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
 #endif
 
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
-			      &vq->used->idx);
+			      &vq->used->idx, VHOST_ADDR_USED);
 }
 
 #define vhost_get_user(vq, x, ptr, type)		\
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 10/15] vhost: hide used ring layout from device
From: Jason Wang @ 2019-07-17 10:52 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin
In-Reply-To: <20190717105255.63488-1-jasowang@redhat.com>

We used to return the descriptor head from vhost_get_vq_desc() to the
device and pass it back to vhost_add_used() and its friends. This
exposes the internal used ring layout to the device, which makes it hard
to extend for e.g. the packed ring layout.

So this patch tries to hide the used ring layout by

- letting vhost_get_vq_desc() return a pointer to struct vring_used_elem
- accepting a pointer to struct vring_used_elem in vhost_add_used() and
  vhost_add_used_and_signal()

This helps to hide the used ring layout and makes it easier to
implement the packed ring on top.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 88 ++++++++++++++++++++++---------------------
 drivers/vhost/scsi.c  | 62 ++++++++++++++++--------------
 drivers/vhost/vhost.c | 38 +++++++++++--------
 drivers/vhost/vhost.h | 11 +++---
 drivers/vhost/vsock.c | 43 +++++++++++----------
 5 files changed, 129 insertions(+), 113 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9e087d08b199..572d80c8c36e 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -546,25 +546,28 @@ static void vhost_net_busy_poll(struct vhost_net *net,
 }
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
-				    struct vhost_net_virtqueue *tnvq,
+				    struct vring_used_elem *used_elem,
 				    unsigned int *out_num, unsigned int *in_num,
 				    struct msghdr *msghdr, bool *busyloop_intr)
 {
+	struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
 	struct vhost_virtqueue *rvq = &rnvq->vq;
 	struct vhost_virtqueue *tvq = &tnvq->vq;
 
-	int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
+	int r = vhost_get_vq_desc(tvq, used_elem, tvq->iov,
+				  ARRAY_SIZE(tvq->iov),
 				  out_num, in_num, NULL, NULL);
 
-	if (r == tvq->num && tvq->busyloop_timeout) {
+	if (r == -ENOSPC && tvq->busyloop_timeout) {
 		/* Flush batched packets first */
 		if (!vhost_sock_zcopy(tvq->private_data))
 			vhost_tx_batch(net, tnvq, tvq->private_data, msghdr);
 
 		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
 
-		r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
+		r = vhost_get_vq_desc(tvq, used_elem, tvq->iov,
+				      ARRAY_SIZE(tvq->iov),
 				      out_num, in_num, NULL, NULL);
 	}
 
@@ -593,6 +596,7 @@ static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
 }
 
 static int get_tx_bufs(struct vhost_net *net,
+		       struct vring_used_elem *used_elem,
 		       struct vhost_net_virtqueue *nvq,
 		       struct msghdr *msg,
 		       unsigned int *out, unsigned int *in,
@@ -601,9 +605,10 @@ static int get_tx_bufs(struct vhost_net *net,
 	struct vhost_virtqueue *vq = &nvq->vq;
 	int ret;
 
-	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
+	ret = vhost_net_tx_get_vq_desc(net, used_elem, out, in,
+				       msg, busyloop_intr);
 
-	if (ret < 0 || ret == vq->num)
+	if (ret < 0 || ret == -ENOSPC)
 		return ret;
 
 	if (*in) {
@@ -747,8 +752,8 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 {
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
+	struct vring_used_elem used;
 	unsigned out, in;
-	int head;
 	struct msghdr msg = {
 		.msg_name = NULL,
 		.msg_namelen = 0,
@@ -767,13 +772,11 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 		if (vhost_get_shadow_used_count(vq) == VHOST_NET_BATCH)
 			vhost_tx_batch(net, nvq, sock, &msg);
 
-		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
-				   &busyloop_intr);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
-			break;
+		err = get_tx_bufs(net, &used,
+				  nvq, &msg, &out, &in, &len,
+				  &busyloop_intr);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		if (err == -ENOSPC) {
 			if (unlikely(busyloop_intr)) {
 				vhost_poll_queue(&vq->poll);
 			} else if (unlikely(vhost_enable_notify(vq))) {
@@ -782,7 +785,9 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 			}
 			break;
 		}
-
+		/* On error, stop handling until the next kick. */
+		if (unlikely(err < 0))
+			break;
 		total_len += len;
 
 		/* For simplicity, TX batching is only enabled if
@@ -823,7 +828,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 			pr_debug("Truncated TX packet: len %d != %zd\n",
 				 err, len);
 done:
-		vhost_add_shadow_used(vq, cpu_to_vhost32(vq, head), 0);
+		vhost_add_shadow_used(vq, &used, 0);
 	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
 
 	vhost_tx_batch(net, nvq, sock, &msg);
@@ -834,7 +839,6 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
 	unsigned out, in;
-	int head;
 	struct msghdr msg = {
 		.msg_name = NULL,
 		.msg_namelen = 0,
@@ -843,6 +847,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 		.msg_flags = MSG_DONTWAIT,
 	};
 	struct tun_msg_ctl ctl;
+	struct vring_used_elem used;
 	size_t len, total_len = 0;
 	int err;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
@@ -856,13 +861,10 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 		vhost_zerocopy_signal_used(net, vq);
 
 		busyloop_intr = false;
-		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
-				   &busyloop_intr);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
-			break;
+		err = get_tx_bufs(net, &used, nvq, &msg, &out, &in, &len,
+				  &busyloop_intr);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		if (err == -ENOSPC) {
 			if (unlikely(busyloop_intr)) {
 				vhost_poll_queue(&vq->poll);
 			} else if (unlikely(vhost_enable_notify(vq))) {
@@ -871,6 +873,9 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 			}
 			break;
 		}
+		/* On error, stop handling until the next kick. */
+		if (unlikely(err < 0))
+			break;
 
 		zcopy_used = len >= VHOST_GOODCOPY_LEN
 			     && !vhost_exceeds_maxpend(net)
@@ -895,7 +900,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 			ubufs = NULL;
 		}
 		vhost_set_zc_used(vq, nvq->upend_idx,
-				  cpu_to_vhost32(vq, head),
+				  &used,
 				  zcopy_used ? VHOST_DMA_IN_PROGRESS :
 				  VHOST_DMA_DONE_LEN);
 		nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
@@ -921,7 +926,6 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-
 		vhost_zerocopy_signal_used(net, vq);
 		vhost_net_tx_packet(net);
 	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
@@ -1012,34 +1016,30 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 		       unsigned *iovcount,
 		       struct vhost_log *log,
 		       unsigned *log_num,
-		       unsigned int quota)
+		       unsigned int quota,
+		       s16 *count)
 {
 	unsigned int out, in;
 	int seg = 0;
 	int headcount = 0;
-	unsigned d;
-	int r, nlogs = 0;
+	int r = 0, nlogs = 0;
 	/* len is always initialized before use since we are always called with
 	 * datalen > 0.
 	 */
 	u32 uninitialized_var(len);
+	struct vring_used_elem uninitialized_var(used);
 
 	while (datalen > 0 && headcount < quota) {
 		if (unlikely(seg >= UIO_MAXIOV)) {
 			r = -ENOBUFS;
 			goto err;
 		}
-		r = vhost_get_vq_desc(vq, vq->iov + seg,
+		r = vhost_get_vq_desc(vq, &used, vq->iov + seg,
 				      ARRAY_SIZE(vq->iov) - seg, &out,
 				      &in, log, log_num);
 		if (unlikely(r < 0))
 			goto err;
 
-		d = r;
-		if (d == vq->num) {
-			r = 0;
-			goto err;
-		}
 		if (unlikely(out || in <= 0)) {
 			vq_err(vq, "unexpected descriptor format for RX: "
 				"out %d, in %d\n", out, in);
@@ -1052,7 +1052,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 		}
 		len = iov_length(vq->iov + seg, in);
 		datalen -= len;
-		vhost_add_shadow_used(vq, cpu_to_vhost32(vq, d),
+		vhost_add_shadow_used(vq, &used,
 				      cpu_to_vhost32(vq, datalen >= 0 ? len
 						     : len + datalen));
 		++headcount;
@@ -1064,12 +1064,15 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 
 	/* Detect overrun */
 	if (unlikely(datalen > 0)) {
-		r = UIO_MAXIOV + 1;
+		headcount = UIO_MAXIOV + 1;
 		goto err;
 	}
-	return headcount;
+
+	*count = headcount;
+	return 0;
 err:
 	vhost_discard_shadow_used(vq, headcount);
+	*count = 0;
 	return r;
 }
 
@@ -1128,13 +1131,11 @@ static void handle_rx(struct vhost_net *net)
 			break;
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
-		headcount = get_rx_bufs(vq, vhost_len, &in, vq_log, &log,
-					likely(mergeable) ? UIO_MAXIOV : 1);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(headcount < 0))
-			goto out;
+		err = get_rx_bufs(vq, vhost_len, &in, vq_log, &log,
+				  likely(mergeable) ? UIO_MAXIOV : 1,
+				  &headcount);
 		/* OK, now we need to know about added descriptors. */
-		if (!headcount) {
+		if (err == -ENOSPC) {
 			if (unlikely(busyloop_intr)) {
 				vhost_poll_queue(&vq->poll);
 			} else if (unlikely(vhost_enable_notify(vq))) {
@@ -1148,6 +1149,9 @@ static void handle_rx(struct vhost_net *net)
 			goto out;
 		}
 		busyloop_intr = false;
+		/* On error, stop handling until the next kick. */
+		if (unlikely(err < 0))
+			goto out;
 		if (nvq->rx_ring)
 			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
 		/* On overrun, truncate and discard */
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 8d4e87007a8d..4a5a75ab25ad 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -72,7 +72,7 @@ struct vhost_scsi_inflight {
 
 struct vhost_scsi_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
-	int tvc_vq_desc;
+	struct vring_used_elem tvc_vq_used;
 	/* virtio-scsi initiator task attribute */
 	int tvc_task_attr;
 	/* virtio-scsi response incoming iovecs */
@@ -213,7 +213,7 @@ struct vhost_scsi {
  * Context for processing request and control queue operations.
  */
 struct vhost_scsi_ctx {
-	int head;
+	struct vring_used_elem head;
 	unsigned int out, in;
 	size_t req_size, rsp_size;
 	size_t out_size, in_size;
@@ -449,8 +449,9 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
+	struct vring_used_elem used;
 	unsigned out, in;
-	int head, ret;
+	int ret;
 
 	if (!vq->private_data) {
 		vs->vs_events_missed = true;
@@ -459,16 +460,16 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 
 again:
 	vhost_disable_notify(vq);
-	head = vhost_get_vq_desc(vq, vq->iov,
+	ret = vhost_get_vq_desc(vq, &used, vq->iov,
 			ARRAY_SIZE(vq->iov), &out, &in,
 			NULL, NULL);
-	if (head < 0) {
+	if (ret == -ENOSPC) {
+		if (vhost_enable_notify(&vs->dev, vq))
+			goto again;
 		vs->vs_events_missed = true;
 		return;
 	}
-	if (head == vq->num) {
-		if (vhost_enable_notify(vq))
-			goto again;
+	if (ret < 0) {
 		vs->vs_events_missed = true;
 		return;
 	}
@@ -488,7 +489,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	eventp = vq->iov[out].iov_base;
 	ret = __copy_to_user(eventp, event, sizeof(*event));
 	if (!ret)
-		vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+		vhost_add_used_and_signal(&vs->dev, vq, &used, 0);
 	else
 		vq_err(vq, "Faulted on vhost_scsi_send_event\n");
 }
@@ -549,7 +550,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 		ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
 		if (likely(ret == sizeof(v_rsp))) {
 			struct vhost_scsi_virtqueue *q;
-			vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
+			vhost_add_used(cmd->tvc_vq, &cmd->tvc_vq_used, 0);
 			q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
 			vq = q - vs->vqs;
 			__set_bit(vq, signal);
@@ -793,7 +794,7 @@ static void vhost_scsi_submission_work(struct work_struct *work)
 static void
 vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 			   struct vhost_virtqueue *vq,
-			   int head, unsigned out)
+			   struct vhost_scsi_ctx *vc)
 {
 	struct virtio_scsi_cmd_resp __user *resp;
 	struct virtio_scsi_cmd_resp rsp;
@@ -801,10 +802,10 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 
 	memset(&rsp, 0, sizeof(rsp));
 	rsp.response = VIRTIO_SCSI_S_BAD_TARGET;
-	resp = vq->iov[out].iov_base;
+	resp = vq->iov[vc->out].iov_base;
 	ret = __copy_to_user(resp, &rsp, sizeof(rsp));
 	if (!ret)
-		vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+		vhost_add_used_and_signal(&vs->dev, vq, &vc->head, 0);
 	else
 		pr_err("Faulted on virtio_scsi_cmd_resp\n");
 }
@@ -813,21 +814,17 @@ static int
 vhost_scsi_get_desc(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
 		    struct vhost_scsi_ctx *vc)
 {
-	int ret = -ENXIO;
+	int ret;
 
-	vc->head = vhost_get_vq_desc(vq, vq->iov,
-				     ARRAY_SIZE(vq->iov), &vc->out, &vc->in,
-				     NULL, NULL);
+	ret = vhost_get_vq_desc(vq, &vc->head, vq->iov,
+				ARRAY_SIZE(vq->iov), &vc->out, &vc->in,
+				NULL, NULL);
 
 	pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
-		 vc->head, vc->out, vc->in);
-
-	/* On error, stop handling until the next kick. */
-	if (unlikely(vc->head < 0))
-		goto done;
+		 vc->head.id, vc->out, vc->in);
 
 	/* Nothing new?  Wait for eventfd to tell us they refilled. */
-	if (vc->head == vq->num) {
+	if (ret == -ENOSPC) {
 		if (unlikely(vhost_enable_notify(vq))) {
 			vhost_disable_notify(vq);
 			ret = -EAGAIN;
@@ -835,6 +832,10 @@ vhost_scsi_get_desc(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
 		goto done;
 	}
 
+	/* On error, stop handling until the next kick. */
+	if (unlikely(ret < 0))
+		goto done;
+
 	/*
 	 * Get the size of request and response buffers.
 	 * FIXME: Not correct for BIDI operation
@@ -1025,6 +1026,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 					vq_err(vq, "Received non zero pi_bytesin,"
 						" but wrong data_direction\n");
 					goto err;
+
 				}
 				prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesin);
 			}
@@ -1097,7 +1099,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 		 * complete the virtio-scsi request in TCM callback context via
 		 * vhost_scsi_queue_data_in() and vhost_scsi_queue_status()
 		 */
-		cmd->tvc_vq_desc = vc.head;
+		cmd->tvc_vq_used = vc.head;
 		/*
 		 * Dispatch cmd descriptor for cmwq execution in process
 		 * context provided by vhost_scsi_workqueue.  This also ensures
@@ -1117,8 +1119,10 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 		if (ret == -ENXIO)
 			break;
 		else if (ret == -EIO)
-			vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out);
+			vhost_scsi_send_bad_target(vs, vq, &vc);
+
 	} while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
+
 out:
 	mutex_unlock(&vq->mutex);
 }
@@ -1140,7 +1144,7 @@ vhost_scsi_send_tmf_reject(struct vhost_scsi *vs,
 
 	ret = copy_to_iter(&rsp, sizeof(rsp), &iov_iter);
 	if (likely(ret == sizeof(rsp)))
-		vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0);
+		vhost_add_used_and_signal(&vs->dev, vq, &vc->head, 0);
 	else
 		pr_err("Faulted on virtio_scsi_ctrl_tmf_resp\n");
 }
@@ -1162,7 +1166,7 @@ vhost_scsi_send_an_resp(struct vhost_scsi *vs,
 
 	ret = copy_to_iter(&rsp, sizeof(rsp), &iov_iter);
 	if (likely(ret == sizeof(rsp)))
-		vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0);
+		vhost_add_used_and_signal(&vs->dev, vq, &vc->head, 0);
 	else
 		pr_err("Faulted on virtio_scsi_ctrl_an_resp\n");
 }
@@ -1269,8 +1273,10 @@ vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 		if (ret == -ENXIO)
 			break;
 		else if (ret == -EIO)
-			vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out);
+			vhost_scsi_send_bad_target(vs, vq, &vc);
+
 	} while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
+
 out:
 	mutex_unlock(&vq->mutex);
 }
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 50ba382f0981..dbe4db0179a5 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2651,6 +2651,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
  * never a valid descriptor number) if none was found.  A negative code is
  * returned on error. */
 int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+		      struct vring_used_elem *used,
 		      struct iovec iov[], unsigned int iov_size,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num)
@@ -2683,7 +2684,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		 * invalid.
 		 */
 		if (vq->avail_idx == last_avail_idx)
-			return vq->num;
+			return -ENOSPC;
 
 		/* Only get avail ring entries after they have been
 		 * exposed by guest.
@@ -2700,6 +2701,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		return -EFAULT;
 	}
 
+	used->id = ring_head;
 	head = vhost16_to_cpu(vq, ring_head);
 
 	/* If their number is silly, that's an error. */
@@ -2787,10 +2789,17 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	/* Assume notifications from guest are disabled at this point,
 	 * if they aren't we would need to update avail_event index. */
 	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
-	return head;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
+static void vhost_set_used_len(struct vhost_virtqueue *vq,
+			       struct vring_used_elem *used, int len)
+{
+	used->len = cpu_to_vhost32(vq, len);
+}
+EXPORT_SYMBOL_GPL(vhost_set_used_len);
+
 static void vhost_withdraw_shadow_used(struct vhost_virtqueue *vq, int count)
 {
 	BUG_ON(count > vq->nheads);
@@ -2860,17 +2869,17 @@ int vhost_get_zc_used_len(struct vhost_virtqueue *vq, int idx)
 EXPORT_SYMBOL_GPL(vhost_get_zc_used_len);
 
 void vhost_set_zc_used(struct vhost_virtqueue *vq,
-			   int idx, unsigned int head, int len)
+		       int idx, struct vring_used_elem *elem, int len)
 {
-	vq->heads[idx].id = head;
-	vq->heads[idx].len = len;
+	vq->heads[idx] = *elem;
+	vhost_set_zc_used_len(vq, idx, len);
 }
 EXPORT_SYMBOL_GPL(vhost_set_zc_used);
 
 void vhost_add_shadow_used(struct vhost_virtqueue *vq,
-			   unsigned int head, int len)
+			   struct vring_used_elem *elem, int len)
 {
-	vhost_set_zc_used(vq, vq->nheads, head, len);
+	vhost_set_zc_used(vq, vq->nheads, elem, len);
 	++vq->nheads;
 }
 EXPORT_SYMBOL_GPL(vhost_add_shadow_used);
@@ -2921,14 +2930,11 @@ EXPORT_SYMBOL_GPL(vhost_add_used_n);
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
+int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
+		   int len)
 {
-	struct vring_used_elem heads = {
-		cpu_to_vhost32(vq, head),
-		cpu_to_vhost32(vq, len)
-	};
-
-	return vhost_add_used_n(vq, &heads, 1);
+	vhost_set_used_len(vq, used, len);
+	return vhost_add_used_n(vq, used, 1);
 }
 EXPORT_SYMBOL_GPL(vhost_add_used);
 
@@ -2981,9 +2987,9 @@ EXPORT_SYMBOL_GPL(vhost_signal);
 /* And here's the combo meal deal.  Supersize me! */
 void vhost_add_used_and_signal(struct vhost_dev *dev,
 			       struct vhost_virtqueue *vq,
-			       unsigned int head, int len)
+			       struct vring_used_elem *used, int len)
 {
-	vhost_add_used(vq, head, len);
+	vhost_add_used(vq, used, len);
 	vhost_signal(dev, vq);
 }
 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 481baba20c3d..f835eefa240c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -233,16 +233,17 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
 bool vhost_log_access_ok(struct vhost_dev *);
 
 int vhost_get_vq_desc(struct vhost_virtqueue *,
+		      struct vring_used_elem *used_elem,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
+int vhost_add_used(struct vhost_virtqueue *, struct vring_used_elem *, int);
 
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       unsigned int id, int len);
+			       struct vring_used_elem *, int);
 
 /* Zerocopy shadow used ring API */
 void vhost_set_zc_used_len(struct vhost_virtqueue *vq,
@@ -250,11 +251,11 @@ void vhost_set_zc_used_len(struct vhost_virtqueue *vq,
 int vhost_get_zc_used_len(struct vhost_virtqueue *vq, int idx);
 void vhost_flush_zc_used_and_signal(struct vhost_virtqueue *vq, int idx, int n);
 void vhost_set_zc_used(struct vhost_virtqueue *vq, int idx,
-		       unsigned int head, int len);
+		       struct vring_used_elem *elem, int len);
 
 /* Non zerocopy shadow used ring API */
-void vhost_add_shadow_used(struct vhost_virtqueue *vq, unsigned int head,
-			   int len);
+void vhost_add_shadow_used(struct vhost_virtqueue *vq,
+			   struct vring_used_elem *elem, int len);
 void vhost_flush_shadow_used_and_signal(struct vhost_virtqueue *vq);
 void vhost_discard_shadow_used(struct vhost_virtqueue *vq, int n);
 int vhost_get_shadow_used_count(struct vhost_virtqueue *vq);
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f94021b450f0..1c962bfdc3a1 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -100,11 +100,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 
 	do {
 		struct virtio_vsock_pkt *pkt;
+		struct vring_used_elem used;
 		struct iov_iter iov_iter;
 		unsigned out, in;
 		size_t nbytes;
 		size_t len;
-		int head;
+		int ret;
 
 		spin_lock_bh(&vsock->send_pkt_list_lock);
 		if (list_empty(&vsock->send_pkt_list)) {
@@ -118,16 +119,9 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		list_del_init(&pkt->list);
 		spin_unlock_bh(&vsock->send_pkt_list_lock);
 
-		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-					 &out, &in, NULL, NULL);
-		if (head < 0) {
-			spin_lock_bh(&vsock->send_pkt_list_lock);
-			list_add(&pkt->list, &vsock->send_pkt_list);
-			spin_unlock_bh(&vsock->send_pkt_list_lock);
-			break;
-		}
-
-		if (head == vq->num) {
+		ret = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
+					&out, &in, NULL, NULL);
+		if (ret == -ENOSPC) {
 			spin_lock_bh(&vsock->send_pkt_list_lock);
 			list_add(&pkt->list, &vsock->send_pkt_list);
 			spin_unlock_bh(&vsock->send_pkt_list_lock);
@@ -141,6 +135,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			}
 			break;
 		}
+		if (ret < 0) {
+			spin_lock_bh(&vsock->send_pkt_list_lock);
+			list_add(&pkt->list, &vsock->send_pkt_list);
+			spin_unlock_bh(&vsock->send_pkt_list_lock);
+			break;
+		}
 
 		if (out) {
 			virtio_transport_free_pkt(pkt);
@@ -148,7 +148,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		len = iov_length(&vq->iov[out], in);
+		len = vhost32_to_cpu(vq, used.len);
 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
 
 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
@@ -165,7 +165,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
+		vhost_add_used(vq, &used, sizeof(pkt->hdr) + pkt->len);
 		added = true;
 
 		if (pkt->reply) {
@@ -360,7 +360,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
 						 dev);
 	struct virtio_vsock_pkt *pkt;
-	int head, pkts = 0, total_len = 0;
+	struct vring_used_elem used;
+	int ret, pkts = 0, total_len = 0;
 	unsigned int out, in;
 	bool added = false;
 
@@ -381,18 +382,17 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 			goto no_more_replies;
 		}
 
-		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-					 &out, &in, NULL, NULL);
-		if (head < 0)
-			break;
-
-		if (head == vq->num) {
+		ret = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
+					&out, &in, NULL, NULL);
+		if (ret == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(vq))) {
 				vhost_disable_notify(vq);
 				continue;
 			}
 			break;
 		}
+		if (ret < 0)
+			break;
 
 		pkt = vhost_vsock_alloc_pkt(vq, out, in);
 		if (!pkt) {
@@ -411,8 +411,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 		else
 			virtio_transport_free_pkt(pkt);
 
-		len += sizeof(pkt->hdr);
-		vhost_add_used(vq, head, len);
+		vhost_add_used(vq, &used, sizeof(pkt->hdr) + len);
 		total_len += len;
 		added = true;
 	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 11/15] vhost: do not use vring_used_elem
From: Jason Wang @ 2019-07-17 10:52 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin
In-Reply-To: <20190717105255.63488-1-jasowang@redhat.com>

Instead of depending on the exported vring_used_elem, this patch
switches to a new internal structure, vhost_used_elem, which embeds
vring_used_elem. This lets vhost record extra metadata for the
upcoming packed ring layout.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 10 +++++-----
 drivers/vhost/scsi.c  |  8 ++++----
 drivers/vhost/vhost.c | 38 +++++++++++++++++++++++---------------
 drivers/vhost/vhost.h | 21 +++++++++++++++------
 drivers/vhost/vsock.c |  4 ++--
 5 files changed, 49 insertions(+), 32 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 572d80c8c36e..7c2f320930c7 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -546,7 +546,7 @@ static void vhost_net_busy_poll(struct vhost_net *net,
 }
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
-				    struct vring_used_elem *used_elem,
+				    struct vhost_used_elem *used_elem,
 				    unsigned int *out_num, unsigned int *in_num,
 				    struct msghdr *msghdr, bool *busyloop_intr)
 {
@@ -596,7 +596,7 @@ static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
 }
 
 static int get_tx_bufs(struct vhost_net *net,
-		       struct vring_used_elem *used_elem,
+		       struct vhost_used_elem *used_elem,
 		       struct vhost_net_virtqueue *nvq,
 		       struct msghdr *msg,
 		       unsigned int *out, unsigned int *in,
@@ -752,7 +752,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 {
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	unsigned out, in;
 	struct msghdr msg = {
 		.msg_name = NULL,
@@ -847,7 +847,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 		.msg_flags = MSG_DONTWAIT,
 	};
 	struct tun_msg_ctl ctl;
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	size_t len, total_len = 0;
 	int err;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
@@ -1027,7 +1027,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 	 * datalen > 0.
 	 */
 	u32 uninitialized_var(len);
-	struct vring_used_elem uninitialized_var(used);
+	struct vhost_used_elem uninitialized_var(used);
 
 	while (datalen > 0 && headcount < quota) {
 		if (unlikely(seg >= UIO_MAXIOV)) {
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 4a5a75ab25ad..42c32612dc32 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -72,7 +72,7 @@ struct vhost_scsi_inflight {
 
 struct vhost_scsi_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
-	struct vring_used_elem tvc_vq_used;
+	struct vhost_used_elem tvc_vq_used;
 	/* virtio-scsi initiator task attribute */
 	int tvc_task_attr;
 	/* virtio-scsi response incoming iovecs */
@@ -213,7 +213,7 @@ struct vhost_scsi {
  * Context for processing request and control queue operations.
  */
 struct vhost_scsi_ctx {
-	struct vring_used_elem head;
+	struct vhost_used_elem head;
 	unsigned int out, in;
 	size_t req_size, rsp_size;
 	size_t out_size, in_size;
@@ -449,7 +449,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	unsigned out, in;
 	int ret;
 
@@ -821,7 +821,7 @@ vhost_scsi_get_desc(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
 				NULL, NULL);
 
 	pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
-		 vc->head.id, vc->out, vc->in);
+		 vc->head.elem.id, vc->out, vc->in);
 
 	/* Nothing new?  Wait for eventfd to tell us they refilled. */
 	if (ret == -ENOSPC) {
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index dbe4db0179a5..6044cdea124f 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2651,7 +2651,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
  * never a valid descriptor number) if none was found.  A negative code is
  * returned on error. */
 int vhost_get_vq_desc(struct vhost_virtqueue *vq,
-		      struct vring_used_elem *used,
+		      struct vhost_used_elem *used,
 		      struct iovec iov[], unsigned int iov_size,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num)
@@ -2701,7 +2701,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		return -EFAULT;
 	}
 
-	used->id = ring_head;
+	used->elem.id = ring_head;
 	head = vhost16_to_cpu(vq, ring_head);
 
 	/* If their number is silly, that's an error. */
@@ -2793,13 +2793,20 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
-static void vhost_set_used_len(struct vhost_virtqueue *vq,
-			       struct vring_used_elem *used, int len)
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+			struct vhost_used_elem *used, int len)
 {
-	used->len = cpu_to_vhost32(vq, len);
+	used->elem.len = cpu_to_vhost32(vq, len);
 }
 EXPORT_SYMBOL_GPL(vhost_set_used_len);
 
+__virtio32 vhost_get_used_len(struct vhost_virtqueue *vq,
+			      struct vhost_used_elem *used)
+{
+	return vhost32_to_cpu(vq, used->elem.len);
+}
+EXPORT_SYMBOL_GPL(vhost_get_used_len);
+
 static void vhost_withdraw_shadow_used(struct vhost_virtqueue *vq, int count)
 {
 	BUG_ON(count > vq->nheads);
@@ -2824,9 +2831,10 @@ void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
 EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
 
 static int __vhost_add_used_n(struct vhost_virtqueue *vq,
-			    struct vring_used_elem *heads,
+			    struct vhost_used_elem *shadow,
 			    unsigned count)
 {
+	struct vring_used_elem *heads = (struct vring_used_elem *)shadow;
 	struct vring_used_elem __user *used;
 	u16 old, new;
 	int start;
@@ -2858,18 +2866,18 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 void vhost_set_zc_used_len(struct vhost_virtqueue *vq,
 			       int idx, int len)
 {
-	vq->heads[idx].len = len;
+	vq->heads[idx].elem.len = len;
 }
 EXPORT_SYMBOL_GPL(vhost_set_zc_used_len);
 
 int vhost_get_zc_used_len(struct vhost_virtqueue *vq, int idx)
 {
-	return vq->heads[idx].len;
+	return vq->heads[idx].elem.len;
 }
 EXPORT_SYMBOL_GPL(vhost_get_zc_used_len);
 
-void vhost_set_zc_used(struct vhost_virtqueue *vq,
-		       int idx, struct vring_used_elem *elem, int len)
+void vhost_set_zc_used(struct vhost_virtqueue *vq, int idx,
+		       struct vhost_used_elem *elem, int len)
 {
 	vq->heads[idx] = *elem;
 	vhost_set_zc_used_len(vq, idx, len);
@@ -2877,7 +2885,7 @@ void vhost_set_zc_used(struct vhost_virtqueue *vq,
 EXPORT_SYMBOL_GPL(vhost_set_zc_used);
 
 void vhost_add_shadow_used(struct vhost_virtqueue *vq,
-			   struct vring_used_elem *elem, int len)
+			   struct vhost_used_elem *elem, int len)
 {
 	vhost_set_zc_used(vq, vq->nheads, elem, len);
 	++vq->nheads;
@@ -2893,7 +2901,7 @@ EXPORT_SYMBOL_GPL(vhost_get_shadow_used_count);
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
 static int vhost_add_used_n(struct vhost_virtqueue *vq,
-			    struct vring_used_elem *heads,
+			    struct vhost_used_elem *heads,
 			    unsigned count)
 {
 	int start, n, r;
@@ -2930,7 +2938,7 @@ EXPORT_SYMBOL_GPL(vhost_add_used_n);
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
+int vhost_add_used(struct vhost_virtqueue *vq, struct vhost_used_elem *used,
 		   int len)
 {
 	vhost_set_used_len(vq, used, len);
@@ -2987,7 +2995,7 @@ EXPORT_SYMBOL_GPL(vhost_signal);
 /* And here's the combo meal deal.  Supersize me! */
 void vhost_add_used_and_signal(struct vhost_dev *dev,
 			       struct vhost_virtqueue *vq,
-			       struct vring_used_elem *used, int len)
+			       struct vhost_used_elem *used, int len)
 {
 	vhost_add_used(vq, used, len);
 	vhost_signal(dev, vq);
@@ -2997,7 +3005,7 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
 /* multi-buffer version of vhost_add_used_and_signal */
 static void vhost_add_used_and_signal_n(struct vhost_dev *dev,
 					struct vhost_virtqueue *vq,
-					struct vring_used_elem *heads,
+					struct vhost_used_elem *heads,
 					unsigned count)
 {
 	vhost_add_used_n(vq, heads, count);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index f835eefa240c..b8a5d1a2bed9 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -37,6 +37,10 @@ struct vhost_poll {
 	struct vhost_dev	 *dev;
 };
 
+struct vhost_used_elem {
+	struct vring_used_elem elem;
+};
+
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 bool vhost_has_work(struct vhost_dev *dev);
@@ -162,7 +166,7 @@ struct vhost_virtqueue {
 	struct iovec iov[UIO_MAXIOV];
 	struct iovec iotlb_iov[64];
 	struct iovec *indirect;
-	struct vring_used_elem *heads;
+	struct vhost_used_elem *heads;
 	int nheads;
 	/* Protected by virtqueue mutex. */
 	struct vhost_umem *umem;
@@ -233,17 +237,22 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
 bool vhost_log_access_ok(struct vhost_dev *);
 
 int vhost_get_vq_desc(struct vhost_virtqueue *,
-		      struct vring_used_elem *used_elem,
+		      struct vhost_used_elem *used_elem,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-int vhost_add_used(struct vhost_virtqueue *, struct vring_used_elem *, int);
+int vhost_add_used(struct vhost_virtqueue *, struct vhost_used_elem *, int);
 
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       struct vring_used_elem *, int);
+			       struct vhost_used_elem *, int);
+
+__virtio32 vhost_get_used_len(struct vhost_virtqueue *vq,
+			      struct vhost_used_elem *used);
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+			struct vhost_used_elem *used, int len);
 
 /* Zerocopy shadow used ring API */
 void vhost_set_zc_used_len(struct vhost_virtqueue *vq,
@@ -251,11 +260,11 @@ void vhost_set_zc_used_len(struct vhost_virtqueue *vq,
 int vhost_get_zc_used_len(struct vhost_virtqueue *vq, int idx);
 void vhost_flush_zc_used_and_signal(struct vhost_virtqueue *vq, int idx, int n);
 void vhost_set_zc_used(struct vhost_virtqueue *vq, int idx,
-		       struct vring_used_elem *elem, int len);
+		       struct vhost_used_elem *elem, int len);
 
 /* Non zerocopy shadow used ring API */
 void vhost_add_shadow_used(struct vhost_virtqueue *vq,
-			   struct vring_used_elem *elem, int len);
+			   struct vhost_used_elem *elem, int len);
 void vhost_flush_shadow_used_and_signal(struct vhost_virtqueue *vq);
 void vhost_discard_shadow_used(struct vhost_virtqueue *vq, int n);
 int vhost_get_shadow_used_count(struct vhost_virtqueue *vq);
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 1c962bfdc3a1..a33e194499cf 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -100,7 +100,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 
 	do {
 		struct virtio_vsock_pkt *pkt;
-		struct vring_used_elem used;
+		struct vhost_used_elem used;
 		struct iov_iter iov_iter;
 		unsigned out, in;
 		size_t nbytes;
@@ -148,7 +148,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		len = vhost32_to_cpu(vq, used.len);
+		len = vhost_get_used_len(&used);
 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
 
 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 09/15] vhost: do not export vhost_add_used_n() and vhost_add_used_and_signal_n()
From: Jason Wang @ 2019-07-17 10:52 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin
In-Reply-To: <20190717105255.63488-1-jasowang@redhat.com>

Devices are now required to use the shadow used ring API, so there is
no need to expose these functions to them.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 13 +++++++------
 drivers/vhost/vhost.h |  4 ----
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5bfca5b76b05..50ba382f0981 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2883,8 +2883,9 @@ EXPORT_SYMBOL_GPL(vhost_get_shadow_used_count);
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
-		     unsigned count)
+static int vhost_add_used_n(struct vhost_virtqueue *vq,
+			    struct vring_used_elem *heads,
+			    unsigned count)
 {
 	int start, n, r;
 
@@ -2988,14 +2989,14 @@ void vhost_add_used_and_signal(struct vhost_dev *dev,
 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
 
 /* multi-buffer version of vhost_add_used_and_signal */
-void vhost_add_used_and_signal_n(struct vhost_dev *dev,
-				 struct vhost_virtqueue *vq,
-				 struct vring_used_elem *heads, unsigned count)
+static void vhost_add_used_and_signal_n(struct vhost_dev *dev,
+					struct vhost_virtqueue *vq,
+					struct vring_used_elem *heads,
+					unsigned count)
 {
 	vhost_add_used_n(vq, heads, count);
 	vhost_signal(dev, vq);
 }
-EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
 
 void vhost_flush_shadow_used_and_signal(struct vhost_virtqueue *vq)
 
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 175eb5ebf954..481baba20c3d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -240,13 +240,9 @@ void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
-		     unsigned count);
 
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
 			       unsigned int id, int len);
-void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
-			       struct vring_used_elem *heads, unsigned count);
 
 /* Zerocopy shadow used ring API */
 void vhost_set_zc_used_len(struct vhost_virtqueue *vq,
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 06/15] vhost_net: switch TX to use shadow used ring API
From: Jason Wang @ 2019-07-17 10:52 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin
In-Reply-To: <20190717105255.63488-1-jasowang@redhat.com>

This patch switches to the shadow used ring API for transmission. This
helps to hide the used ring layout from the device.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ac31983d2d77..cf47e6e348f4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -361,22 +361,22 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
 {
 	struct vhost_net_virtqueue *nvq =
 		container_of(vq, struct vhost_net_virtqueue, vq);
-	int i, add;
+	int i, add, len;
 	int j = 0;
 
 	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
-		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
+		len = vhost_get_zc_used_len(vq, i);
+		if (len == VHOST_DMA_FAILED_LEN)
 			vhost_net_tx_err(net);
-		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
-			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+		if (VHOST_DMA_IS_DONE(len)) {
+			vhost_set_zc_used_len(vq, i, VHOST_DMA_CLEAR_LEN);
 			++j;
 		} else
 			break;
 	}
 	while (j) {
 		add = min(UIO_MAXIOV - nvq->done_idx, j);
-		vhost_add_used_and_signal_n(vq->dev, vq,
-					    &vq->heads[nvq->done_idx], add);
+		vhost_flush_zc_used_and_signal(vq, nvq->done_idx, add);
 		nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
 		j -= add;
 	}
@@ -391,8 +391,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 	rcu_read_lock_bh();
 
 	/* set len to mark this desc buffers done DMA */
-	vq->heads[ubuf->desc].len = success ?
-		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
+	vhost_set_zc_used_len(vq, ubuf->desc, success ?
+			      VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN);
 	cnt = vhost_net_ubuf_put(ubufs);
 
 	/*
@@ -480,7 +480,7 @@ static void vhost_tx_batch(struct vhost_net *net,
 	}
 
 signal_used:
-	vhost_net_signal_used(nvq);
+	vhost_flush_shadow_used_and_signal(&nvq->vq);
 	nvq->batched_xdp = 0;
 }
 
@@ -776,7 +776,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 	do {
 		bool busyloop_intr = false;
 
-		if (nvq->done_idx == VHOST_NET_BATCH)
+		if (vhost_get_shadow_used_count(vq) == VHOST_NET_BATCH)
 			vhost_tx_batch(net, nvq, sock, &msg);
 
 		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
@@ -835,9 +835,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 			pr_debug("Truncated TX packet: len %d != %zd\n",
 				 err, len);
 done:
-		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
-		vq->heads[nvq->done_idx].len = 0;
-		++nvq->done_idx;
+		vhost_add_shadow_used(vq, cpu_to_vhost32(vq, head), 0);
 	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
 
 	vhost_tx_batch(net, nvq, sock, &msg);
@@ -908,9 +906,10 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 			msg.msg_control = NULL;
 			ubufs = NULL;
 		}
-		vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
-		vq->heads[nvq->upend_idx].len = zcopy_used ?
-			 VHOST_DMA_IN_PROGRESS : VHOST_DMA_DONE_LEN;
+		vhost_set_zc_used(vq, nvq->upend_idx,
+				  cpu_to_vhost32(vq, head),
+				  zcopy_used ? VHOST_DMA_IN_PROGRESS :
+				  VHOST_DMA_DONE_LEN);
 		nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
 		total_len += len;
 		if (tx_can_batch(vq, total_len) &&
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 07/15] vhost_net: calculate last used length once for mergeable buffer
From: Jason Wang @ 2019-07-17 10:52 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin
In-Reply-To: <20190717105255.63488-1-jasowang@redhat.com>

This patch tries to calculate the last used length once instead of
twice. This helps with converting RX to the shadow used ring API.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index cf47e6e348f4..1a67f889cbc1 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1065,12 +1065,12 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 		}
 		heads[headcount].id = cpu_to_vhost32(vq, d);
 		len = iov_length(vq->iov + seg, in);
-		heads[headcount].len = cpu_to_vhost32(vq, len);
 		datalen -= len;
+		heads[headcount].len = cpu_to_vhost32(vq,
+				       datalen >= 0 ? len : len + datalen);
 		++headcount;
 		seg += in;
 	}
-	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
 	*iovcount = seg;
 	if (unlikely(log))
 		*log_num = nlogs;
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 05/15] vhost: introduce helpers to manipulate shadow used ring
From: Jason Wang @ 2019-07-17 10:52 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin
In-Reply-To: <20190717105255.63488-1-jasowang@redhat.com>

We open-code vq->heads[] in net.c for:

1) implementing batching, which is in fact a shadow used ring
   implementation.
2) maintaining pending heads in order, which is in fact another kind of
   shadow used ring.

But this exposes the used ring layout to the device, which makes it hard
to introduce new kinds of rings such as the packed virtqueue. So this
patch introduces two types of shadow used ring API:

1) shadow used ring API for batch updating of used heads
2) zerocopy shadow used API for maintaining pending heads and batch
   updating used heads

This helps to hide the used ring layout from the device. A device should
not mix the two kinds of APIs.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 95 +++++++++++++++++++++++++++++++++++++------
 drivers/vhost/vhost.h | 18 ++++++++
 2 files changed, 100 insertions(+), 13 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e781db88dfca..5bfca5b76b05 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -486,6 +486,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 #if VHOST_ARCH_CAN_ACCEL_UACCESS
 	vhost_reset_vq_maps(vq);
 #endif
+	vq->nheads = 0;
 }
 
 static int vhost_worker(void *data)
@@ -2790,25 +2791,28 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
-/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
+static void vhost_withdraw_shadow_used(struct vhost_virtqueue *vq, int count)
 {
-	vq->last_avail_idx -= n;
+	BUG_ON(count > vq->nheads);
+	vq->nheads -= count;
 }
-EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
 
-/* After we've used one of their buffers, we tell them about it.  We'll then
- * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
+/* Reverse the effect of vhost_get_vq_desc and
+ * vhost_add_shadow_used. Useful for error handleing
+ */
+void vhost_discard_shadow_used(struct vhost_virtqueue *vq, int n)
 {
-	struct vring_used_elem heads = {
-		cpu_to_vhost32(vq, head),
-		cpu_to_vhost32(vq, len)
-	};
+	vhost_withdraw_shadow_used(vq, n);
+	vhost_discard_vq_desc(vq, n);
+}
+EXPORT_SYMBOL_GPL(vhost_discard_shadow_used);
 
-	return vhost_add_used_n(vq, &heads, 1);
+/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
+void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
+{
+	vq->last_avail_idx -= n;
 }
-EXPORT_SYMBOL_GPL(vhost_add_used);
+EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
 
 static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 			    struct vring_used_elem *heads,
@@ -2842,6 +2846,41 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 	return 0;
 }
 
+void vhost_set_zc_used_len(struct vhost_virtqueue *vq,
+			       int idx, int len)
+{
+	vq->heads[idx].len = len;
+}
+EXPORT_SYMBOL_GPL(vhost_set_zc_used_len);
+
+int vhost_get_zc_used_len(struct vhost_virtqueue *vq, int idx)
+{
+	return vq->heads[idx].len;
+}
+EXPORT_SYMBOL_GPL(vhost_get_zc_used_len);
+
+void vhost_set_zc_used(struct vhost_virtqueue *vq,
+			   int idx, unsigned int head, int len)
+{
+	vq->heads[idx].id = head;
+	vq->heads[idx].len = len;
+}
+EXPORT_SYMBOL_GPL(vhost_set_zc_used);
+
+void vhost_add_shadow_used(struct vhost_virtqueue *vq,
+			   unsigned int head, int len)
+{
+	vhost_set_zc_used(vq, vq->nheads, head, len);
+	++vq->nheads;
+}
+EXPORT_SYMBOL_GPL(vhost_add_shadow_used);
+
+int vhost_get_shadow_used_count(struct vhost_virtqueue *vq)
+{
+	return vq->nheads;
+}
+EXPORT_SYMBOL_GPL(vhost_get_shadow_used_count);
+
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
 int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
@@ -2879,6 +2918,19 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
 }
 EXPORT_SYMBOL_GPL(vhost_add_used_n);
 
+/* After we've used one of their buffers, we tell them about it.  We'll then
+ * want to notify the guest, using eventfd. */
+int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
+{
+	struct vring_used_elem heads = {
+		cpu_to_vhost32(vq, head),
+		cpu_to_vhost32(vq, len)
+	};
+
+	return vhost_add_used_n(vq, &heads, 1);
+}
+EXPORT_SYMBOL_GPL(vhost_add_used);
+
 static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
 	__u16 old, new;
@@ -2945,6 +2997,23 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev,
 }
 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
 
+void vhost_flush_shadow_used_and_signal(struct vhost_virtqueue *vq)
+
+{
+	if (!vq->nheads)
+		return;
+
+	vhost_add_used_and_signal_n(vq->dev, vq, vq->heads, vq->nheads);
+	vq->nheads = 0;
+}
+EXPORT_SYMBOL_GPL(vhost_flush_shadow_used_and_signal);
+
+void vhost_flush_zc_used_and_signal(struct vhost_virtqueue *vq, int idx, int n)
+{
+	vhost_add_used_and_signal_n(vq->dev, vq, &vq->heads[idx], n);
+}
+EXPORT_SYMBOL_GPL(vhost_flush_zc_used_and_signal);
+
 /* return true if we're sure that avaiable ring is empty */
 bool vhost_vq_avail_empty(struct vhost_virtqueue *vq)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index e054f178d8b0..175eb5ebf954 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -163,6 +163,7 @@ struct vhost_virtqueue {
 	struct iovec iotlb_iov[64];
 	struct iovec *indirect;
 	struct vring_used_elem *heads;
+	int nheads;
 	/* Protected by virtqueue mutex. */
 	struct vhost_umem *umem;
 	struct vhost_umem *iotlb;
@@ -241,10 +242,27 @@ int vhost_vq_init_access(struct vhost_virtqueue *);
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
 int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
 		     unsigned count);
+
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
 			       unsigned int id, int len);
 void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
 			       struct vring_used_elem *heads, unsigned count);
+
+/* Zerocopy shadow used ring API */
+void vhost_set_zc_used_len(struct vhost_virtqueue *vq,
+			   int idx, int len);
+int vhost_get_zc_used_len(struct vhost_virtqueue *vq, int idx);
+void vhost_flush_zc_used_and_signal(struct vhost_virtqueue *vq, int idx, int n);
+void vhost_set_zc_used(struct vhost_virtqueue *vq, int idx,
+		       unsigned int head, int len);
+
+/* Non zerocopy shadow used ring API */
+void vhost_add_shadow_used(struct vhost_virtqueue *vq, unsigned int head,
+			   int len);
+void vhost_flush_shadow_used_and_signal(struct vhost_virtqueue *vq);
+void vhost_discard_shadow_used(struct vhost_virtqueue *vq, int n);
+int vhost_get_shadow_used_count(struct vhost_virtqueue *vq);
+
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
 void vhost_disable_notify(struct vhost_virtqueue *vq);
 bool vhost_vq_avail_empty(struct vhost_virtqueue *vq);
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 04/15] vhost-net: don't use vhost_add_used_n() for zerocopy
From: Jason Wang @ 2019-07-17 10:52 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin
In-Reply-To: <20190717105255.63488-1-jasowang@redhat.com>

We tried to use vhost_add_used_n() for the packets that are not
zero-copied. This can help to mitigate the HOL issue but is not a total
solution. What's more, it may lead to out-of-order completion and cause
extra complexity for the packed virtqueue implementation, which needs to
maintain wrap counters.

So this patch switches to always using vq->heads[] to maintain
heads. This will ease the introduction of the zerocopy shadow used ring
API and reduce the complexity for packed virtqueues.

After this, vhost_net becomes an in-order device.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 78d248574f8e..ac31983d2d77 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -894,9 +894,6 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 		if (zcopy_used) {
 			struct ubuf_info *ubuf;
 			ubuf = nvq->ubuf_info + nvq->upend_idx;
-
-			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
-			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
@@ -907,11 +904,14 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 			msg.msg_controllen = sizeof(ctl);
 			ubufs = nvq->ubufs;
 			atomic_inc(&ubufs->refcount);
-			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
 		} else {
 			msg.msg_control = NULL;
 			ubufs = NULL;
 		}
+		vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
+		vq->heads[nvq->upend_idx].len = zcopy_used ?
+			 VHOST_DMA_IN_PROGRESS : VHOST_DMA_DONE_LEN;
+		nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
 		total_len += len;
 		if (tx_can_batch(vq, total_len) &&
 		    likely(!vhost_exceeds_maxpend(net))) {
@@ -923,11 +923,10 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(sock, &msg, len);
 		if (unlikely(err < 0)) {
-			if (zcopy_used) {
+			if (zcopy_used)
 				vhost_net_ubuf_put(ubufs);
-				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
-					% UIO_MAXIOV;
-			}
+			nvq->upend_idx = ((unsigned int)nvq->upend_idx - 1)
+					 % UIO_MAXIOV;
 			vhost_discard_vq_desc(vq, 1);
 			vhost_net_enable_vq(net, vq);
 			break;
@@ -935,10 +934,8 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		if (!zcopy_used)
-			vhost_add_used_and_signal(&net->dev, vq, head, 0);
-		else
-			vhost_zerocopy_signal_used(net, vq);
+
+		vhost_zerocopy_signal_used(net, vq);
 		vhost_net_tx_packet(net);
 	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
 }
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 03/15] vhost: remove unnecessary parameter of vhost_enable_notify()/vhost_disable_notify
From: Jason Wang @ 2019-07-17 10:52 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin
In-Reply-To: <20190717105255.63488-1-jasowang@redhat.com>

Its dev parameter is not even used, so remove it.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 25 ++++++++++++-------------
 drivers/vhost/scsi.c  | 12 ++++++------
 drivers/vhost/test.c  |  6 +++---
 drivers/vhost/vhost.c |  4 ++--
 drivers/vhost/vhost.h |  4 ++--
 drivers/vhost/vsock.c | 14 +++++++-------
 6 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7d34e8cbc89b..78d248574f8e 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -500,8 +500,8 @@ static void vhost_net_busy_poll_try_queue(struct vhost_net *net,
 {
 	if (!vhost_vq_avail_empty(vq)) {
 		vhost_poll_queue(&vq->poll);
-	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
-		vhost_disable_notify(&net->dev, vq);
+	} else if (unlikely(vhost_enable_notify(vq))) {
+		vhost_disable_notify(vq);
 		vhost_poll_queue(&vq->poll);
 	}
 }
@@ -524,7 +524,7 @@ static void vhost_net_busy_poll(struct vhost_net *net,
 	if (!mutex_trylock(&vq->mutex))
 		return;
 
-	vhost_disable_notify(&net->dev, vq);
+	vhost_disable_notify(vq);
 	sock = rvq->private_data;
 
 	busyloop_timeout = poll_rx ? rvq->busyloop_timeout:
@@ -552,7 +552,7 @@ static void vhost_net_busy_poll(struct vhost_net *net,
 	if (poll_rx || sock_has_rx_data(sock))
 		vhost_net_busy_poll_try_queue(net, vq);
 	else if (!poll_rx) /* On tx here, sock has no rx data. */
-		vhost_enable_notify(&net->dev, rvq);
+		vhost_enable_notify(rvq);
 
 	mutex_unlock(&vq->mutex);
 }
@@ -788,9 +788,8 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 		if (head == vq->num) {
 			if (unlikely(busyloop_intr)) {
 				vhost_poll_queue(&vq->poll);
-			} else if (unlikely(vhost_enable_notify(&net->dev,
-								vq))) {
-				vhost_disable_notify(&net->dev, vq);
+			} else if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
 				continue;
 			}
 			break;
@@ -880,8 +879,8 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 		if (head == vq->num) {
 			if (unlikely(busyloop_intr)) {
 				vhost_poll_queue(&vq->poll);
-			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
-				vhost_disable_notify(&net->dev, vq);
+			} else if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
 				continue;
 			}
 			break;
@@ -960,7 +959,7 @@ static void handle_tx(struct vhost_net *net)
 	if (!vq_meta_prefetch(vq))
 		goto out;
 
-	vhost_disable_notify(&net->dev, vq);
+	vhost_disable_notify(vq);
 	vhost_net_disable_vq(net, vq);
 
 	if (vhost_sock_zcopy(sock))
@@ -1129,7 +1128,7 @@ static void handle_rx(struct vhost_net *net)
 	if (!vq_meta_prefetch(vq))
 		goto out;
 
-	vhost_disable_notify(&net->dev, vq);
+	vhost_disable_notify(vq);
 	vhost_net_disable_vq(net, vq);
 
 	vhost_hlen = nvq->vhost_hlen;
@@ -1156,10 +1155,10 @@ static void handle_rx(struct vhost_net *net)
 		if (!headcount) {
 			if (unlikely(busyloop_intr)) {
 				vhost_poll_queue(&vq->poll);
-			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+			} else if (unlikely(vhost_enable_notify(vq))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */
-				vhost_disable_notify(&net->dev, vq);
+				vhost_disable_notify(vq);
 				continue;
 			}
 			/* Nothing new?  Wait for eventfd to tell us
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index a9caf1bc3c3e..8d4e87007a8d 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -458,7 +458,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	}
 
 again:
-	vhost_disable_notify(&vs->dev, vq);
+	vhost_disable_notify(vq);
 	head = vhost_get_vq_desc(vq, vq->iov,
 			ARRAY_SIZE(vq->iov), &out, &in,
 			NULL, NULL);
@@ -467,7 +467,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 		return;
 	}
 	if (head == vq->num) {
-		if (vhost_enable_notify(&vs->dev, vq))
+		if (vhost_enable_notify(vq))
 			goto again;
 		vs->vs_events_missed = true;
 		return;
@@ -828,8 +828,8 @@ vhost_scsi_get_desc(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
 
 	/* Nothing new?  Wait for eventfd to tell us they refilled. */
 	if (vc->head == vq->num) {
-		if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
-			vhost_disable_notify(&vs->dev, vq);
+		if (unlikely(vhost_enable_notify(vq))) {
+			vhost_disable_notify(vq);
 			ret = -EAGAIN;
 		}
 		goto done;
@@ -936,7 +936,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	memset(&vc, 0, sizeof(vc));
 	vc.rsp_size = sizeof(struct virtio_scsi_cmd_resp);
 
-	vhost_disable_notify(&vs->dev, vq);
+	vhost_disable_notify(vq);
 
 	do {
 		ret = vhost_scsi_get_desc(vs, vq, &vc);
@@ -1189,7 +1189,7 @@ vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 
 	memset(&vc, 0, sizeof(vc));
 
-	vhost_disable_notify(&vs->dev, vq);
+	vhost_disable_notify(vq);
 
 	do {
 		ret = vhost_scsi_get_desc(vs, vq, &vc);
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index 40589850eb33..746f5d439153 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -50,7 +50,7 @@ static void handle_vq(struct vhost_test *n)
 		return;
 	}
 
-	vhost_disable_notify(&n->dev, vq);
+	vhost_disable_notify(vq);
 
 	for (;;) {
 		head = vhost_get_vq_desc(vq, vq->iov,
@@ -62,8 +62,8 @@ static void handle_vq(struct vhost_test *n)
 			break;
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
 		if (head == vq->num) {
-			if (unlikely(vhost_enable_notify(&n->dev, vq))) {
-				vhost_disable_notify(&n->dev, vq);
+			if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
 				continue;
 			}
 			break;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ec3534bcd51b..e781db88dfca 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2964,7 +2964,7 @@ bool vhost_vq_avail_empty(struct vhost_virtqueue *vq)
 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
 
 /* OK, now we need to know about added descriptors. */
-bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+bool vhost_enable_notify(struct vhost_virtqueue *vq)
 {
 	__virtio16 avail_idx;
 	int r;
@@ -3002,7 +3002,7 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 EXPORT_SYMBOL_GPL(vhost_enable_notify);
 
 /* We don't need to be notified again. */
-void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+void vhost_disable_notify(struct vhost_virtqueue *vq)
 {
 	int r;
 
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index e0451c900177..e054f178d8b0 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -246,9 +246,9 @@ void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
 void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
 			       struct vring_used_elem *heads, unsigned count);
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
-void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
+void vhost_disable_notify(struct vhost_virtqueue *vq);
 bool vhost_vq_avail_empty(struct vhost_virtqueue *vq);
-bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
+bool vhost_enable_notify(struct vhost_virtqueue *vq);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len,
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 814bed72d793..f94021b450f0 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -96,7 +96,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		goto out;
 
 	/* Avoid further vmexits, we're already processing the virtqueue */
-	vhost_disable_notify(&vsock->dev, vq);
+	vhost_disable_notify(vq);
 
 	do {
 		struct virtio_vsock_pkt *pkt;
@@ -109,7 +109,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		spin_lock_bh(&vsock->send_pkt_list_lock);
 		if (list_empty(&vsock->send_pkt_list)) {
 			spin_unlock_bh(&vsock->send_pkt_list_lock);
-			vhost_enable_notify(&vsock->dev, vq);
+			vhost_enable_notify(vq);
 			break;
 		}
 
@@ -135,8 +135,8 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			/* We cannot finish yet if more buffers snuck in while
 			 * re-enabling notify.
 			 */
-			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
-				vhost_disable_notify(&vsock->dev, vq);
+			if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
 				continue;
 			}
 			break;
@@ -369,7 +369,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 	if (!vq->private_data)
 		goto out;
 
-	vhost_disable_notify(&vsock->dev, vq);
+	vhost_disable_notify(vq);
 	do {
 		u32 len;
 
@@ -387,8 +387,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 			break;
 
 		if (head == vq->num) {
-			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
-				vhost_disable_notify(&vsock->dev, vq);
+			if (unlikely(vhost_enable_notify(vq))) {
+				vhost_disable_notify(vq);
 				continue;
 			}
 			break;
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 02/15] vhost: remove the unnecessary parameter of vhost_vq_avail_empty()
From: Jason Wang @ 2019-07-17 10:52 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin
In-Reply-To: <20190717105255.63488-1-jasowang@redhat.com>

Its dev parameter is not even used, so remove it.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 8 ++++----
 drivers/vhost/vhost.c | 2 +-
 drivers/vhost/vhost.h | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 3beb401235c0..7d34e8cbc89b 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -498,7 +498,7 @@ static int sock_has_rx_data(struct socket *sock)
 static void vhost_net_busy_poll_try_queue(struct vhost_net *net,
 					  struct vhost_virtqueue *vq)
 {
-	if (!vhost_vq_avail_empty(&net->dev, vq)) {
+	if (!vhost_vq_avail_empty(vq)) {
 		vhost_poll_queue(&vq->poll);
 	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 		vhost_disable_notify(&net->dev, vq);
@@ -540,8 +540,8 @@ static void vhost_net_busy_poll(struct vhost_net *net,
 		}
 
 		if ((sock_has_rx_data(sock) &&
-		     !vhost_vq_avail_empty(&net->dev, rvq)) ||
-		    !vhost_vq_avail_empty(&net->dev, tvq))
+		     !vhost_vq_avail_empty(rvq)) ||
+		    !vhost_vq_avail_empty(tvq))
 			break;
 
 		cpu_relax();
@@ -638,7 +638,7 @@ static int get_tx_bufs(struct vhost_net *net,
 static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
 {
 	return total_len < VHOST_NET_WEIGHT &&
-	       !vhost_vq_avail_empty(vq->dev, vq);
+	       !vhost_vq_avail_empty(vq);
 }
 
 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 7f51c74d9aee..ec3534bcd51b 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2946,7 +2946,7 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev,
 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
 
 /* return true if we're sure that avaiable ring is empty */
-bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+bool vhost_vq_avail_empty(struct vhost_virtqueue *vq)
 {
 	__virtio16 avail_idx;
 	int r;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 819296332913..e0451c900177 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -247,7 +247,7 @@ void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
 			       struct vring_used_elem *heads, unsigned count);
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
 void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
-bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
+bool vhost_vq_avail_empty(struct vhost_virtqueue *vq);
 bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 00/15] Packed virtqueue support for vhost
From: Jason Wang @ 2019-07-17 10:52 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, tiwei.bie,
	maxime.coquelin

Hi all:

This series implements packed virtqueues, which were described
at [1]. In this version we try to address the performance regression
seen in V2. The root cause is that packed virtqueues need more
userspace memory accesses, which turn out to be very
expensive. Thanks to the help of 7f466032dc9e ("vhost: access vq
metadata through kernel virtual address"), such overhead could be
eliminated. So in this version, we can see about 2% improvement for
packed virtqueue on PPS.

More optimizations (e.g. IN_ORDER) are on the way.

Please review.

[1] https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-610007

This version was tested with:
- zerocopy/datacopy
- mergeable buffer on/off
- TCP stream & virtio-user

Changes from V2:
- rebase on top of vhost metadata acceleration series
- introduce shadow used ring API
- new SET_VRING_BASE/GET_VRING_BASE that takes care of the wrap counter
  and index for both avail and used
- various tweaks

Changes from V1:
- drop uapi patch and use Tiwei's
- split the enablement of packed virtqueue into a separate patch

Changes from RFC V5:
- save unnecessary barriers during vhost_add_used_packed_n()
- more compact math for event idx
- fix failure of SET_VRING_BASE when avail_wrap_counter is true
- fix not copy avail_wrap_counter during GET_VRING_BASE
- introduce SET_VRING_USED_BASE/GET_VRING_USED_BASE for syncing
  last_used_idx
- rename used_wrap_counter to last_used_wrap_counter
- rebase to net-next

Changes from RFC V4:
- fix signalled_used index recording
- track avail index correctly
- various minor fixes

Changes from RFC V3:
- Fix math on event idx checking
- Sync last avail wrap counter through GET/SET_VRING_BASE
- remove desc_event prefix in the driver/device structure

Changes from RFC V2:
- do not use & in checking desc_event_flags
- off should be most significant bit
- remove the workaround of mergeable buffer for dpdk prototype
- id should be in the last descriptor in the chain
- keep _F_WRITE for write descriptor when adding used
- device flags updating should use ADDR_USED type
- return error on unexpected unavail descriptor in a chain
- return false in vhost_vq_avail_empty if descriptor is available
- track last seen avail_wrap_counter
- correctly examine available descriptor in get_indirect_packed()
- vhost_idx_diff should return u16 instead of bool

Changes from RFC V1:
- Refactor vhost used elem code to avoid open coding on used elem
- Event suppression support (compile test only).
- Indirect descriptor support (compile test only).
- Zerocopy support.
- vIOMMU support.
- SCSI/VSOCK support (compile test only).
- Fix several bugs

Jason Wang (15):
  vhost: simplify meta data pointer accessing
  vhost: remove the unnecessary parameter of vhost_vq_avail_empty()
  vhost: remove unnecessary parameter of
    vhost_enable_notify()/vhost_disable_notify
  vhost-net: don't use vhost_add_used_n() for zerocopy
  vhost: introduce helpers to manipulate shadow used ring
  vhost_net: switch TX to use shadow used ring API
  vhost_net: calculate last used length once for mergeable buffer
  vhost_net: switch to use shadow used ring API for RX
  vhost: do not export vhost_add_used_n() and
    vhost_add_used_and_signal_n()
  vhost: hide used ring layout from device
  vhost: do not use vring_used_elem
  vhost: vhost_put_user() can accept metadata type
  vhost: packed ring support
  vhost: event suppression for packed ring
  vhost: enable packed virtqueues

 drivers/vhost/net.c   |  200 +++---
 drivers/vhost/scsi.c  |   72 +-
 drivers/vhost/test.c  |    6 +-
 drivers/vhost/vhost.c | 1508 +++++++++++++++++++++++++++++++++++------
 drivers/vhost/vhost.h |   78 ++-
 drivers/vhost/vsock.c |   57 +-
 6 files changed, 1513 insertions(+), 408 deletions(-)

-- 
2.18.1


^ permalink raw reply

* Re: [PATCH net] be2net: Signal that the device cannot transmit during reconfiguration
From: Firo Yang @ 2019-07-17 10:25 UTC (permalink / raw)
  To: Benjamin Poirier
  Cc: Ajit Khaparde, Sathya Perla, Somnath Kotur, Sriharsha Basavapatna,
	David Miller, Saeed Mahameed, netdev@vger.kernel.org
In-Reply-To: <20190717093208.GA6511@f1>

Crystal clear. Many thanks.

// Firo

^ permalink raw reply

* Re: [PATCH bpf] bpf: fix narrower loads on s390
From: Ilya Leoshkevich @ 2019-07-17 10:36 UTC (permalink / raw)
  To: Y Song; +Cc: bpf, netdev, gor, heiko.carstens
In-Reply-To: <98C6AA13-A44D-4FF1-BA73-1BD446BD773A@linux.ibm.com>

> Am 17.07.2019 um 11:21 schrieb Ilya Leoshkevich <iii@linux.ibm.com>:
> 
>> Am 17.07.2019 um 07:11 schrieb Y Song <ys114321@gmail.com>:
>> 
>> [sorry, resend again as previous one has come text messed out due to
>> networking issues]
>> 
>> On Tue, Jul 16, 2019 at 10:08 PM Y Song <ys114321@gmail.com> wrote:
>>> 
>>> On Tue, Jul 16, 2019 at 4:59 AM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>>>> 
>>>> test_pkt_md_access is failing on s390, since the associated eBPF prog
>>>> returns TC_ACT_SHOT, which in turn happens because loading a part of a
>>>> struct __sk_buff field produces an incorrect result.
>>>> 
>>>> The problem is that when verifier emits the code to replace partial load
>>>> of a field with a full load, a shift and a bitwise AND, it assumes that
>>>> the machine is little endian.
>>>> 
>>>> Adjust shift count calculation to account for endianness.
>>>> 
>>>> Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields")
>>>> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
>>>> ---
>>>> kernel/bpf/verifier.c | 8 ++++++--
>>>> 1 file changed, 6 insertions(+), 2 deletions(-)
>>>> 
>>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>>> index 5900cbb966b1..3f9353653558 100644
>>>> --- a/kernel/bpf/verifier.c
>>>> +++ b/kernel/bpf/verifier.c
>>>> @@ -8616,8 +8616,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
>>>>               }
>>>> 
>>>>               if (is_narrower_load && size < target_size) {
>>>> -                       u8 shift = (off & (size_default - 1)) * 8;
>>>> -
>>>> +                       u8 load_off = off & (size_default - 1);
>>>> +#ifdef __LITTLE_ENDIAN
>>>> +                       u8 shift = load_off * 8;
>>>> +#else
>>>> +                       u8 shift = (size_default - (load_off + size)) * 8;
>>>> +#endif
>>> 
>> All the values are in register. The shifting operations should be the
>> same for big endian and little endian, e.g., value 64 >> 2 = 16 when
>> value "64" is in register. So I did not see a problem here.
>> 
>> Could you elaborate which field access in test_pkt_md_access
>> caused problem?
> 
> The very first one: TEST_FIELD(__u8,  len, 0xFF);
> 
>> It would be good if you can give detailed memory layout and register values
>> to illustrate the problem.
> 
> Suppose len = 0x11223344. On a big endian system, this would be
> 
> 11 22 33 44
> 
> Now, we would like to do *(u8 *)&len, the desired result is 0x11.
> Verifier should emit the following: ((*(u32 *)&len) >> 24) & 0xff, but as
> of today it misses the shift.
> 
> On a little endian system the layout is:
> 
> 44 33 22 11
> 
> and the desired result is different - 0x44. Verifier correctly emits
> (*(u32 *)&len) & 0xff.

I’ve just realized, that this example does not reflect what the test is
doing on big-endian systems (there is an #ifdef for those).

Here is a better one: len=0x11223344 and we would like to do
((u8 *)&len)[3].

len is represented as `11 22 33 44` in memory, so the desired result is
0x44. It can be obtained by doing (*(u32 *)&len) & 0xff, but today the
verifier does ((*(u32 *)&len) >> 24) & 0xff instead.

^ permalink raw reply

* [PATCH v2] net/mlx5: Replace kfree with kvfree
From: Chuhong Yuan @ 2019-07-17 10:14 UTC (permalink / raw)
  Cc: Saeed Mahameed, Leon Romanovsky, David S . Miller, netdev,
	linux-rdma, linux-kernel, Chuhong Yuan

A variable allocated by kvmalloc should not be freed by kfree,
because it may have been allocated by vmalloc.
So replace kfree with kvfree here.

Fixes: 9b1f298236057 ("net/mlx5: Add support for FW fatal reporter dump")
Signed-off-by: Chuhong Yuan <hslester96@gmail.com>
---
Changes in v2:
  - Add corresponding Fixes tag

 drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 2fe6923f7ce0..9314777d99e3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -597,7 +597,7 @@ mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter,
 	err = devlink_fmsg_arr_pair_nest_end(fmsg);
 
 free_data:
-	kfree(cr_data);
+	kvfree(cr_data);
 	return err;
 }
 
-- 
2.20.1


^ permalink raw reply related

* Re: KASAN: use-after-free Write in check_noncircular
From: Tetsuo Handa @ 2019-07-17 10:13 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: syzbot, ast, daniel, john.fastabend, linux-kernel, netdev,
	syzkaller-bugs
In-Reply-To: <0000000000001e443b058ddcb128@google.com>

This is not a TOMOYO bug. But

On 2019/07/17 17:58, syzbot wrote:
> ==================================================================
> BUG: KASAN: use-after-free in check_noncircular+0x91/0x560 kernel/locking/lockdep.c:1722
> Write of size 56 at addr ffff888089815160 by task syz-executor.4/8772
> 
> CPU: 1 PID: 8772 Comm: syz-executor.4 Not tainted 5.2.0+ #31
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> Call Trace:
> 

what happened here? No trace was printed to console output?

> Allocated by task 8457:

^ permalink raw reply

* Re: WARNING: held lock freed in nr_release
From: syzbot @ 2019-07-17 10:05 UTC (permalink / raw)
  To: davem, linux-hams, linux-kernel, netdev, ralf, syzkaller-bugs,
	xiyou.wangcong
In-Reply-To: <00000000000015d943058ddcb1b3@google.com>

syzbot has bisected this bug to:

commit c8c8218ec5af5d2598381883acbefbf604e56b5e
Author: Cong Wang <xiyou.wangcong@gmail.com>
Date:   Thu Jun 27 21:30:58 2019 +0000

     netrom: fix a memory leak in nr_rx_frame()

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=14022e8fa00000
start commit:   a5b64700 fix: taprio: Change type of txtime-delay paramete..
git tree:       net
final crash:    https://syzkaller.appspot.com/x/report.txt?x=16022e8fa00000
console output: https://syzkaller.appspot.com/x/log.txt?x=12022e8fa00000
kernel config:  https://syzkaller.appspot.com/x/.config?x=87305c3ca9c25c70
dashboard link: https://syzkaller.appspot.com/bug?extid=a34e5f3d0300163f0c87
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=1460b458600000

Reported-by: syzbot+a34e5f3d0300163f0c87@syzkaller.appspotmail.com
Fixes: c8c8218ec5af ("netrom: fix a memory leak in nr_rx_frame()")

For information about bisection process see: https://goo.gl/tpsmEJ#bisection

^ permalink raw reply

* Re: [PATCH AUTOSEL 5.2 226/249] selftests: bpf: fix inlines in test_lwt_seg6local
From: Jiri Benc @ 2019-07-17  9:43 UTC (permalink / raw)
  To: Sasha Levin
  Cc: linux-kernel, stable, Yonghong Song, Daniel Borkmann,
	linux-kselftest, netdev, bpf, clang-built-linux
In-Reply-To: <20190715134655.4076-226-sashal@kernel.org>

On Mon, 15 Jul 2019 09:46:31 -0400, Sasha Levin wrote:
> From: Jiri Benc <jbenc@redhat.com>
> 
> [ Upstream commit 11aca65ec4db09527d3e9b6b41a0615b7da4386b ]
> 
> Selftests are reporting this failure in test_lwt_seg6local.sh:

I don't think this is critical in any way and I don't think this is
stable material. How was this selected?

 Jiri

^ permalink raw reply

* Re: [PATCH net] be2net: Signal that the device cannot transmit during reconfiguration
From: Benjamin Poirier @ 2019-07-17  9:32 UTC (permalink / raw)
  To: Firo Yang
  Cc: Ajit Khaparde, Sathya Perla, Somnath Kotur, Sriharsha Basavapatna,
	David Miller, Saeed Mahameed, netdev@vger.kernel.org
In-Reply-To: <CH2PR18MB3189AD09E590F16443D8D5BA88C90@CH2PR18MB3189.namprd18.prod.outlook.com>

On 2019/07/17 17:56, Firo Yang wrote:
> I don't think this change could fix this problem because if SMP, dev_watchdog() could run on a different CPU.

hmm, SMP is clearly part of the picture here. The change I proposed
revolves around the synchronization offered by dev->tx_global_lock:

we have
\ dev_watchdog
	\ netif_tx_lock
		spin_lock(&dev->tx_global_lock);
	...
	\ netif_tx_unlock

and

\ be_update_queues
	\ netif_tx_lock_bh
		\ netif_tx_lock
			spin_lock(&dev->tx_global_lock);

Makes sense?

^ permalink raw reply

* [PATCH] usb: qmi_wwan: add D-Link DWM-222 A2 device ID
From: Rogan Dawes @ 2019-07-17  9:14 UTC (permalink / raw)
  To: Bjørn Mork; +Cc: David S. Miller, netdev, linux-usb
In-Reply-To: <20190717091134.GA5179@lisa.dawes.za.net>

Signed-off-by: Rogan Dawes <rogan@dawes.za.net>
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 8b4ad10cf940..69e0a2acfcb0 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1292,6 +1292,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x2001, 0x7e16, 3)},	/* D-Link DWM-221 */
 	{QMI_FIXED_INTF(0x2001, 0x7e19, 4)},	/* D-Link DWM-221 B1 */
 	{QMI_FIXED_INTF(0x2001, 0x7e35, 4)},	/* D-Link DWM-222 */
+	{QMI_FIXED_INTF(0x2001, 0x7e3d, 4)},	/* D-Link DWM-222 A2 */
 	{QMI_FIXED_INTF(0x2020, 0x2031, 4)},	/* Olicard 600 */
 	{QMI_FIXED_INTF(0x2020, 0x2033, 4)},	/* BroadMobi BM806U */
 	{QMI_FIXED_INTF(0x0f3d, 0x68a2, 8)},    /* Sierra Wireless MC7700 */
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH bpf] bpf: fix narrower loads on s390
From: Ilya Leoshkevich @ 2019-07-17  9:21 UTC (permalink / raw)
  To: Y Song; +Cc: bpf, netdev, gor, heiko.carstens
In-Reply-To: <CAH3MdRU-u1Gn6uj2D=mzXvdC2RDWas3Ec0QXObKsLac1GwuREQ@mail.gmail.com>

> Am 17.07.2019 um 07:11 schrieb Y Song <ys114321@gmail.com>:
> 
> [sorry, resending again as the previous one had some text messed up due to
> networking issues]
> 
> On Tue, Jul 16, 2019 at 10:08 PM Y Song <ys114321@gmail.com> wrote:
>> 
>> On Tue, Jul 16, 2019 at 4:59 AM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>>> 
>>> test_pkt_md_access is failing on s390, since the associated eBPF prog
>>> returns TC_ACT_SHOT, which in turn happens because loading a part of a
>>> struct __sk_buff field produces an incorrect result.
>>> 
>>> The problem is that when verifier emits the code to replace partial load
>>> of a field with a full load, a shift and a bitwise AND, it assumes that
>>> the machine is little endian.
>>> 
>>> Adjust shift count calculation to account for endianness.
>>> 
>>> Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields")
>>> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
>>> ---
>>> kernel/bpf/verifier.c | 8 ++++++--
>>> 1 file changed, 6 insertions(+), 2 deletions(-)
>>> 
>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>> index 5900cbb966b1..3f9353653558 100644
>>> --- a/kernel/bpf/verifier.c
>>> +++ b/kernel/bpf/verifier.c
>>> @@ -8616,8 +8616,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
>>>                }
>>> 
>>>                if (is_narrower_load && size < target_size) {
>>> -                       u8 shift = (off & (size_default - 1)) * 8;
>>> -
>>> +                       u8 load_off = off & (size_default - 1);
>>> +#ifdef __LITTLE_ENDIAN
>>> +                       u8 shift = load_off * 8;
>>> +#else
>>> +                       u8 shift = (size_default - (load_off + size)) * 8;
>>> +#endif
>> 
> All the values are in register. The shifting operations should be the
> same for big endian and little endian, e.g., value 64 >> 2 = 16 when
> value "64" is in register. So I did not see a problem here.
> 
> Could you elaborate which field access in test_pkt_md_access
> caused problem?

The very first one: TEST_FIELD(__u8,  len, 0xFF);

> It would be good if you can give detailed memory layout and register values
> to illustrate the problem.

Suppose len = 0x11223344. On a big endian system, this would be

11 22 33 44

Now, we would like to do *(u8 *)&len, the desired result is 0x11.
Verifier should emit the following: ((*(u32 *)&len) >> 24) & 0xff, but as
of today it misses the shift.

On a little endian system the layout is:

44 33 22 11

and the desired result is different - 0x44. Verifier correctly emits
(*(u32 *)&len) & 0xff.

> 
>> 
>>>                        if (ctx_field_size <= 4) {
>>>                                if (shift)
>>>                                        insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
>>> --
>>> 2.21.0


^ permalink raw reply

* Re: [PATCH net] be2net: Signal that the device cannot transmit during reconfiguration
From: Firo Yang @ 2019-07-17  8:56 UTC (permalink / raw)
  To: Benjamin Poirier
  Cc: Ajit Khaparde, Sathya Perla, Somnath Kotur, Sriharsha Basavapatna,
	David Miller, Saeed Mahameed, netdev@vger.kernel.org
In-Reply-To: <20190717082340.GA6015@f1>

I don't think this change could fix this problem because if SMP, dev_watchdog() could run on a different CPU.

Thanks,
Firo

^ permalink raw reply

* Re: [PATCH bpf] selftests/bpf: make directory prerequisites order-only
From: Ilya Leoshkevich @ 2019-07-17  9:10 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Daniel Borkmann, bpf, Network Development, Vasily Gorbik,
	Heiko Carstens, Andrii Nakryiko
In-Reply-To: <CAADnVQKzZQ_mbaMHEU6HA-JEy=1jXvBWULg8yKQY_2zwSmU86g@mail.gmail.com>

> Am 16.07.2019 um 19:49 schrieb Alexei Starovoitov <alexei.starovoitov@gmail.com>:
> 
> On Mon, Jul 15, 2019 at 3:22 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> 
>> On 7/12/19 3:56 PM, Ilya Leoshkevich wrote:
>>> When directories are used as prerequisites in Makefiles, they can cause
>>> a lot of unnecessary rebuilds, because a directory is considered changed
>>> whenever a file in this directory is added, removed or modified.
>>> 
>>> If the only thing a target is interested in is the existence of the
>>> directory it depends on, which is the case for selftests/bpf, this
>>> directory should be specified as an order-only prerequisite: it would
>>> still be created in case it does not exist, but it would not trigger a
>>> rebuild of a target in case it's considered changed.
>>> 
>>> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
>> 
>> Applied, thanks!
> 
> Hi Ilya,
> 
> this commit breaks map_tests.
> To reproduce:
> rm map_tests/tests.h
> make
> tests.h will not be regenerated.
> Please provide a fix asap.
> We cannot ship bpf tree with such failure.

Hi Alexei,

Sorry about this! I actually had the following in my local tree:

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index f1f2b82b8fb8..95795cf5805c 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -231,7 +231,7 @@ ifeq ($(DWARF2BTF),y)
 endif

 PROG_TESTS_H := $(OUTPUT)/prog_tests/tests.h
-test_progs.c: $(PROG_TESTS_H)
+$(OUTPUT)/test_progs: $(PROG_TESTS_H)
 $(OUTPUT)/test_progs: CFLAGS += $(TEST_PROGS_CFLAGS)
 $(OUTPUT)/test_progs: prog_tests/*.c

@@ -258,7 +258,7 @@ MAP_TESTS_DIR = $(OUTPUT)/map_tests
 $(MAP_TESTS_DIR):
 <------>mkdir -p $@
 MAP_TESTS_H := $(MAP_TESTS_DIR)/tests.h
-test_maps.c: $(MAP_TESTS_H)
+$(OUTPUT)/test_maps: $(MAP_TESTS_H)
 $(OUTPUT)/test_maps: CFLAGS += $(TEST_MAPS_CFLAGS)
 MAP_TESTS_FILES := $(wildcard map_tests/*.c)
 $(MAP_TESTS_H): $(MAP_TESTS_FILES) | $(MAP_TESTS_DIR)
@@ -275,7 +275,7 @@ $(MAP_TESTS_H): $(MAP_TESTS_FILES) | $(MAP_TESTS_DIR)
 <------><------> ) > $(MAP_TESTS_H))

 VERIFIER_TESTS_H := $(OUTPUT)/verifier/tests.h
-test_verifier.c: $(VERIFIER_TESTS_H)
+$(OUTPUT)/test_verifier: $(VERIFIER_TESTS_H)
 $(OUTPUT)/test_verifier: CFLAGS += $(TEST_VERIFIER_CFLAGS)

 VERIFIER_TESTS_DIR = $(OUTPUT)/verifier

but did not realise that this is a pre-requisite for my directories change.
I should have tested it separately, then I would have noticed.

Andrii,
Thanks for helping out and providing the fix!

Best regards,
Ilya

^ permalink raw reply related

* Re: OOM triggered by SCTP
From: Marek Majkowski @ 2019-07-17  9:08 UTC (permalink / raw)
  To: malc; +Cc: vyasevich, nhorman, marcelo.leitner, Linux SCTP, netdev,
	kernel-team
In-Reply-To: <CAPkQJpRJadEqxOcdb_U5Tz6NPE3h3FzootQt3r2GgPP0aYsVvA@mail.gmail.com>

Malc, thanks for taking a look.

I'm able to trigger the problem on non-SMP virtme with 4GiB ram, but
I'm not able to trigger it on my SMP host with 16GiB.

The slab info from dmesg (on 4GiB run):
Unreclaimable slab info:
SCTPv6                 31068KB      31068KB
sctp_chunk             24321KB      24990KB
sctp_bind_bucket         972KB        972KB
skbuff_head_cache      28484KB      29051KB
kmalloc-8k                82KB        148KB
kmalloc-4k             81897KB      82943KB
kmalloc-2k               314KB        382KB
kmalloc-1k             27446KB      29547KB
kmalloc-512            30312KB      30915KB

The biggest issue is that the OOM is often unrecoverable:

---[ end Kernel panic - not syncing: System is deadlocked on memory ]---
Out of memory and no killable processes...
Kernel panic - not syncing: System is deadlocked on memory

I noticed the sctp_mem toggle. Would tweaking it change anything?
net.sctp.sctp_mem = 80976    107969    161952
net.sctp.sctp_rmem = 4096    865500    3455008
net.sctp.sctp_wmem = 4096    16384    3455008

For the record, stuffing "shutdown(sd, SHUT_RDWR)" before the "close"
doesn't solve the problem.

Marek

On Wed, Jul 17, 2019 at 1:59 AM malc <mlashley@gmail.com> wrote:
>
> On Tue, Jul 16, 2019 at 10:49 PM Marek Majkowski <marek@cloudflare.com> wrote:
> >
> > Morning,
> >
> > My poor man's fuzzer found something interesting in SCTP. It seems
> > like creating a large number of SCTP sockets + some magic dance, upsets
> > a memory subsystem related to SCTP. The sequence:
> >
> >  - create SCTP socket
> >  - call setsockopts (SCTP_EVENTS)
> >  - call bind(::1, port)
> >  - call sendmsg(long buffer, MSG_CONFIRM, ::1, port)
> >  - close SCTP socket
> >  - repeat couple thousand times
> >
> > Full code:
> > https://gist.github.com/majek/bd083dae769804d39134ce01f4f802bb#file-test_sctp-c
> >
> > I'm running it on virtme the simplest way:
> > $ virtme-run --show-boot-console --rw --pwd --kimg bzImage --memory
> > 512M --script-sh ./test_sctp
> >
> > Originally I was running it inside net namespace, and just having a
> > localhost interface is sufficient to trigger the problem.
> >
> > Kernel is 5.2.1 (with KASAN and such, but that shouldn't be a factor).
> > In some tests I saw a message that might indicate something funny
> > hitting neighbor table:
> >
> > neighbour: ndisc_cache: neighbor table overflow!
> >
> > I'm not addr-decoding the stack trace, since it seems unrelated to the
> > root cause.
> >
> > Cheers,
> >     Marek
>
> I _think_ this is an 'expected' peculiarity of SCTP on loopback - your
> test_sctp.c ends up creating actual associations to itself on the same
> socket (you can test safely by reducing the port range (say
> 30000-32000) and setting the for-loop-clause to 'run < 1')
> You'll see a bunch of associations established like the following
> (note that I(kernel) was dropping packets for this capture - even with
> /only/ 2000 sockets used...)
>
> $ tshark -r sctp.pcap -Y 'sctp.assoc_index==4'
>   21 0.000409127          ::1 → ::1           SCTP INIT
>   22 0.000436281          ::1 → ::1           SCTP INIT_ACK
>   23 0.000442106          ::1 → ::1           SCTP COOKIE_ECHO
>   24 0.000463007          ::1 → ::1           SCTP COOKIE_ACK DATA
> (Message Fragment)
>                                               presumably your close()
> happens here and we enter SHUTDOWN-PENDING, where we wait for pending
> data to be acknowledged, I'm not convinced that we shouldn't be
> SACK'ing the data from the 'peer' at this point - but for whatever
> reason, we aren't.
>                                               We then run thru
> path-max-retrans, and finally ABORT (the abort indication also shows
> the PMR-exceeded indication in the 'Cause Information')
>   25 0.000476083          ::1 → ::1           SCTP SACK
> 13619 3.017788109          ::1 → ::1           SCTP DATA (retransmission)
> 14022 3.222690889          ::1 → ::1           SCTP SACK
> 18922 21.938217449          ::1 → ::1           SCTP SACK
> 33476 69.831029904          ::1 → ::1           SCTP HEARTBEAT
> 33561 69.831310796          ::1 → ::1           SCTP HEARTBEAT_ACK
> 40816 94.102667600          ::1 → ::1           SCTP SACK
> 40910 95.942741287          ::1 → ::1           SCTP DATA (retransmission)
> 41039 96.152023010          ::1 → ::1           SCTP SACK
> 41100 100.182685237          ::1 → ::1           SCTP SACK
> 41212 108.230746764          ::1 → ::1           SCTP DATA (retransmission)
> 41345 108.439061392          ::1 → ::1           SCTP SACK
> 41407 116.422688507          ::1 → ::1           SCTP HEARTBEAT
> 41413 116.423183124          ::1 → ::1           SCTP HEARTBEAT_ACK
> 41494 124.823749255          ::1 → ::1           SCTP SACK
> 41576 126.663648718          ::1 → ::1           SCTP ABORT
>
> With your entire 512M - you'd only have about 16KB for each of these
> 31K associations tops, I suspect that having a 64KB pending data chunk
> (fragmented ULP msg) for each association for >= 90s is what is
> exhausting memory here - although I'm sure Neil or Michael will be
> along to correct me ;-)
>
> What's interesting - as you reduce the payload size - we end up
> bundling DATA from the 'initiator' side (in COOKIE ECHO) - and
> everything works as expected... (the SACK here is for the bundled DATA
> chunk's TSN.)
>
> mlashley@duality /tmp $ tshark -r /tmp/sctp_index4_10K.pcap
>    1 0.000000000          ::1 → ::1          SCTP INIT
>    2 0.000014491          ::1 → ::1          SCTP INIT_ACK
>    3 0.000024190          ::1 → ::1          SCTP COOKIE_ECHO DATA
>    4 0.000034833          ::1 → ::1          SCTP COOKIE_ACK
>    5 0.000040646          ::1 → ::1          SCTP SACK
>    6 0.000050287          ::1 → ::1          SCTP ABORT
>
> In short - the SCTP associations /can/ persist after user-space calls
> close() whilst there is outstanding data (for path.max.retrans *
> rto-with-doubling[due to T3-rtx expiry])
>
> (My tests on 5.2.0 as it is what I had to hand...)
>
> Cheers,
> malc.

^ permalink raw reply

* Re: [PATCH 1/2] net/macb: bindings doc: add sifive fu540-c000 binding
From: Yash Shah @ 2019-07-17  9:07 UTC (permalink / raw)
  To: Nicolas Ferre
  Cc: Rob Herring, David Miller, netdev,
	linux-kernel@vger.kernel.org List, linux-riscv, devicetree,
	Mark Rutland, Palmer Dabbelt, Albert Ou, Petr Štetiar,
	Paul Walmsley, Sachin Ghadi
In-Reply-To: <b0c60ec9-2f57-c3f5-c3b4-ee83a5ec4c45@microchip.com>

On Mon, Jun 24, 2019 at 9:08 PM <Nicolas.Ferre@microchip.com> wrote:
>
> On 23/05/2019 at 22:50, Rob Herring wrote:
> > On Thu, May 23, 2019 at 6:46 AM Yash Shah <yash.shah@sifive.com> wrote:
> >>
> >> Add the compatibility string documentation for SiFive FU540-C000
> >> interface.
> >> On the FU540, this driver also needs to read and write registers in a
> >> management IP block that monitors or drives boundary signals for the
> >> GEMGXL IP block that are not directly mapped to GEMGXL registers.
> >> Therefore, add additional range to "reg" property for SiFive GEMGXL
> >> management IP registers.
> >>
> >> Signed-off-by: Yash Shah <yash.shah@sifive.com>
> >> ---
> >>   Documentation/devicetree/bindings/net/macb.txt | 3 +++
> >>   1 file changed, 3 insertions(+)
> >>
> >> diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt
> >> index 9c5e944..91a2a66 100644
> >> --- a/Documentation/devicetree/bindings/net/macb.txt
> >> +++ b/Documentation/devicetree/bindings/net/macb.txt
> >> @@ -4,6 +4,7 @@ Required properties:
> >>   - compatible: Should be "cdns,[<chip>-]{macb|gem}"
> >>     Use "cdns,at91rm9200-emac" Atmel at91rm9200 SoC.
> >>     Use "cdns,at91sam9260-macb" for Atmel at91sam9 SoCs.
> >> +  Use "cdns,fu540-macb" for SiFive FU540-C000 SoC.
> >
> > This pattern that Atmel started isn't really correct. The vendor
> > prefix here should be sifive. 'cdns' would be appropriate for a
> > fallback.
>
> Ok, we missed this for the sam9x60 SoC that we added recently then.
>
> Anyway a little too late, coming back to this machine, and talking to
> Yash, isn't "sifive,fu540-c000-macb" more specific and a better match
> for being future proof? I would advise going for the most specific possible
> with other compatible strings on the same line in the DT, like:
>
> "sifive,fu540-c000-macb", "sifive,fu540-macb"
>

Yes, I agree that "sifive,fu540-c000-macb" is a better match.

> Moreover, is it really a "macb" or a "gem" type of interface from
> Cadence? Not a big deal, but just to discuss the topic to the bone...

I believe it should be "gem". I will plan to submit the patch for
these changes. Thanks for pointing it out.

- Yash

>
> Note that I'm fine if you consider that what you have in net-next now is
> correct.
>
> Regards,
>    Nicolas
>
> >>     Use "cdns,sam9x60-macb" for Microchip sam9x60 SoC.
> >>     Use "cdns,np4-macb" for NP4 SoC devices.
> >>     Use "cdns,at32ap7000-macb" for other 10/100 usage or use the generic form: "cdns,macb".
> >> @@ -17,6 +18,8 @@ Required properties:
> >>     Use "cdns,zynqmp-gem" for Zynq Ultrascale+ MPSoC.
> >>     Or the generic form: "cdns,emac".
> >>   - reg: Address and length of the register set for the device
> >> +       For "cdns,fu540-macb", second range is required to specify the
> >> +       address and length of the registers for GEMGXL Management block.
> >>   - interrupts: Should contain macb interrupt
> >>   - phy-mode: See ethernet.txt file in the same directory.
> >>   - clock-names: Tuple listing input clock names.
> >> --
> >> 1.9.1
> >>
> >
>
>
> --
> Nicolas Ferre

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox