* [RFC PATCH V2 5/8] vhost: vhost_put_user() can accept metadata type
From: Jason Wang @ 2018-03-26 3:38 UTC (permalink / raw)
To: mst, virtualization; +Cc: netdev, linux-kernel, kvm
In-Reply-To: <1522035533-11786-1-git-send-email-jasowang@redhat.com>
We assumes used ring update is the only user for vhost_put_user() in
the past. This may not be the case for the incoming packed ring which
may update the descriptor ring for used. So introduce a new type
parameter.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/vhost.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 65954d6..dcac4d4 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -847,7 +847,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
return __vhost_get_user_slow(vq, addr, size, type);
}
-#define vhost_put_user(vq, x, ptr) \
+#define vhost_put_user(vq, x, ptr, type) \
({ \
int ret = -EFAULT; \
if (!vq->iotlb) { \
@@ -855,7 +855,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
} else { \
__typeof__(ptr) to = \
(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
- sizeof(*ptr), VHOST_ADDR_USED); \
+ sizeof(*ptr), type); \
if (to != NULL) \
ret = __put_user(x, to); \
else \
@@ -1716,7 +1716,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
{
void __user *used;
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
- &vq->used->flags) < 0)
+ &vq->used->flags, VHOST_ADDR_USED) < 0)
return -EFAULT;
if (unlikely(vq->log_used)) {
/* Make sure the flag is seen before log. */
@@ -1735,7 +1735,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
{
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
- vhost_avail_event(vq)))
+ vhost_avail_event(vq), VHOST_ADDR_USED))
return -EFAULT;
if (unlikely(vq->log_used)) {
void __user *used;
@@ -2218,12 +2218,12 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
used = vq->used->ring + start;
for (i = 0; i < count; i++) {
if (unlikely(vhost_put_user(vq, heads[i].elem.id,
- &used[i].id))) {
+ &used[i].id, VHOST_ADDR_USED))) {
vq_err(vq, "Failed to write used id");
return -EFAULT;
}
if (unlikely(vhost_put_user(vq, heads[i].elem.len,
- &used[i].len))) {
+ &used[i].len, VHOST_ADDR_USED))) {
vq_err(vq, "Failed to write used len");
return -EFAULT;
}
@@ -2269,7 +2269,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
/* Make sure buffer is written before we update index. */
smp_wmb();
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
- &vq->used->idx)) {
+ &vq->used->idx, VHOST_ADDR_USED)) {
vq_err(vq, "Failed to increment used idx");
return -EFAULT;
}
--
2.7.4
^ permalink raw reply related
* [RFC PATCH V2 6/8] virtio: introduce packed ring defines
From: Jason Wang @ 2018-03-26 3:38 UTC (permalink / raw)
To: mst, virtualization; +Cc: netdev, linux-kernel, kvm
In-Reply-To: <1522035533-11786-1-git-send-email-jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
include/uapi/linux/virtio_config.h | 9 +++++++++
include/uapi/linux/virtio_ring.h | 13 +++++++++++++
2 files changed, 22 insertions(+)
diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 308e209..5903d51 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -71,4 +71,13 @@
* this is for compatibility with legacy systems.
*/
#define VIRTIO_F_IOMMU_PLATFORM 33
+
+#define VIRTIO_F_RING_PACKED 34
+
+/*
+ * This feature indicates that all buffers are used by the device in
+ * the same order in which they have been made available.
+ */
+#define VIRTIO_F_IN_ORDER 35
+
#endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5fa..e297580 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -43,6 +43,8 @@
#define VRING_DESC_F_WRITE 2
/* This means the buffer contains a list of buffer descriptors. */
#define VRING_DESC_F_INDIRECT 4
+#define VRING_DESC_F_AVAIL 7
+#define VRING_DESC_F_USED 15
/* The Host uses this in used->flags to advise the Guest: don't kick me when
* you add a buffer. It's unreliable, so it's simply an optimization. Guest
@@ -62,6 +64,17 @@
* at the end of the used ring. Guest should ignore the used->flags field. */
#define VIRTIO_RING_F_EVENT_IDX 29
+struct vring_desc_packed {
+ /* Buffer Address. */
+ __virtio64 addr;
+ /* Buffer Length. */
+ __virtio32 len;
+ /* Buffer ID. */
+ __virtio16 id;
+ /* The flags depending on descriptor type. */
+ __virtio16 flags;
+};
+
/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
struct vring_desc {
/* Address (guest-physical). */
--
2.7.4
^ permalink raw reply related
* [RFC PATCH V2 7/8] vhost: packed ring support
From: Jason Wang @ 2018-03-26 3:38 UTC (permalink / raw)
To: mst, virtualization; +Cc: netdev, linux-kernel, kvm
In-Reply-To: <1522035533-11786-1-git-send-email-jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 5 +-
drivers/vhost/vhost.c | 530 ++++++++++++++++++++++++++++++++++++++++++++++----
drivers/vhost/vhost.h | 7 +-
3 files changed, 505 insertions(+), 37 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7be8b55..84905d5 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -67,7 +67,8 @@ enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
(1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
(1ULL << VIRTIO_NET_F_MRG_RXBUF) |
- (1ULL << VIRTIO_F_IOMMU_PLATFORM)
+ (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+ (1ULL << VIRTIO_F_RING_PACKED)
};
enum {
@@ -706,6 +707,8 @@ static void handle_rx(struct vhost_net *net)
vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
vq->log : NULL;
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
+ /* FIXME: workaround for current dpdk prototype */
+ mergeable = false;
while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
sock_len += sock_hlen;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index dcac4d4..6177e4d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -324,6 +324,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);
vq->busyloop_timeout = 0;
+ vq->used_wrap_counter = true;
vq->umem = NULL;
vq->iotlb = NULL;
__vhost_vq_meta_reset(vq);
@@ -1136,10 +1137,22 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
return 0;
}
-static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
- struct vring_desc __user *desc,
- struct vring_avail __user *avail,
- struct vring_used __user *used)
+static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
+ struct vring_desc __user *desc,
+ struct vring_avail __user *avail,
+ struct vring_used __user *used)
+{
+ struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
+
+ /* FIXME: check device area and driver area */
+ return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
+ access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
+}
+
+static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
+ struct vring_desc __user *desc,
+ struct vring_avail __user *avail,
+ struct vring_used __user *used)
{
size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
@@ -1151,6 +1164,17 @@ static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
sizeof *used + num * sizeof *used->ring + s);
}
+static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
+ struct vring_desc __user *desc,
+ struct vring_avail __user *avail,
+ struct vring_used __user *used)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vq_access_ok_packed(vq, num, desc, avail, used);
+ else
+ return vq_access_ok_split(vq, num, desc, avail, used);
+}
+
static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
const struct vhost_umem_node *node,
int type)
@@ -1763,6 +1787,9 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq)
vhost_init_is_le(vq);
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return 0;
+
r = vhost_update_used_flags(vq);
if (r)
goto err;
@@ -1836,7 +1863,8 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
/* Each buffer in the virtqueues is actually a chain of descriptors. This
* function returns the next descriptor in the chain,
* or -1U if we're at the end. */
-static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
+static unsigned next_desc_split(struct vhost_virtqueue *vq,
+ struct vring_desc *desc)
{
unsigned int next;
@@ -1849,11 +1877,17 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
return next;
}
-static int get_indirect(struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num,
- struct vring_desc *indirect)
+static unsigned next_desc_packed(struct vhost_virtqueue *vq,
+ struct vring_desc_packed *desc)
+{
+ return desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT);
+}
+
+static int get_indirect_split(struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ struct vring_desc *indirect)
{
struct vring_desc desc;
unsigned int i = 0, count, found = 0;
@@ -1943,23 +1977,274 @@ static int get_indirect(struct vhost_virtqueue *vq,
}
*out_num += ret;
}
- } while ((i = next_desc(vq, &desc)) != -1);
+ } while ((i = next_desc_split(vq, &desc)) != -1);
return 0;
}
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found. A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
- struct vhost_used_elem *used,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
+static int get_indirect_packed(struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ struct vring_desc_packed *indirect)
+{
+ struct vring_desc_packed desc;
+ unsigned int i = 0, count, found = 0;
+ u32 len = vhost32_to_cpu(vq, indirect->len);
+ struct iov_iter from;
+ int ret, access;
+
+ /* Sanity check */
+ if (unlikely(len % sizeof(desc))) {
+ vq_err(vq, "Invalid length in indirect descriptor: "
+ "len 0x%llx not multiple of 0x%zx\n",
+ (unsigned long long)len,
+ sizeof desc);
+ return -EINVAL;
+ }
+
+ ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr),
+ len, vq->indirect,
+ UIO_MAXIOV, VHOST_ACCESS_RO);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d in indirect.\n",
+ ret);
+ return ret;
+ }
+ iov_iter_init(&from, READ, vq->indirect, ret, len);
+
+ /* We will use the result as an address to read from, so most
+ * architectures only need a compiler barrier here. */
+ read_barrier_depends();
+
+ count = len / sizeof desc;
+ /* Buffers are chained via a 16 bit next field, so
+ * we can have at most 2^16 of these. */
+ if (unlikely(count > USHRT_MAX + 1)) {
+ vq_err(vq, "Indirect buffer length too big: %d\n",
+ indirect->len);
+ return -E2BIG;
+ }
+
+ do {
+ unsigned iov_count = *in_num + *out_num;
+ if (unlikely(++found > count)) {
+ vq_err(vq, "Loop detected: last one at %u "
+ "indirect size %u\n",
+ i, count);
+ return -EINVAL;
+ }
+ if (unlikely(!copy_from_iter_full(&desc, sizeof(desc),
+ &from))) {
+ vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
+ i, (size_t)vhost64_to_cpu(vq, indirect->addr)
+ + i * sizeof desc);
+ return -EINVAL;
+ }
+ if (unlikely(desc.flags &
+ cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
+ vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
+ i, (size_t)vhost64_to_cpu(vq, indirect->addr)
+ + i * sizeof desc);
+ return -EINVAL;
+ }
+
+ if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
+ access = VHOST_ACCESS_WO;
+ else
+ access = VHOST_ACCESS_RO;
+
+ ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
+ vhost32_to_cpu(vq, desc.len),
+ iov + iov_count,
+ iov_size - iov_count, access);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d "
+ "indirect idx %d\n",
+ ret, i);
+ return ret;
+ }
+ /* If this is an input descriptor, increment that count. */
+ if (access == VHOST_ACCESS_WO) {
+ *in_num += ret;
+ if (unlikely(log)) {
+ log[*log_num].addr =
+ vhost64_to_cpu(vq, desc.addr);
+ log[*log_num].len =
+ vhost32_to_cpu(vq, desc.len);
+ ++*log_num;
+ }
+ } else {
+ /* If it's an output descriptor, they're all supposed
+ * to come before any input descriptors. */
+ if (unlikely(*in_num)) {
+ vq_err(vq, "Indirect descriptor "
+ "has out after in: idx %d\n", i);
+ return -EINVAL;
+ }
+ *out_num += ret;
+ }
+ } while ((i = next_desc_packed(vq, &desc)) != -1);
+ return 0;
+}
+
+#define DESC_AVAIL (1 << VRING_DESC_F_AVAIL)
+#define DESC_USED (1 << VRING_DESC_F_USED)
+static bool desc_is_avail(struct vhost_virtqueue *vq, __virtio16 flags)
+{
+ if (vq->used_wrap_counter)
+ if ((flags & cpu_to_vhost16(vq, DESC_AVAIL)) &&
+ !(flags & cpu_to_vhost16(vq, DESC_USED)))
+ return true;
+ if (vq->used_wrap_counter == false)
+ if (!(flags & cpu_to_vhost16(vq, DESC_AVAIL)) &&
+ (flags & cpu_to_vhost16(vq, DESC_USED)))
+ return true;
+
+ return false;
+}
+
+static __virtio16 get_desc_flags(struct vhost_virtqueue *vq)
+{
+ __virtio16 flags = 0;
+
+ if (vq->used_wrap_counter) {
+ flags |= cpu_to_vhost16(vq, DESC_AVAIL);
+ flags |= cpu_to_vhost16(vq, DESC_USED);
+ } else {
+ flags &= ~cpu_to_vhost16(vq, DESC_AVAIL);
+ flags &= ~cpu_to_vhost16(vq, DESC_USED);
+ }
+
+ return flags;
+}
+
+static int vhost_get_vq_desc_packed(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *used,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log,
+ unsigned int *log_num)
+{
+ struct vring_desc_packed desc;
+ int ret, access, i;
+
+ /* When we start there are none of either input nor output. */
+ *out_num = *in_num = 0;
+ if (unlikely(log))
+ *log_num = 0;
+
+ used->count = 0;
+
+ do {
+ struct vring_desc_packed *d = vq->desc_packed +
+ vq->last_avail_idx;
+ unsigned int iov_count = *in_num + *out_num;
+
+ ret = vhost_get_user(vq, desc.flags, &d->flags,
+ VHOST_ADDR_DESC);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to get flags: idx %d addr %p\n",
+ vq->last_avail_idx, &d->flags);
+ return -EFAULT;
+ }
+
+ if (!desc_is_avail(vq, desc.flags)) {
+ /* If there's nothing new since last we looked, return
+ * invalid.
+ */
+ if (!used->count)
+ return -ENOSPC;
+ break;
+ }
+
+ /* Read desc content after we're sure it was available. */
+ smp_rmb();
+
+ ret = vhost_copy_from_user(vq, &desc, d, sizeof(desc));
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
+ vq->last_avail_idx, d);
+ return -EFAULT;
+ }
+
+ if (used->count == 0)
+ used->elem.id = desc.id;
+
+ if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
+ ret = get_indirect_packed(vq, iov, iov_size,
+ out_num, in_num, log,
+ log_num, &desc);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Failure detected "
+ "in indirect descriptor "
+ "at idx %d\n", i);
+ return ret;
+ }
+ goto next;
+ }
+
+ if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
+ access = VHOST_ACCESS_WO;
+ else
+ access = VHOST_ACCESS_RO;
+ ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
+ vhost32_to_cpu(vq, desc.len),
+ iov + iov_count, iov_size - iov_count,
+ access);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d idx %d\n",
+ ret, i);
+ return ret;
+ }
+
+ if (access == VHOST_ACCESS_WO) {
+ /* If this is an input descriptor,
+ * increment that count.
+ */
+ *in_num += ret;
+ if (unlikely(log)) {
+ log[*log_num].addr =
+ vhost64_to_cpu(vq, desc.addr);
+ log[*log_num].len =
+ vhost32_to_cpu(vq, desc.len);
+ ++*log_num;
+ }
+ } else {
+ /* If it's an output descriptor, they're all supposed
+ * to come before any input descriptors.
+ */
+ if (unlikely(*in_num)) {
+ vq_err(vq, "Desc out after in: idx %d\n",
+ i);
+ return -EINVAL;
+ }
+ *out_num += ret;
+ }
+
+next:
+ if (unlikely(++used->count > vq->num)) {
+ vq_err(vq, "Loop detected: last one at %u "
+ "vq size %u head %u\n",
+ i, vq->num, used->elem.id);
+ return -EINVAL;
+ }
+ if (++vq->last_avail_idx >= vq->num)
+ vq->last_avail_idx = 0;
+ /* If this descriptor says it doesn't chain, we're done. */
+ } while (next_desc_packed(vq, &desc));
+
+ return 0;
+}
+
+static int vhost_get_vq_desc_split(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *used,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
{
struct vring_desc desc;
unsigned int i, head, found = 0;
@@ -2044,9 +2329,9 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
return -EFAULT;
}
if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
- ret = get_indirect(vq, iov, iov_size,
- out_num, in_num,
- log, log_num, &desc);
+ ret = get_indirect_split(vq, iov, iov_size,
+ out_num, in_num,
+ log, log_num, &desc);
if (unlikely(ret < 0)) {
if (ret != -EAGAIN)
vq_err(vq, "Failure detected "
@@ -2088,7 +2373,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
}
*out_num += ret;
}
- } while ((i = next_desc(vq, &desc)) != -1);
+ } while ((i = next_desc_split(vq, &desc)) != -1);
/* On success, increment avail index. */
vq->last_avail_idx++;
@@ -2098,6 +2383,31 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
return 0;
}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found. A negative code is
+ * returned on error.
+ */
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *used,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vhost_get_vq_desc_packed(vq, used, iov, iov_size,
+ out_num, in_num,
+ log, log_num);
+ else
+ return vhost_get_vq_desc_split(vq, used, iov, iov_size,
+ out_num, in_num,
+ log, log_num);
+}
EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
void vhost_set_used_len(struct vhost_virtqueue *vq,
@@ -2248,10 +2558,69 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
return 0;
}
+static int vhost_add_used_n_packed(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *heads,
+ unsigned int count)
+{
+ struct vring_desc_packed __user *desc;
+ int i, ret;
+
+ for (i = 0; i < count; i++) {
+ desc = vq->desc_packed + vq->last_used_idx;
+
+ ret = vhost_put_user(vq, heads[i].elem.id, &desc->id,
+ VHOST_ADDR_DESC);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to update id: idx %d addr %p\n",
+ vq->last_used_idx, desc);
+ return -EFAULT;
+ }
+ ret = vhost_put_user(vq, heads[i].elem.len, &desc->len,
+ VHOST_ADDR_DESC);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to update len: idx %d addr %p\n",
+ vq->last_used_idx, desc);
+ return -EFAULT;
+ }
+
+ /* Update flags after descriptor id and len is wrote,
+ * TODO: Update head flags at last for saving barriers */
+ smp_wmb();
+
+ ret = vhost_put_user(vq, get_desc_flags(vq), &desc->flags,
+ VHOST_ADDR_DESC);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to update flags: idx %d addr %p\n",
+ vq->last_used_idx, desc);
+ return -EFAULT;
+ }
+
+ if (unlikely(vq->log_used)) {
+ /* Make sure desc is written before update log. */
+ smp_wmb();
+ log_write(vq->log_base, vq->log_addr +
+ vq->last_used_idx * sizeof(*desc),
+ sizeof(*desc));
+ if (vq->log_ctx)
+ eventfd_signal(vq->log_ctx, 1);
+ }
+
+ vq->last_used_idx += heads[i].count;
+ if (vq->last_used_idx >= vq->num) {
+ vq->used_wrap_counter ^= 1;
+ vq->last_used_idx -= vq->num;
+ }
+ }
+
+ return 0;
+}
+
/* After we've used one of their buffers, we tell them about it. We'll then
* want to notify the guest, using eventfd. */
-int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
- unsigned count)
+static int vhost_add_used_n_split(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *heads,
+ unsigned count)
+
{
int start, n, r;
@@ -2283,6 +2652,19 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
}
return r;
}
+
+/* After we've used one of their buffers, we tell them about it. We'll then
+ * want to notify the guest, using eventfd.
+ */
+int vhost_add_used_n(struct vhost_virtqueue *vq,
+ struct vhost_used_elem *heads,
+ unsigned int count)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vhost_add_used_n_packed(vq, heads, count);
+ else
+ return vhost_add_used_n_split(vq, heads, count);
+}
EXPORT_SYMBOL_GPL(vhost_add_used_n);
static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
@@ -2290,6 +2672,11 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
__u16 old, new;
__virtio16 event;
bool v;
+
+ /* FIXME: check driver area */
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return false;
+
/* Flush out used index updates. This is paired
* with the barrier that the Guest executes when enabling
* interrupts. */
@@ -2352,7 +2739,8 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev,
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
/* return true if we're sure that avaiable ring is empty */
-bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_vq_avail_empty_split(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
{
__virtio16 avail_idx;
int r;
@@ -2367,10 +2755,58 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
return vq->avail_idx == vq->last_avail_idx;
}
+
+static bool vhost_vq_avail_empty_packed(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
+{
+ struct vring_desc_packed *d = vq->desc_packed + vq->last_avail_idx;
+ __virtio16 flags;
+ int ret;
+
+ ret = vhost_get_user(vq, flags, &d->flags, VHOST_ADDR_DESC);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to get flags: idx %d addr %p\n",
+ vq->last_avail_idx, d);
+ return -EFAULT;
+ }
+
+ return desc_is_avail(vq, flags);
+}
+
+bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vhost_vq_avail_empty_packed(dev, vq);
+ else
+ return vhost_vq_avail_empty_split(dev, vq);
+}
EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
-/* OK, now we need to know about added descriptors. */
-bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_enable_notify_packed(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
+{
+ struct vring_desc_packed *d = vq->desc_packed + vq->last_avail_idx;
+ __virtio16 flags;
+ int ret;
+
+ /* FIXME: disable notification through device area */
+
+ /* They could have slipped one in as we were doing that: make
+ * sure it's written, then check again. */
+ smp_mb();
+
+ ret = vhost_get_user(vq, flags, &d->flags, VHOST_ADDR_DESC);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
+ vq->last_avail_idx, &d->flags);
+ return -EFAULT;
+ }
+
+ return desc_is_avail(vq, flags);
+}
+
+static bool vhost_enable_notify_split(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
{
__virtio16 avail_idx;
int r;
@@ -2405,10 +2841,25 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx;
}
+
+/* OK, now we need to know about added descriptors. */
+bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vhost_enable_notify_packed(dev, vq);
+ else
+ return vhost_enable_notify_split(dev, vq);
+}
EXPORT_SYMBOL_GPL(vhost_enable_notify);
-/* We don't need to be notified again. */
-void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static void vhost_disable_notify_packed(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
+{
+ /* FIXME: disable notification through device area */
+}
+
+static void vhost_disable_notify_split(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
{
int r;
@@ -2422,6 +2873,15 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
&vq->used->flags, r);
}
}
+
+/* We don't need to be notified again. */
+void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vhost_disable_notify_packed(dev, vq);
+ else
+ return vhost_disable_notify_split(dev, vq);
+}
EXPORT_SYMBOL_GPL(vhost_disable_notify);
/* Create a new message. */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d57c875..8a9df4f 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -36,6 +36,7 @@ struct vhost_poll {
struct vhost_used_elem {
struct vring_used_elem elem;
+ int count;
};
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
@@ -91,7 +92,10 @@ struct vhost_virtqueue {
/* The actual ring of buffers. */
struct mutex mutex;
unsigned int num;
- struct vring_desc __user *desc;
+ union {
+ struct vring_desc __user *desc;
+ struct vring_desc_packed __user *desc_packed;
+ };
struct vring_avail __user *avail;
struct vring_used __user *used;
const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
@@ -148,6 +152,7 @@ struct vhost_virtqueue {
bool user_be;
#endif
u32 busyloop_timeout;
+ bool used_wrap_counter;
};
struct vhost_msg_node {
--
2.7.4
^ permalink raw reply related
* [RFC PATCH V2 8/8] vhost: event suppression for packed ring
From: Jason Wang @ 2018-03-26 3:38 UTC (permalink / raw)
To: mst, virtualization; +Cc: netdev, linux-kernel, kvm
In-Reply-To: <1522035533-11786-1-git-send-email-jasowang@redhat.com>
This patch introduces basic support for event suppression aka driver
and device area. Compile tested only.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/vhost.c | 169 ++++++++++++++++++++++++++++++++++++---
drivers/vhost/vhost.h | 10 ++-
include/uapi/linux/virtio_ring.h | 19 +++++
3 files changed, 183 insertions(+), 15 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6177e4d..ff83a2e 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1143,10 +1143,15 @@ static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
struct vring_used __user *used)
{
struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
+ struct vring_packed_desc_event *driver_event =
+ (struct vring_packed_desc_event *)avail;
+ struct vring_packed_desc_event *device_event =
+ (struct vring_packed_desc_event *)used;
- /* FIXME: check device area and driver area */
return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
- access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
+ access_ok(VERIFY_WRITE, packed, num * sizeof(*packed)) &&
+ access_ok(VERIFY_READ, driver_event, sizeof(*driver_event)) &&
+ access_ok(VERIFY_WRITE, device_event, sizeof(*device_event));
}
static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
@@ -1222,14 +1227,27 @@ static int iotlb_access_ok(struct vhost_virtqueue *vq,
return true;
}
-int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+int vq_iotlb_prefetch_packed(struct vhost_virtqueue *vq)
+{
+ int num = vq->num;
+
+ return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+ num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+ iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->desc,
+ num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+ iotlb_access_ok(vq, VHOST_ACCESS_RO,
+ (u64)(uintptr_t)vq->driver_event,
+ sizeof(*vq->driver_event), VHOST_ADDR_AVAIL) &&
+ iotlb_access_ok(vq, VHOST_ACCESS_WO,
+ (u64)(uintptr_t)vq->device_event,
+ sizeof(*vq->device_event), VHOST_ADDR_USED);
+}
+
+int vq_iotlb_prefetch_split(struct vhost_virtqueue *vq)
{
size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
unsigned int num = vq->num;
- if (!vq->iotlb)
- return 1;
-
return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
@@ -1241,6 +1259,17 @@ int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
num * sizeof(*vq->used->ring) + s,
VHOST_ADDR_USED);
}
+
+int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+{
+ if (!vq->iotlb)
+ return 1;
+
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vq_iotlb_prefetch_packed(vq);
+ else
+ return vq_iotlb_prefetch_split(vq);
+}
EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
/* Can we log writes? */
@@ -1756,6 +1785,29 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
return 0;
}
+static int vhost_update_device_flags(struct vhost_virtqueue *vq,
+ __virtio16 device_flags)
+{
+ void __user *flags;
+
+ if (vhost_put_user(vq, cpu_to_vhost16(vq, device_flags),
+ &vq->device_event->desc_event_flags,
+ VHOST_ADDR_DESC) < 0)
+ return -EFAULT;
+ if (unlikely(vq->log_used)) {
+ /* Make sure the flag is seen before log. */
+ smp_wmb();
+ /* Log used flag write. */
+ flags = &vq->device_event->desc_event_flags;
+ log_write(vq->log_base, vq->log_addr +
+ (flags - (void __user *)vq->device_event),
+ sizeof(vq->used->flags));
+ if (vq->log_ctx)
+ eventfd_signal(vq->log_ctx, 1);
+ }
+ return 0;
+}
+
static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
{
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
@@ -2667,16 +2719,13 @@ int vhost_add_used_n(struct vhost_virtqueue *vq,
}
EXPORT_SYMBOL_GPL(vhost_add_used_n);
-static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_notify_split(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
{
__u16 old, new;
__virtio16 event;
bool v;
- /* FIXME: check driver area */
- if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
- return false;
-
/* Flush out used index updates. This is paired
* with the barrier that the Guest executes when enabling
* interrupts. */
@@ -2709,6 +2758,79 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
return vring_need_event(vhost16_to_cpu(vq, event), new, old);
}
+static bool vhost_idx_diff(struct vhost_virtqueue *vq, u16 old, u16 new)
+{
+ if (new > old)
+ return new - old;
+ return (new + vq->num - old);
+}
+
+static bool vhost_vring_packed_need_event(struct vhost_virtqueue *vq,
+ __u16 event_off, __u16 new,
+ __u16 old)
+{
+ return (__u16)(vhost_idx_diff(vq, new, event_off) - 1) <
+ (__u16)vhost_idx_diff(vq, new, old);
+}
+
+static bool vhost_notify_packed(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq)
+{
+ __virtio16 event_off_wrap, event_flags;
+ __u16 old, new;
+ bool v, wrap;
+ int off;
+
+ /* Flush out used descriptors updates. This is paired
+ * with the barrier that the Guest executes when enabling
+ * interrupts.
+ */
+ smp_mb();
+
+ if (vhost_get_avail(vq, event_flags,
+ &vq->driver_event->desc_event_flags) < 0) {
+ vq_err(vq, "Failed to get driver desc_event_flags");
+ return true;
+ }
+
+ if (!(event_flags & cpu_to_vhost16(vq, RING_EVENT_FLAGS_DESC)))
+ return event_flags ==
+ cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);
+
+ /* Read desc event flags before event_off and event_wrap */
+ smp_rmb();
+
+ if (vhost_get_avail(vq, event_off_wrap,
+ &vq->driver_event->desc_event_off_warp) < 0) {
+ vq_err(vq, "Failed to get driver desc_event_off/wrap");
+ return true;
+ }
+
+ off = vhost16_to_cpu(vq, event_off_wrap);
+
+ wrap = off & 0x1;
+ off >>= 1;
+
+ old = vq->signalled_used;
+ v = vq->signalled_used_valid;
+ new = vq->signalled_used = vq->last_used_idx;
+ vq->signalled_used_valid = true;
+
+ if (unlikely(!v))
+ return true;
+
+ return vhost_vring_packed_need_event(vq, new, old, off) &&
+ wrap == vq->used_wrap_counter;
+}
+
+static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+ return vhost_notify_packed(dev, vq);
+ else
+ return vhost_notify_split(dev, vq);
+}
+
/* This actually signals the guest, using eventfd. */
void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
@@ -2789,7 +2911,17 @@ static bool vhost_enable_notify_packed(struct vhost_dev *dev,
__virtio16 flags;
int ret;
- /* FIXME: disable notification through device area */
+ if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
+ return false;
+ vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
+
+ flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);
+ ret = vhost_update_device_flags(vq, flags);
+ if (ret) {
+ vq_err(vq, "Failed to enable notification at %p: %d\n",
+ &vq->device_event->desc_event_flags, ret);
+ return false;
+ }
/* They could have slipped one in as we were doing that: make
* sure it's written, then check again. */
@@ -2855,7 +2987,18 @@ EXPORT_SYMBOL_GPL(vhost_enable_notify);
static void vhost_disable_notify_packed(struct vhost_dev *dev,
struct vhost_virtqueue *vq)
{
- /* FIXME: disable notification through device area */
+ __virtio16 flags;
+ int r;
+
+ if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
+ return;
+ vq->used_flags |= VRING_USED_F_NO_NOTIFY;
+
+ flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE);
+ r = vhost_update_device_flags(vq, flags);
+ if (r)
+ vq_err(vq, "Failed to enable notification at %p: %d\n",
+ &vq->device_event->desc_event_flags, r);
}
static void vhost_disable_notify_split(struct vhost_dev *dev,
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8a9df4f..02d7a36 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -96,8 +96,14 @@ struct vhost_virtqueue {
struct vring_desc __user *desc;
struct vring_desc_packed __user *desc_packed;
};
- struct vring_avail __user *avail;
- struct vring_used __user *used;
+ union {
+ struct vring_avail __user *avail;
+ struct vring_packed_desc_event __user *driver_event;
+ };
+ union {
+ struct vring_used __user *used;
+ struct vring_packed_desc_event __user *device_event;
+ };
const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
struct file *kick;
struct eventfd_ctx *call_ctx;
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index e297580..7cdbf06 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -75,6 +75,25 @@ struct vring_desc_packed {
__virtio16 flags;
};
+/* Enable events */
+#define RING_EVENT_FLAGS_ENABLE 0x0
+/* Disable events */
+#define RING_EVENT_FLAGS_DISABLE 0x1
+/*
+ * Enable events for a specific descriptor
+ * (as specified by Descriptor Ring Change Event Offset/Wrap Counter).
+ * Only valid if VIRTIO_F_RING_EVENT_IDX has been negotiated.
+ */
+#define RING_EVENT_FLAGS_DESC 0x2
+/* The value 0x3 is reserved */
+
+struct vring_packed_desc_event {
+ /* Descriptor Ring Change Event Offset and Wrap Counter */
+ __virtio16 desc_event_off_warp;
+ /* Descriptor Ring Change Event Flags */
+ __virtio16 desc_event_flags;
+};
+
/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
struct vring_desc {
/* Address (guest-physical). */
--
2.7.4
^ permalink raw reply related
* Re: [RFC PATCH V2 0/8] Packed ring for vhost
From: Jason Wang @ 2018-03-26 3:44 UTC (permalink / raw)
To: mst, virtualization; +Cc: kvm, netdev, linux-kernel, Wei Xu
In-Reply-To: <1522035533-11786-1-git-send-email-jasowang@redhat.com>
cc Jens, Tiwei and Wei
Thanks
On 2018年03月26日 11:38, Jason Wang wrote:
> Hi all:
>
> This RFC implement packed ring layout. The code were tested with pmd
> implement by Jens at
> http://dpdk.org/ml/archives/dev/2018-January/089417.html. Minor change
> was needed for pmd codes to kick virtqueue since it assumes a busy
> polling backend.
>
> Test were done between localhost and guest. Testpmd (rxonly) in guest
> reports 2.4Mpps. Testpmd (txonly) repots about 2.1Mpps.
>
> Notes: The event suppression /indirect descriptor support is complied
> test only because of lacked driver support.
>
> Changes from V1:
>
> - Refactor vhost used elem code to avoid open coding on used elem
> - Event suppression support (compile test only).
> - Indirect descriptor support (compile test only).
> - Zerocopy support.
> - vIOMMU support.
> - SCSI/VSOCK support (compile test only).
> - Fix several bugs
>
> For simplicity, I don't implement batching or other optimizations.
>
> Please review.
>
> Thanks
>
> Jason Wang (8):
> vhost: move get_rx_bufs to vhost.c
> vhost: hide used ring layout from device
> vhost: do not use vring_used_elem
> vhost_net: do not explicitly manipulate vhost_used_elem
> vhost: vhost_put_user() can accept metadata type
> virtio: introduce packed ring defines
> vhost: packed ring support
> vhost: event suppression for packed ring
>
> drivers/vhost/net.c | 138 ++-----
> drivers/vhost/scsi.c | 62 +--
> drivers/vhost/vhost.c | 818 ++++++++++++++++++++++++++++++++++---
> drivers/vhost/vhost.h | 46 ++-
> drivers/vhost/vsock.c | 42 +-
> include/uapi/linux/virtio_config.h | 9 +
> include/uapi/linux/virtio_ring.h | 32 ++
> 7 files changed, 921 insertions(+), 226 deletions(-)
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: possible deadlock in handle_rx
From: Jason Wang @ 2018-03-26 3:48 UTC (permalink / raw)
To: syzbot, kvm, linux-kernel, mst, netdev, syzkaller-bugs,
virtualization
In-Reply-To: <001a113fe6d043b2a605684578f4@google.com>
On 2018年03月26日 08:01, syzbot wrote:
> Hello,
>
> syzbot hit the following crash on upstream commit
> cb6416592bc2a8b731dabcec0d63cda270764fc6 (Sun Mar 25 17:45:10 2018 +0000)
> Merge tag 'dmaengine-fix-4.16-rc7' of
> git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/slave-dma
> syzbot dashboard link:
> https://syzkaller.appspot.com/bug?extid=7f073540b1384a614e09
>
> So far this crash happened 4 times on upstream.
> C reproducer: https://syzkaller.appspot.com/x/repro.c?id=6506789075943424
> syzkaller reproducer:
> https://syzkaller.appspot.com/x/repro.syz?id=5716250550337536
> Raw console output:
> https://syzkaller.appspot.com/x/log.txt?id=5142038655795200
> Kernel config:
> https://syzkaller.appspot.com/x/.config?id=-5034017172441945317
> compiler: gcc (GCC) 7.1.1 20170620
>
> IMPORTANT: if you fix the bug, please add the following tag to the
> commit:
> Reported-by: syzbot+7f073540b1384a614e09@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for
> details.
> If you forward the report, please keep this part and the footer.
>
>
> ============================================
> WARNING: possible recursive locking detected
> 4.16.0-rc6+ #366 Not tainted
> --------------------------------------------
> vhost-4248/4760 is trying to acquire lock:
> (&vq->mutex){+.+.}, at: [<000000003482bddc>]
> vhost_net_rx_peek_head_len drivers/vhost/net.c:633 [inline]
> (&vq->mutex){+.+.}, at: [<000000003482bddc>] handle_rx+0xeb1/0x19c0
> drivers/vhost/net.c:784
>
> but task is already holding lock:
> (&vq->mutex){+.+.}, at: [<000000004de72f44>] handle_rx+0x1f5/0x19c0
> drivers/vhost/net.c:766
>
> other info that might help us debug this:
> Possible unsafe locking scenario:
>
> CPU0
> ----
> lock(&vq->mutex);
> lock(&vq->mutex);
>
> *** DEADLOCK ***
>
> May be due to missing lock nesting notation
Yes, it's a missing of nesting notation.
Will post a patch soon.
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH net] vhost_net: add missing lock nesting notation
From: Jason Wang @ 2018-03-26 8:10 UTC (permalink / raw)
To: mst, kvm, virtualization, netdev, linux-kernel
We try to hold TX virtqueue mutex in vhost_net_rx_peek_head_len()
after RX virtqueue mutex is held in handle_rx(). This requires an
appropriate lock nesting notation to calm down deadlock detector.
Fixes: 0308813724606 ("vhost_net: basic polling support")
Reported-by: syzbot+7f073540b1384a614e09@syzkaller.appspotmail.com
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8139bc7..12bcfba 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -630,7 +630,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
if (!len && vq->busyloop_timeout) {
/* Both tx vq and rx socket were polled here */
- mutex_lock(&vq->mutex);
+ mutex_lock_nested(&vq->mutex, 1);
vhost_disable_notify(&net->dev, vq);
preempt_disable();
@@ -763,7 +763,7 @@ static void handle_rx(struct vhost_net *net)
struct iov_iter fixup;
__virtio16 num_buffers;
- mutex_lock(&vq->mutex);
+ mutex_lock_nested(&vq->mutex, 0);
sock = vq->private_data;
if (!sock)
goto out;
--
2.7.4
^ permalink raw reply related
* Re: [PATCH net] vhost_net: add missing lock nesting notation
From: Michael S. Tsirkin @ 2018-03-26 13:24 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1522051823-5166-1-git-send-email-jasowang@redhat.com>
On Mon, Mar 26, 2018 at 04:10:23PM +0800, Jason Wang wrote:
> We try to hold TX virtqueue mutex in vhost_net_rx_peek_head_len()
> after RX virtqueue mutex is held in handle_rx(). This requires an
> appropriate lock nesting notation to calm down deadlock detector.
>
> Fixes: 0308813724606 ("vhost_net: basic polling support")
> Reported-by: syzbot+7f073540b1384a614e09@syzkaller.appspotmail.com
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/vhost/net.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 8139bc7..12bcfba 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -630,7 +630,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
>
> if (!len && vq->busyloop_timeout) {
> /* Both tx vq and rx socket were polled here */
> - mutex_lock(&vq->mutex);
> + mutex_lock_nested(&vq->mutex, 1);
> vhost_disable_notify(&net->dev, vq);
>
> preempt_disable();
> @@ -763,7 +763,7 @@ static void handle_rx(struct vhost_net *net)
> struct iov_iter fixup;
> __virtio16 num_buffers;
>
> - mutex_lock(&vq->mutex);
> + mutex_lock_nested(&vq->mutex, 0);
> sock = vq->private_data;
> if (!sock)
> goto out;
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH net] vhost_net: add missing lock nesting notation
From: David Miller @ 2018-03-26 16:59 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, linux-kernel, kvm, mst
In-Reply-To: <1522051823-5166-1-git-send-email-jasowang@redhat.com>
From: Jason Wang <jasowang@redhat.com>
Date: Mon, 26 Mar 2018 16:10:23 +0800
> We try to hold TX virtqueue mutex in vhost_net_rx_peek_head_len()
> after RX virtqueue mutex is held in handle_rx(). This requires an
> appropriate lock nesting notation to calm down deadlock detector.
>
> Fixes: 0308813724606 ("vhost_net: basic polling support")
> Reported-by: syzbot+7f073540b1384a614e09@syzkaller.appspotmail.com
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Applied and queued up for -stable, thanks Jason.
^ permalink raw reply
* Re: [RFC PATCH V2 0/8] Packed ring for vhost
From: Konrad Rzeszutek Wilk @ 2018-03-26 19:01 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, virtualization, linux-kernel, kvm, mst
In-Reply-To: <1522035533-11786-1-git-send-email-jasowang@redhat.com>
On Mon, Mar 26, 2018 at 11:38:45AM +0800, Jason Wang wrote:
> Hi all:
>
> This RFC implement packed ring layout. The code were tested with pmd
> implement by Jens at
> http://dpdk.org/ml/archives/dev/2018-January/089417.html. Minor change
> was needed for pmd codes to kick virtqueue since it assumes a busy
> polling backend.
>
> Test were done between localhost and guest. Testpmd (rxonly) in guest
> reports 2.4Mpps. Testpmd (txonly) repots about 2.1Mpps.
And how does it compare to older ring layout?
>
> Notes: The event suppression /indirect descriptor support is complied
> test only because of lacked driver support.
>
> Changes from V1:
>
> - Refactor vhost used elem code to avoid open coding on used elem
> - Event suppression support (compile test only).
> - Indirect descriptor support (compile test only).
> - Zerocopy support.
> - vIOMMU support.
> - SCSI/VSOCK support (compile test only).
> - Fix several bugs
>
> For simplicity, I don't implement batching or other optimizations.
>
> Please review.
>
> Thanks
>
> Jason Wang (8):
> vhost: move get_rx_bufs to vhost.c
> vhost: hide used ring layout from device
> vhost: do not use vring_used_elem
> vhost_net: do not explicitly manipulate vhost_used_elem
> vhost: vhost_put_user() can accept metadata type
> virtio: introduce packed ring defines
> vhost: packed ring support
> vhost: event suppression for packed ring
>
> drivers/vhost/net.c | 138 ++-----
> drivers/vhost/scsi.c | 62 +--
> drivers/vhost/vhost.c | 818 ++++++++++++++++++++++++++++++++++---
> drivers/vhost/vhost.h | 46 ++-
> drivers/vhost/vsock.c | 42 +-
> include/uapi/linux/virtio_config.h | 9 +
> include/uapi/linux/virtio_ring.h | 32 ++
> 7 files changed, 921 insertions(+), 226 deletions(-)
>
> --
> 2.7.4
>
^ permalink raw reply
* Re: [PATCH v29 1/4] mm: support reporting free page blocks
From: Andrew Morton @ 2018-03-26 21:22 UTC (permalink / raw)
To: Wei Wang
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, mst, nilal,
liliang.opensource, linux-kernel, virtualization, linux-mm,
huangzhichao, pbonzini, mhocko
In-Reply-To: <1522031994-7246-2-git-send-email-wei.w.wang@intel.com>
On Mon, 26 Mar 2018 10:39:51 +0800 Wei Wang <wei.w.wang@intel.com> wrote:
> This patch adds support to walk through the free page blocks in the
> system and report them via a callback function. Some page blocks may
> leave the free list after zone->lock is released, so it is the caller's
> responsibility to either detect or prevent the use of such pages.
>
> One use example of this patch is to accelerate live migration by skipping
> the transfer of free pages reported from the guest. A popular method used
> by the hypervisor to track which part of memory is written during live
> migration is to write-protect all the guest memory. So, those pages that
> are reported as free pages but are written after the report function
> returns will be captured by the hypervisor, and they will be added to the
> next round of memory transfer.
>
> ...
>
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4912,6 +4912,102 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
> show_swap_cache_info();
> }
>
> +/*
> + * Walk through a free page list and report the found pfn range via the
> + * callback.
> + *
> + * Return 0 if it completes the reporting. Otherwise, return the non-zero
> + * value returned from the callback.
> + */
> +static int walk_free_page_list(void *opaque,
> + struct zone *zone,
> + int order,
> + enum migratetype mt,
> + int (*report_pfn_range)(void *,
> + unsigned long,
> + unsigned long))
> +{
> + struct page *page;
> + struct list_head *list;
> + unsigned long pfn, flags;
> + int ret = 0;
> +
> + spin_lock_irqsave(&zone->lock, flags);
> + list = &zone->free_area[order].free_list[mt];
> + list_for_each_entry(page, list, lru) {
> + pfn = page_to_pfn(page);
> + ret = report_pfn_range(opaque, pfn, 1 << order);
> + if (ret)
> + break;
> + }
> + spin_unlock_irqrestore(&zone->lock, flags);
> +
> + return ret;
> +}
> +
> +/**
> + * walk_free_mem_block - Walk through the free page blocks in the system
> + * @opaque: the context passed from the caller
> + * @min_order: the minimum order of free lists to check
> + * @report_pfn_range: the callback to report the pfn range of the free pages
> + *
> + * If the callback returns a non-zero value, stop iterating the list of free
> + * page blocks. Otherwise, continue to report.
> + *
> + * Please note that there are no locking guarantees for the callback and
> + * that the reported pfn range might be freed or disappear after the
> + * callback returns so the caller has to be very careful how it is used.
> + *
> + * The callback itself must not sleep or perform any operations which would
> + * require any memory allocations directly (not even GFP_NOWAIT/GFP_ATOMIC)
> + * or via any lock dependency. It is generally advisable to implement
> + * the callback as simple as possible and defer any heavy lifting to a
> + * different context.
> + *
> + * There is no guarantee that each free range will be reported only once
> + * during one walk_free_mem_block invocation.
> + *
> + * pfn_to_page on the given range is strongly discouraged and if there is
> + * an absolute need for that make sure to contact MM people to discuss
> + * potential problems.
> + *
> + * The function itself might sleep so it cannot be called from atomic
> + * contexts.
I don't see how walk_free_mem_block() can sleep.
> + * In general low orders tend to be very volatile and so it makes more
> + * sense to query larger ones first for various optimizations which like
> + * ballooning etc... This will reduce the overhead as well.
> + *
> + * Return 0 if it completes the reporting. Otherwise, return the non-zero
> + * value returned from the callback.
> + */
> +int walk_free_mem_block(void *opaque,
> + int min_order,
> + int (*report_pfn_range)(void *opaque,
> + unsigned long pfn,
> + unsigned long num))
> +{
> + struct zone *zone;
> + int order;
> + enum migratetype mt;
> + int ret;
> +
> + for_each_populated_zone(zone) {
> + for (order = MAX_ORDER - 1; order >= min_order; order--) {
> + for (mt = 0; mt < MIGRATE_TYPES; mt++) {
> + ret = walk_free_page_list(opaque, zone,
> + order, mt,
> + report_pfn_range);
> + if (ret)
> + return ret;
> + }
> + }
> + }
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(walk_free_mem_block);
This looks like it could take a long time. Will we end up needing to
add cond_resched() in there somewhere?
> static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
> {
> zoneref->zone = zone;
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH v29 3/4] mm/page_poison: expose page_poisoning_enabled to kernel modules
From: Andrew Morton @ 2018-03-26 21:24 UTC (permalink / raw)
To: Wei Wang
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, mst, nilal,
liliang.opensource, linux-kernel, virtualization, linux-mm,
huangzhichao, pbonzini, mhocko
In-Reply-To: <1522031994-7246-4-git-send-email-wei.w.wang@intel.com>
On Mon, 26 Mar 2018 10:39:53 +0800 Wei Wang <wei.w.wang@intel.com> wrote:
> In some usages, e.g. virtio-balloon, a kernel module needs to know if
> page poisoning is in use. This patch exposes the page_poisoning_enabled
> function to kernel modules.
Acked-by: Andrew Morton <akpm@linux-foundation.org>
^ permalink raw reply
* Re: [RFC PATCH V2 0/8] Packed ring for vhost
From: Jason Wang @ 2018-03-27 3:33 UTC (permalink / raw)
To: Konrad Rzeszutek Wilk; +Cc: netdev, virtualization, linux-kernel, kvm, mst
In-Reply-To: <20180326190130.GE6928@char.us.oracle.com>
On 2018年03月27日 03:01, Konrad Rzeszutek Wilk wrote:
> On Mon, Mar 26, 2018 at 11:38:45AM +0800, Jason Wang wrote:
>> Hi all:
>>
>> This RFC implement packed ring layout. The code were tested with pmd
>> implement by Jens at
>> http://dpdk.org/ml/archives/dev/2018-January/089417.html. Minor change
>> was needed for pmd codes to kick virtqueue since it assumes a busy
>> polling backend.
>>
>> Test were done between localhost and guest. Testpmd (rxonly) in guest
>> reports 2.4Mpps. Testpmd (txonly) repots about 2.1Mpps.
> And how does it compare to older ring layout?
>
No obvious difference.
Vhost itself becomes a bottleneck. I plan to do more aggressive batching
like dpdk on top.
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: BUG: corrupted list in remove_wait_queue
From: Jason Wang @ 2018-03-27 3:36 UTC (permalink / raw)
To: syzbot, kvm, linux-kernel, mst, netdev, syzkaller-bugs,
virtualization
In-Reply-To: <0000000000004822e3056827baf4@google.com>
On 2018年03月24日 20:32, syzbot wrote:
> syzbot has found reproducer for the following crash on upstream commit
> 99fec39e7725d091c94d1bb0242e40c8092994f6 (Fri Mar 23 22:34:18 2018 +0000)
> Merge tag 'trace-v4.16-rc4' of
> git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace
> syzbot dashboard link:
> https://syzkaller.appspot.com/bug?extid=c0272972b01b872e604a
>
> So far this crash happened 4 times on upstream.
> C reproducer is attached.
> syzkaller reproducer is attached.
> Raw console output is attached.
> .config is attached.
> compiler: gcc (GCC) 7.1.1 20170620
>
> IMPORTANT: if you fix the bug, please add the following tag to the
> commit:
> Reported-by: syzbot+c0272972b01b872e604a@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed.
>
> list_del corruption, 0000000054a89bb5->next is LIST_POISON1
> (00000000a63e4a19)
> ------------[ cut here ]------------
> kernel BUG at lib/list_debug.c:47!
> invalid opcode: 0000 [#1] SMP KASAN
> Dumping ftrace buffer:
> (ftrace buffer empty)
> Modules linked in:
> CPU: 0 PID: 4851 Comm: syzkaller762396 Not tainted 4.16.0-rc6+ #364
> Hardware name: Google Google Compute Engine/Google Compute Engine,
> BIOS Google 01/01/2011
> RIP: 0010:__list_del_entry_valid+0xd3/0x150 lib/list_debug.c:45
> RSP: 0018:ffff8801d3ff71b8 EFLAGS: 00010086
> RAX: 000000000000004e RBX: dead000000000200 RCX: 0000000000000000
> RDX: 000000000000004e RSI: 1ffff1003a7fedec RDI: ffffed003a7fee2b
> RBP: ffff8801d3ff71d0 R08: ffff8801db227fc0 R09: 1ffff1003a7fed93
> R10: ffff8801d3ff7090 R11: 0000000000000002 R12: dead000000000100
> R13: ffff8801b6a2d458 R14: ffff8801b6a2d460 R15: ffff8801d27d1780
> FS: 0000000000000000(0000) GS:ffff8801db200000(0000)
> knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 000000002001d000 CR3: 0000000006e22002 CR4: 00000000001606f0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
> __list_del_entry include/linux/list.h:117 [inline]
> list_del include/linux/list.h:125 [inline]
> __remove_wait_queue include/linux/wait.h:184 [inline]
> remove_wait_queue+0x90/0x350 kernel/sched/wait.c:51
> vhost_poll_stop+0x46/0x90 drivers/vhost/vhost.c:229
> vhost_net_disable_vq drivers/vhost/net.c:405 [inline]
> vhost_net_stop_vq+0x90/0x120 drivers/vhost/net.c:973
> vhost_net_stop drivers/vhost/net.c:984 [inline]
> vhost_net_release+0x49/0x190 drivers/vhost/net.c:1017
> __fput+0x327/0x7e0 fs/file_table.c:209
> ____fput+0x15/0x20 fs/file_table.c:243
> task_work_run+0x199/0x270 kernel/task_work.c:113
> exit_task_work include/linux/task_work.h:22 [inline]
> do_exit+0x9bb/0x1ad0 kernel/exit.c:865
> do_group_exit+0x149/0x400 kernel/exit.c:968
> get_signal+0x73a/0x16d0 kernel/signal.c:2469
> do_signal+0x90/0x1e90 arch/x86/kernel/signal.c:809
> exit_to_usermode_loop+0x258/0x2f0 arch/x86/entry/common.c:162
> prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
> syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
> do_syscall_64+0x6ec/0x940 arch/x86/entry/common.c:292
> entry_SYSCALL_64_after_hwframe+0x42/0xb7
> RIP: 0033:0x44a8e9
> RSP: 002b:00007f7ec8480da8 EFLAGS: 00000293 ORIG_RAX: 0000000000000010
> RAX: 0000000000000000 RBX: 00000000006e29e4 RCX: 000000000044a8e9
> RDX: 0000000020000340 RSI: 00000000400454ca RDI: 0000000000000005
> RBP: 00000000006e29e0 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000293 R12: 6f68762f7665642f
> R13: 6475612f7665642f R14: 74656e2f7665642f R15: 0000000000000001
> Code: 8f 00 00 00 49 8b 54 24 08 48 39 f2 75 3b 48 83 c4 08 b8 01 00
> 00 00 5b 41 5c 5d c3 4c 89 e2 48 c7 c7 00 80 40 86 e8 85 df fb fe <0f>
> 0b 48 c7 c7 60 80 40 86 e8 77 df fb fe 0f 0b 48 c7 c7 c0 80
> RIP: __list_del_entry_valid+0xd3/0x150 lib/list_debug.c:45 RSP:
> ffff8801d3ff71b8
> ---[ end trace bdcbea47fcda73ff ]---
>
This is because we do not clear poll->wqh when poll fails, then a double
free may be triggered. Will post a patch. And I suspect we need hold vq
mutex in vhost_dev_stop().
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH net] vhost: correctly remove wait queue during poll failure
From: Jason Wang @ 2018-03-27 3:47 UTC (permalink / raw)
To: mst, jasowang; +Cc: netdev, linux-kernel, kvm, virtualization
We tried to remove vq poll from wait queue, but do not check whether
or not it was in a list before. This will lead double free. Fixing
this by checking poll->wqh to make sure it was in a list.
Reported-by: syzbot+c0272972b01b872e604a@syzkaller.appspotmail.com
Fixes: 2b8b328b61c79 ("vhost_net: handle polling errors when setting backend")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/vhost.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1b3e8d2d..5d5a9d9 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -212,8 +212,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file)
if (mask)
vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
if (mask & EPOLLERR) {
- if (poll->wqh)
- remove_wait_queue(poll->wqh, &poll->wait);
+ vhost_poll_stop(poll);
ret = -EINVAL;
}
--
2.7.4
^ permalink raw reply related
* Re: [PATCH v29 1/4] mm: support reporting free page blocks
From: Wei Wang @ 2018-03-27 6:23 UTC (permalink / raw)
To: Andrew Morton
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, mst, nilal,
liliang.opensource, linux-kernel, virtualization, linux-mm,
huangzhichao, pbonzini, mhocko
In-Reply-To: <20180326142254.c4129c3a54ade686ee2a5e21@linux-foundation.org>
On 03/27/2018 05:22 AM, Andrew Morton wrote:
> On Mon, 26 Mar 2018 10:39:51 +0800 Wei Wang <wei.w.wang@intel.com> wrote:
>
>> This patch adds support to walk through the free page blocks in the
>> system and report them via a callback function. Some page blocks may
>> leave the free list after zone->lock is released, so it is the caller's
>> responsibility to either detect or prevent the use of such pages.
>>
>> One use example of this patch is to accelerate live migration by skipping
>> the transfer of free pages reported from the guest. A popular method used
>> by the hypervisor to track which part of memory is written during live
>> migration is to write-protect all the guest memory. So, those pages that
>> are reported as free pages but are written after the report function
>> returns will be captured by the hypervisor, and they will be added to the
>> next round of memory transfer.
>>
>> ...
>>
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -4912,6 +4912,102 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
>> show_swap_cache_info();
>> }
>>
>> +/*
>> + * Walk through a free page list and report the found pfn range via the
>> + * callback.
>> + *
>> + * Return 0 if it completes the reporting. Otherwise, return the non-zero
>> + * value returned from the callback.
>> + */
>> +static int walk_free_page_list(void *opaque,
>> + struct zone *zone,
>> + int order,
>> + enum migratetype mt,
>> + int (*report_pfn_range)(void *,
>> + unsigned long,
>> + unsigned long))
>> +{
>> + struct page *page;
>> + struct list_head *list;
>> + unsigned long pfn, flags;
>> + int ret = 0;
>> +
>> + spin_lock_irqsave(&zone->lock, flags);
>> + list = &zone->free_area[order].free_list[mt];
>> + list_for_each_entry(page, list, lru) {
>> + pfn = page_to_pfn(page);
>> + ret = report_pfn_range(opaque, pfn, 1 << order);
>> + if (ret)
>> + break;
>> + }
>> + spin_unlock_irqrestore(&zone->lock, flags);
>> +
>> + return ret;
>> +}
>> +
>> +/**
>> + * walk_free_mem_block - Walk through the free page blocks in the system
>> + * @opaque: the context passed from the caller
>> + * @min_order: the minimum order of free lists to check
>> + * @report_pfn_range: the callback to report the pfn range of the free pages
>> + *
>> + * If the callback returns a non-zero value, stop iterating the list of free
>> + * page blocks. Otherwise, continue to report.
>> + *
>> + * Please note that there are no locking guarantees for the callback and
>> + * that the reported pfn range might be freed or disappear after the
>> + * callback returns so the caller has to be very careful how it is used.
>> + *
>> + * The callback itself must not sleep or perform any operations which would
>> + * require any memory allocations directly (not even GFP_NOWAIT/GFP_ATOMIC)
>> + * or via any lock dependency. It is generally advisable to implement
>> + * the callback as simple as possible and defer any heavy lifting to a
>> + * different context.
>> + *
>> + * There is no guarantee that each free range will be reported only once
>> + * during one walk_free_mem_block invocation.
>> + *
>> + * pfn_to_page on the given range is strongly discouraged and if there is
>> + * an absolute need for that make sure to contact MM people to discuss
>> + * potential problems.
>> + *
>> + * The function itself might sleep so it cannot be called from atomic
>> + * contexts.
> I don't see how walk_free_mem_block() can sleep.
OK, it would be better to remove this sentence for the current version.
But I think we could probably keep it if we decide to add cond_resched()
below.
>
>> + * In general low orders tend to be very volatile and so it makes more
>> + * sense to query larger ones first for various optimizations which like
>> + * ballooning etc... This will reduce the overhead as well.
>> + *
>> + * Return 0 if it completes the reporting. Otherwise, return the non-zero
>> + * value returned from the callback.
>> + */
>> +int walk_free_mem_block(void *opaque,
>> + int min_order,
>> + int (*report_pfn_range)(void *opaque,
>> + unsigned long pfn,
>> + unsigned long num))
>> +{
>> + struct zone *zone;
>> + int order;
>> + enum migratetype mt;
>> + int ret;
>> +
>> + for_each_populated_zone(zone) {
>> + for (order = MAX_ORDER - 1; order >= min_order; order--) {
>> + for (mt = 0; mt < MIGRATE_TYPES; mt++) {
>> + ret = walk_free_page_list(opaque, zone,
>> + order, mt,
>> + report_pfn_range);
>> + if (ret)
>> + return ret;
>> + }
==>
>> + }
>> + }
>> +
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(walk_free_mem_block);
> This looks like it could take a long time. Will we end up needing to
> add cond_resched() in there somewhere?
OK. How about adding cond_resched at the above place "==>" (i.e. every
order)?
Best,
Wei
^ permalink raw reply
* Re: [PATCH v29 1/4] mm: support reporting free page blocks
From: Michal Hocko @ 2018-03-27 6:33 UTC (permalink / raw)
To: Wei Wang
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, mst,
liliang.opensource, linux-kernel, virtualization, linux-mm,
huangzhichao, pbonzini, Andrew Morton, nilal
In-Reply-To: <5AB9E377.30900@intel.com>
On Tue 27-03-18 14:23:51, Wei Wang wrote:
> On 03/27/2018 05:22 AM, Andrew Morton wrote:
> > On Mon, 26 Mar 2018 10:39:51 +0800 Wei Wang <wei.w.wang@intel.com> wrote:
> >
> > > This patch adds support to walk through the free page blocks in the
> > > system and report them via a callback function. Some page blocks may
> > > leave the free list after zone->lock is released, so it is the caller's
> > > responsibility to either detect or prevent the use of such pages.
> > >
> > > One use example of this patch is to accelerate live migration by skipping
> > > the transfer of free pages reported from the guest. A popular method used
> > > by the hypervisor to track which part of memory is written during live
> > > migration is to write-protect all the guest memory. So, those pages that
> > > are reported as free pages but are written after the report function
> > > returns will be captured by the hypervisor, and they will be added to the
> > > next round of memory transfer.
> > >
> > > ...
> > >
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -4912,6 +4912,102 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
> > > show_swap_cache_info();
> > > }
> > > +/*
> > > + * Walk through a free page list and report the found pfn range via the
> > > + * callback.
> > > + *
> > > + * Return 0 if it completes the reporting. Otherwise, return the non-zero
> > > + * value returned from the callback.
> > > + */
> > > +static int walk_free_page_list(void *opaque,
> > > + struct zone *zone,
> > > + int order,
> > > + enum migratetype mt,
> > > + int (*report_pfn_range)(void *,
> > > + unsigned long,
> > > + unsigned long))
> > > +{
> > > + struct page *page;
> > > + struct list_head *list;
> > > + unsigned long pfn, flags;
> > > + int ret = 0;
> > > +
> > > + spin_lock_irqsave(&zone->lock, flags);
> > > + list = &zone->free_area[order].free_list[mt];
> > > + list_for_each_entry(page, list, lru) {
> > > + pfn = page_to_pfn(page);
> > > + ret = report_pfn_range(opaque, pfn, 1 << order);
> > > + if (ret)
> > > + break;
> > > + }
> > > + spin_unlock_irqrestore(&zone->lock, flags);
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +/**
> > > + * walk_free_mem_block - Walk through the free page blocks in the system
> > > + * @opaque: the context passed from the caller
> > > + * @min_order: the minimum order of free lists to check
> > > + * @report_pfn_range: the callback to report the pfn range of the free pages
> > > + *
> > > + * If the callback returns a non-zero value, stop iterating the list of free
> > > + * page blocks. Otherwise, continue to report.
> > > + *
> > > + * Please note that there are no locking guarantees for the callback and
> > > + * that the reported pfn range might be freed or disappear after the
> > > + * callback returns so the caller has to be very careful how it is used.
> > > + *
> > > + * The callback itself must not sleep or perform any operations which would
> > > + * require any memory allocations directly (not even GFP_NOWAIT/GFP_ATOMIC)
> > > + * or via any lock dependency. It is generally advisable to implement
> > > + * the callback as simple as possible and defer any heavy lifting to a
> > > + * different context.
> > > + *
> > > + * There is no guarantee that each free range will be reported only once
> > > + * during one walk_free_mem_block invocation.
> > > + *
> > > + * pfn_to_page on the given range is strongly discouraged and if there is
> > > + * an absolute need for that make sure to contact MM people to discuss
> > > + * potential problems.
> > > + *
> > > + * The function itself might sleep so it cannot be called from atomic
> > > + * contexts.
> > I don't see how walk_free_mem_block() can sleep.
>
> OK, it would be better to remove this sentence for the current version. But
> I think we could probably keep it if we decide to add cond_resched() below.
The point of this sentence was to make any user aware that the function
might sleep from the very begining rather than chase existing callers
when we need to add cond_resched or sleep for any other reason. So I
would rather keep it.
--
Michal Hocko
SUSE Labs
^ permalink raw reply
* Re: [PATCH 00/23] drm: Eliminate plane->fb/crtc usage for atomic drivers
From: Daniel Vetter @ 2018-03-27 8:21 UTC (permalink / raw)
To: Ville Syrjala
Cc: David Airlie, Daniel Vetter, dri-devel, chris, Eric Anholt,
Benjamin Gaignard, Dave Airlie, Boris Brezillon, Thomas Hellstrom,
Joonyoung Shim, Sinclair Yeh, Rob Clark, amd-gfx, martin.peres,
VMware Graphics, Harry Wentland, David (ChunMing) Zhou,
linux-arm-msm, intel-gfx, Maarten Lankhorst, Inki Dae,
virtualization, Vincent Abriou
In-Reply-To: <20180322152313.6561-1-ville.syrjala@linux.intel.com>
On Thu, Mar 22, 2018 at 05:22:50PM +0200, Ville Syrjala wrote:
> From: Ville Syrjälä <ville.syrjala@linux.intel.com>
>
> I really just wanted to fix i915 to re-enable its planes afer load
> detection (a two line patch). This is what I actually ended up with
> after I ran into a framebuffer refcount leak with said two line patch.
>
> I've tested this on a few i915 boxes and so far it's looking
> good. Everything else is just compile tested.
>
> Entire series available here:
> git://github.com/vsyrjala/linux.git plane_fb_crtc_nuke
>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
> Cc: Benjamin Gaignard <benjamin.gaignard@linaro.org>
> Cc: Boris Brezillon <boris.brezillon@free-electrons.com>
> Cc: chris@chris-wilson.co.uk
> Cc: "Christian König" <christian.koenig@amd.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Dave Airlie <airlied@gmail.com>
> Cc: David Airlie <airlied@linux.ie>
> Cc: "David (ChunMing) Zhou" <David1.Zhou@amd.com>
> Cc: Eric Anholt <eric@anholt.net>
> Cc: freedreno@lists.freedesktop.org
> Cc: Gerd Hoffmann <kraxel@redhat.com>
> Cc: Harry Wentland <harry.wentland@amd.com>
> Cc: Inki Dae <inki.dae@samsung.com>
> Cc: Joonyoung Shim <jy0922.shim@samsung.com>
> Cc: Kyungmin Park <kyungmin.park@samsung.com>
> Cc: linux-arm-msm@vger.kernel.org
> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Cc: martin.peres@free.fr
> Cc: Rob Clark <robdclark@gmail.com>
> Cc: Seung-Woo Kim <sw0312.kim@samsung.com>
> Cc: Shawn Guo <shawnguo@kernel.org>
> Cc: Sinclair Yeh <syeh@vmware.com>
> Cc: Thomas Hellstrom <thellstrom@vmware.com>
> Cc: Vincent Abriou <vincent.abriou@st.com>
> Cc: virtualization@lists.linux-foundation.org
> Cc: VMware Graphics <linux-graphics-maintainer@vmware.com>
>
> Ville Syrjälä (23):
> Revert "drm/atomic-helper: Fix leak in disable_all"
> drm/atomic-helper: Make drm_atomic_helper_disable_all() update the
> plane->fb pointers
> drm: Clear crtc->primary->crtc when disabling the crtc via setcrtc()
> drm/atomic-helper: WARN if legacy plane fb pointers are bogus when
> committing duplicated state
> drm: Add local 'plane' variable for primary/cursor planes
> drm: Adjust whitespace for legibility
> drm: Make the fb refcount handover less magic
> drm: Use plane->state->fb over plane->fb
> drm/i915: Stop consulting plane->fb
> drm/msm: Stop consulting plane->fb
> drm/sti: Stop consulting plane->fb
> drm/vmwgfx: Stop consulting plane->fb
> drm/zte: Stop consulting plane->fb
> drm/atmel-hlcdc: Stop using plane->fb
> drm: Stop updating plane->crtc/fb/old_fb on atomic drivers
> drm/amdgpu/dc: Stop updating plane->fb
> drm/i915: Stop updating plane->fb/crtc
> drm/exynos: Stop updating plane->crtc
> drm/msm: Stop updating plane->fb/crtc
> drm/virtio: Stop updating plane->fb
> drm/vc4: Stop updating plane->fb/crtc
> drm/i915: Restore planes after load detection
> drm/i915: Make force_load_detect effective even w/ DMI quirks/hotplug
Ok, I reviewed the core patches, looks all good.
Also starting auditing all the drivers. I think there's some small
oversights in there, and I need to update my grep foo a bit, but by and
large looks all reasonable.
Imo we should start merging, that will also make the auditing easier.
-Daniel
>
> drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 -
> drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c | 12 +---
> drivers/gpu/drm/drm_atomic.c | 55 ++--------------
> drivers/gpu/drm/drm_atomic_helper.c | 79 ++++++++++-------------
> drivers/gpu/drm/drm_crtc.c | 51 ++++++++++-----
> drivers/gpu/drm/drm_fb_helper.c | 7 --
> drivers/gpu/drm/drm_framebuffer.c | 5 --
> drivers/gpu/drm/drm_plane.c | 64 +++++++++++-------
> drivers/gpu/drm/drm_plane_helper.c | 4 +-
> drivers/gpu/drm/exynos/exynos_drm_plane.c | 2 -
> drivers/gpu/drm/i915/intel_crt.c | 6 ++
> drivers/gpu/drm/i915/intel_display.c | 9 +--
> drivers/gpu/drm/i915/intel_fbdev.c | 2 +-
> drivers/gpu/drm/msm/disp/mdp4/mdp4_crtc.c | 3 +-
> drivers/gpu/drm/msm/disp/mdp4/mdp4_plane.c | 2 -
> drivers/gpu/drm/msm/disp/mdp5/mdp5_plane.c | 2 -
> drivers/gpu/drm/sti/sti_plane.c | 9 +--
> drivers/gpu/drm/vc4/vc4_crtc.c | 3 -
> drivers/gpu/drm/virtio/virtgpu_display.c | 2 -
> drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 6 +-
> drivers/gpu/drm/zte/zx_vou.c | 2 +-
> include/drm/drm_atomic.h | 3 -
> 22 files changed, 143 insertions(+), 187 deletions(-)
>
> --
> 2.16.1
>
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
^ permalink raw reply
* Re: [PATCH] vhost-net: add time limitation for tx polling
From: Jason Wang @ 2018-03-27 9:40 UTC (permalink / raw)
To: haibinzhang(张海斌), mst@redhat.com,
kvm@vger.kernel.org, virtualization@lists.linux-foundation.org,
netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <88D661ADF6AFBF42B2AB88D8E7682B0901FC4059@EXMBX-SZMAIL011.tencent.com>
On 2018年03月27日 17:12, haibinzhang(张海斌) wrote:
> handle_tx() will delay rx for a long time when busy tx polling udp packets
> with short length(ie: 1byte udp payload), because setting VHOST_NET_WEIGHT
> takes into account only sent-bytes but no time.
Interesting.
Looking at vhost_can_busy_poll() it tries to poke pending vhost work and
exit the busy loop if it found one. So I believe something block the
work queuing. E.g did reverting 8241a1e466cd56e6c10472cac9c1ad4e54bc65db
fix the issue?
> It's not fair for handle_rx(),
> so needs to limit max time of tx polling.
>
> Signed-off-by: Haibin Zhang <haibinzhang@tencent.com>
> ---
> drivers/vhost/net.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 8139bc70ad7d..dc9218a3a75b 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -473,6 +473,7 @@ static void handle_tx(struct vhost_net *net)
> struct socket *sock;
> struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
> bool zcopy, zcopy_used;
> + unsigned long start = jiffies;
Checking jiffies is tricky, need to convert it to ms or whatever others.
>
> mutex_lock(&vq->mutex);
> sock = vq->private_data;
> @@ -580,7 +581,7 @@ static void handle_tx(struct vhost_net *net)
> else
> vhost_zerocopy_signal_used(net, vq);
> vhost_net_tx_packet(net);
> - if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> + if (unlikely(total_len >= VHOST_NET_WEIGHT) || unlikely(jiffies - start >= 1)) {
How value 1 is determined here? And we need a complete test to make sure
this won't affect other use cases.
Another thought is introduce another limit of #packets, but this need
benchmark too.
Thanks
> vhost_poll_queue(&vq->poll);
> break;
> }
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net] vhost: correctly remove wait queue during poll failure
From: Jason Wang @ 2018-03-27 9:43 UTC (permalink / raw)
To: mst, kvm, virtualization, netdev, linux-kernel
In-Reply-To: <20180327092811.ikvssyxgb74nucb4@dhcp-10-175-199-170.vpn.oracle.com>
On 2018年03月27日 17:28, Darren Kenny wrote:
> Hi Jason,
>
> On Tue, Mar 27, 2018 at 11:47:22AM +0800, Jason Wang wrote:
>> We tried to remove vq poll from wait queue, but do not check whether
>> or not it was in a list before. This will lead double free. Fixing
>> this by checking poll->wqh to make sure it was in a list.
>
> This text seems at odds with the code below, instead of checking
> poll-whq, you are removing that check...
>
> Maybe the text needs rewording?
Yes, I admit it's bad, thanks for pointing out.
How about:
"Fixing this by switching to use vhost_poll_stop() which zeros poll->wqh
after removing poll from waitqueue to make sure it won't be freed twice."
Thanks
>
> Thanks,
>
> Darren.
>
>>
>> Reported-by: syzbot+c0272972b01b872e604a@syzkaller.appspotmail.com
>> Fixes: 2b8b328b61c79 ("vhost_net: handle polling errors when setting
>> backend")
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>> drivers/vhost/vhost.c | 3 +--
>> 1 file changed, 1 insertion(+), 2 deletions(-)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index 1b3e8d2d..5d5a9d9 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -212,8 +212,7 @@ int vhost_poll_start(struct vhost_poll *poll,
>> struct file *file)
>> if (mask)
>> vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
>> if (mask & EPOLLERR) {
>> - if (poll->wqh)
>> - remove_wait_queue(poll->wqh, &poll->wait);
>> + vhost_poll_stop(poll);
>> ret = -EINVAL;
>> }
>>
>> --
>> 2.7.4
>>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH net V2] vhost: correctly remove wait queue during poll failure
From: Jason Wang @ 2018-03-27 12:50 UTC (permalink / raw)
To: mst, jasowang; +Cc: netdev, Darren Kenny, linux-kernel, kvm, virtualization
We tried to remove vq poll from wait queue, but do not check whether
or not it was in a list before. This will lead double free. Fixing
this by switching to use vhost_poll_stop() which zeros poll->wqh after
removing poll from waitqueue to make sure it won't be freed twice.
Cc: Darren Kenny <darren.kenny@oracle.com>
Reported-by: syzbot+c0272972b01b872e604a@syzkaller.appspotmail.com
Fixes: 2b8b328b61c79 ("vhost_net: handle polling errors when setting backend")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
Changes from V1:
- tweak the commit log for to match the code
---
drivers/vhost/vhost.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1b3e8d2d..5d5a9d9 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -212,8 +212,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file)
if (mask)
vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
if (mask & EPOLLERR) {
- if (poll->wqh)
- remove_wait_queue(poll->wqh, &poll->wait);
+ vhost_poll_stop(poll);
ret = -EINVAL;
}
--
2.7.4
^ permalink raw reply related
* Re: [PATCH net] vhost: correctly remove wait queue during poll failure
From: Michael S. Tsirkin @ 2018-03-27 13:58 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <b1576bcf-652a-7c1a-9d77-db54931ab979@redhat.com>
On Tue, Mar 27, 2018 at 05:43:14PM +0800, Jason Wang wrote:
>
>
> On 2018年03月27日 17:28, Darren Kenny wrote:
> > Hi Jason,
> >
> > On Tue, Mar 27, 2018 at 11:47:22AM +0800, Jason Wang wrote:
> > > We tried to remove vq poll from wait queue, but do not check whether
> > > or not it was in a list before. This will lead double free. Fixing
> > > this by checking poll->wqh to make sure it was in a list.
> >
> > This text seems at odds with the code below, instead of checking
> > poll-whq, you are removing that check...
> >
> > Maybe the text needs rewording?
>
> Yes, I admit it's bad, thanks for pointing out.
>
> How about:
>
> "Fixing this by switching to use vhost_poll_stop() which zeros poll->wqh
> after removing poll from waitqueue to make sure it won't be freed twice."
>
> Thanks
Let's be a bit more specific about the problem maybe?
when vhost's attempt to start polling a descriptor fails, we remove the
poll->wqh entry from wait queue but do not clear it, so the following
cleanup (e.g. on release) will attempt to remove it again.
To fix, switch to vhost_poll_stop() which zeros poll->wqh
after removing poll from waitqueue to make sure it won't be freed twice."
the patch itself is fine though:
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> >
> > Thanks,
> >
> > Darren.
> >
> > >
> > > Reported-by: syzbot+c0272972b01b872e604a@syzkaller.appspotmail.com
> > > Fixes: 2b8b328b61c79 ("vhost_net: handle polling errors when setting
> > > backend")
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > ---
> > > drivers/vhost/vhost.c | 3 +--
> > > 1 file changed, 1 insertion(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index 1b3e8d2d..5d5a9d9 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -212,8 +212,7 @@ int vhost_poll_start(struct vhost_poll *poll,
> > > struct file *file)
> > > if (mask)
> > > vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
> > > if (mask & EPOLLERR) {
> > > - if (poll->wqh)
> > > - remove_wait_queue(poll->wqh, &poll->wait);
> > > + vhost_poll_stop(poll);
> > > ret = -EINVAL;
> > > }
> > >
> > > --
> > > 2.7.4
> > >
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net V2] vhost: correctly remove wait queue during poll failure
From: Michael S. Tsirkin @ 2018-03-27 13:59 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, Darren Kenny, linux-kernel, kvm, virtualization
In-Reply-To: <1522155052-13347-1-git-send-email-jasowang@redhat.com>
On Tue, Mar 27, 2018 at 08:50:52PM +0800, Jason Wang wrote:
> We tried to remove vq poll from wait queue, but do not check whether
> or not it was in a list before. This will lead double free. Fixing
> this by switching to use vhost_poll_stop() which zeros poll->wqh after
> removing poll from waitqueue to make sure it won't be freed twice.
>
> Cc: Darren Kenny <darren.kenny@oracle.com>
> Reported-by: syzbot+c0272972b01b872e604a@syzkaller.appspotmail.com
> Fixes: 2b8b328b61c79 ("vhost_net: handle polling errors when setting backend")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> Changes from V1:
> - tweak the commit log for to match the code
> ---
> drivers/vhost/vhost.c | 3 +--
> 1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 1b3e8d2d..5d5a9d9 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -212,8 +212,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file)
> if (mask)
> vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
> if (mask & EPOLLERR) {
> - if (poll->wqh)
> - remove_wait_queue(poll->wqh, &poll->wait);
> + vhost_poll_stop(poll);
> ret = -EINVAL;
> }
>
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH v29 1/4] mm: support reporting free page blocks
From: Michael S. Tsirkin @ 2018-03-27 16:07 UTC (permalink / raw)
To: Michal Hocko
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm,
liliang.opensource, linux-kernel, virtualization, linux-mm,
huangzhichao, pbonzini, Andrew Morton, nilal
In-Reply-To: <20180327063322.GW5652@dhcp22.suse.cz>
On Tue, Mar 27, 2018 at 08:33:22AM +0200, Michal Hocko wrote:
> > > > + * The function itself might sleep so it cannot be called from atomic
> > > > + * contexts.
> > > I don't see how walk_free_mem_block() can sleep.
> >
> > OK, it would be better to remove this sentence for the current version. But
> > I think we could probably keep it if we decide to add cond_resched() below.
>
> The point of this sentence was to make any user aware that the function
> might sleep from the very begining rather than chase existing callers
> when we need to add cond_resched or sleep for any other reason. So I
> would rather keep it.
Let's say what it is then - "will be changed to sleep in the future".
> --
> Michal Hocko
> SUSE Labs
^ permalink raw reply
* Re: [PATCH net V2] vhost: correctly remove wait queue during poll failure
From: David Miller @ 2018-03-27 17:04 UTC (permalink / raw)
To: jasowang; +Cc: kvm, mst, netdev, linux-kernel, virtualization, darren.kenny
In-Reply-To: <1522155052-13347-1-git-send-email-jasowang@redhat.com>
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 27 Mar 2018 20:50:52 +0800
> We tried to remove vq poll from wait queue, but do not check whether
> or not it was in a list before. This will lead double free. Fixing
> this by switching to use vhost_poll_stop() which zeros poll->wqh after
> removing poll from waitqueue to make sure it won't be freed twice.
>
> Cc: Darren Kenny <darren.kenny@oracle.com>
> Reported-by: syzbot+c0272972b01b872e604a@syzkaller.appspotmail.com
> Fixes: 2b8b328b61c79 ("vhost_net: handle polling errors when setting backend")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> Changes from V1:
> - tweak the commit log for to match the code
Applied and queued up for -stable, thank you.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox