Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [RFC v4 3/5] virtio_ring: add packed ring support
From: Jason Wang @ 2018-05-16 11:50 UTC (permalink / raw)
  To: Tiwei Bie, mst, virtualization, linux-kernel, netdev; +Cc: wexu, jfreimann
In-Reply-To: <20180516083737.26504-4-tiwei.bie@intel.com>



On 2018年05月16日 16:37, Tiwei Bie wrote:
> This commit introduces the basic support (without EVENT_IDX)
> for packed ring.
>
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>   drivers/virtio/virtio_ring.c | 491 ++++++++++++++++++++++++++++++++++-
>   1 file changed, 481 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 62d7c407841a..c6c5deb0e3ae 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -58,7 +58,8 @@
>   
>   struct vring_desc_state {
>   	void *data;			/* Data for callback. */
> -	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
> +	void *indir_desc;		/* Indirect descriptor, if any. */
> +	int num;			/* Descriptor list length. */
>   };
>   
>   struct vring_virtqueue {
> @@ -116,6 +117,9 @@ struct vring_virtqueue {
>   			/* Last written value to driver->flags in
>   			 * guest byte order. */
>   			u16 event_flags_shadow;
> +
> +			/* ID allocation. */
> +			struct idr buffer_id;

I'm not sure idr is fit for the performance critical case here. Need to 
measure its performance impact, especially if we have few unused slots.

>   		};
>   	};
>   
> @@ -142,6 +146,16 @@ struct vring_virtqueue {
>   
>   #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
>   
> +static inline bool virtqueue_use_indirect(struct virtqueue *_vq,
> +					  unsigned int total_sg)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	/* If the host supports indirect descriptor tables, and we have multiple
> +	 * buffers, then go indirect. FIXME: tune this threshold */
> +	return (vq->indirect && total_sg > 1 && vq->vq.num_free);
> +}
> +
>   /*
>    * Modern virtio devices have feature bits to specify whether they need a
>    * quirk and bypass the IOMMU. If not there, just use the DMA API.
> @@ -327,9 +341,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
>   
>   	head = vq->free_head;
>   
> -	/* If the host supports indirect descriptor tables, and we have multiple
> -	 * buffers, then go indirect. FIXME: tune this threshold */
> -	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> +	if (virtqueue_use_indirect(_vq, total_sg))
>   		desc = alloc_indirect_split(_vq, total_sg, gfp);
>   	else {
>   		desc = NULL;
> @@ -741,6 +753,63 @@ static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
>   		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
>   }
>   
> +static void vring_unmap_one_packed(const struct vring_virtqueue *vq,
> +				   struct vring_packed_desc *desc)
> +{
> +	u16 flags;
> +
> +	if (!vring_use_dma_api(vq->vq.vdev))
> +		return;
> +
> +	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> +
> +	if (flags & VRING_DESC_F_INDIRECT) {
> +		dma_unmap_single(vring_dma_dev(vq),
> +				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
> +				 virtio32_to_cpu(vq->vq.vdev, desc->len),
> +				 (flags & VRING_DESC_F_WRITE) ?
> +				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	} else {
> +		dma_unmap_page(vring_dma_dev(vq),
> +			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
> +			       virtio32_to_cpu(vq->vq.vdev, desc->len),
> +			       (flags & VRING_DESC_F_WRITE) ?
> +			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	}
> +}
> +
> +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> +						       unsigned int total_sg,
> +						       gfp_t gfp)
> +{
> +	struct vring_packed_desc *desc;
> +
> +	/*
> +	 * We require lowmem mappings for the descriptors because
> +	 * otherwise virt_to_phys will give us bogus addresses in the
> +	 * virtqueue.
> +	 */
> +	gfp &= ~__GFP_HIGHMEM;
> +
> +	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
> +
> +	return desc;
> +}
> +
> +static u16 alloc_id_packed(struct vring_virtqueue *vq)
> +{
> +	u16 id;
> +
> +	id = idr_alloc(&vq->buffer_id, NULL, 0, vq->vring_packed.num,
> +		       GFP_KERNEL);
> +	return id;
> +}
> +
> +static void free_id_packed(struct vring_virtqueue *vq, u16 id)
> +{
> +	idr_remove(&vq->buffer_id, id);
> +}
> +
>   static inline int virtqueue_add_packed(struct virtqueue *_vq,
>   				       struct scatterlist *sgs[],
>   				       unsigned int total_sg,
> @@ -750,47 +819,446 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
>   				       void *ctx,
>   				       gfp_t gfp)
>   {
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	struct vring_packed_desc *desc;
> +	struct scatterlist *sg;
> +	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
> +	__virtio16 uninitialized_var(head_flags), flags;
> +	u16 head, wrap_counter, id;
> +	bool indirect;
> +
> +	START_USE(vq);
> +
> +	BUG_ON(data == NULL);
> +	BUG_ON(ctx && vq->indirect);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return -EIO;
> +	}
> +
> +#ifdef DEBUG
> +	{
> +		ktime_t now = ktime_get();
> +
> +		/* No kick or get, with .1 second between?  Warn. */
> +		if (vq->last_add_time_valid)
> +			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> +					    > 100);
> +		vq->last_add_time = now;
> +		vq->last_add_time_valid = true;
> +	}
> +#endif
> +
> +	BUG_ON(total_sg == 0);
> +
> +	head = vq->next_avail_idx;
> +	wrap_counter = vq->wrap_counter;
> +
> +	if (virtqueue_use_indirect(_vq, total_sg))
> +		desc = alloc_indirect_packed(_vq, total_sg, gfp);
> +	else {
> +		desc = NULL;
> +		WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
> +	}
> +
> +	if (desc) {
> +		/* Use a single buffer which doesn't continue */
> +		indirect = true;
> +		/* Set up rest to use this indirect table. */
> +		i = 0;
> +		descs_used = 1;
> +	} else {
> +		indirect = false;
> +		desc = vq->vring_packed.desc;
> +		i = head;
> +		descs_used = total_sg;
> +	}
> +
> +	if (vq->vq.num_free < descs_used) {
> +		pr_debug("Can't add buf len %i - avail = %i\n",
> +			 descs_used, vq->vq.num_free);
> +		/* FIXME: for historical reasons, we force a notify here if
> +		 * there are outgoing parts to the buffer.  Presumably the
> +		 * host should service the ring ASAP. */
> +		if (out_sgs)
> +			vq->notify(&vq->vq);
> +		if (indirect)
> +			kfree(desc);
> +		END_USE(vq);
> +		return -ENOSPC;
> +	}
> +
> +	id = alloc_id_packed(vq);
> +
> +	for (n = 0; n < out_sgs + in_sgs; n++) {
> +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> +					       DMA_TO_DEVICE : DMA_FROM_DEVICE);
> +			if (vring_mapping_error(vq, addr))
> +				goto unmap_release;
> +
> +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> +					(n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
> +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
> +					VRING_DESC_F_USED(!vq->wrap_counter));
> +			if (!indirect && i == head)
> +				head_flags = flags;
> +			else
> +				desc[i].flags = flags;
> +
> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> +			i++;
> +			if (!indirect && i >= vq->vring_packed.num) {
> +				i = 0;
> +				vq->wrap_counter ^= 1;
> +			}
> +		}
> +	}
> +
> +	prev = (i > 0 ? i : vq->vring_packed.num) - 1;
> +	desc[prev].id = cpu_to_virtio16(_vq->vdev, id);
> +
> +	/* Last one doesn't continue. */
> +	if (total_sg == 1)
> +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> +	else
> +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev,
> +						~VRING_DESC_F_NEXT);
> +
> +	if (indirect) {
> +		/* Now that the indirect table is filled in, map it. */
> +		dma_addr_t addr = vring_map_single(
> +			vq, desc, total_sg * sizeof(struct vring_packed_desc),
> +			DMA_TO_DEVICE);
> +		if (vring_mapping_error(vq, addr))
> +			goto unmap_release;
> +
> +		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
> +					     VRING_DESC_F_AVAIL(wrap_counter) |
> +					     VRING_DESC_F_USED(!wrap_counter));
> +		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev,
> +								   addr);
> +		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
> +				total_sg * sizeof(struct vring_packed_desc));
> +		vq->vring_packed.desc[head].id = cpu_to_virtio16(_vq->vdev, id);
> +	}
> +
> +	/* We're using some buffers from the free list. */
> +	vq->vq.num_free -= descs_used;
> +
> +	/* Update free pointer */
> +	if (indirect) {
> +		n = head + 1;
> +		if (n >= vq->vring_packed.num) {
> +			n = 0;
> +			vq->wrap_counter ^= 1;
> +		}
> +		vq->next_avail_idx = n;
> +	} else
> +		vq->next_avail_idx = i;
> +
> +	/* Store token and indirect buffer state. */
> +	vq->desc_state[id].num = descs_used;
> +	vq->desc_state[id].data = data;
> +	if (indirect)
> +		vq->desc_state[id].indir_desc = desc;
> +	else
> +		vq->desc_state[id].indir_desc = ctx;
> +
> +	/* A driver MUST NOT make the first descriptor in the list
> +	 * available before all subsequent descriptors comprising
> +	 * the list are made available. */
> +	virtio_wmb(vq->weak_barriers);
> +	vq->vring_packed.desc[head].flags = head_flags;
> +	vq->num_added += descs_used;
> +
> +	pr_debug("Added buffer head %i to %p\n", head, vq);
> +	END_USE(vq);
> +
> +	return 0;
> +
> +unmap_release:
> +	err_idx = i;
> +	i = head;
> +
> +	for (n = 0; n < total_sg; n++) {
> +		if (i == err_idx)
> +			break;
> +		vring_unmap_one_packed(vq, &desc[i]);
> +		i++;
> +		if (!indirect && i >= vq->vring_packed.num)
> +			i = 0;
> +	}
> +
> +	vq->wrap_counter = wrap_counter;
> +
> +	if (indirect)
> +		kfree(desc);
> +
> +	free_id_packed(vq, id);
> +
> +	END_USE(vq);
>   	return -EIO;
>   }
>   
>   static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>   {
> -	return false;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 flags;
> +	bool needs_kick;
> +	u32 snapshot;
> +
> +	START_USE(vq);
> +	/* We need to expose the new flags value before checking notification
> +	 * suppressions. */
> +	virtio_mb(vq->weak_barriers);
> +
> +	snapshot = *(u32 *)vq->vring_packed.device;
> +	flags = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot >> 16)) & 0x3;
> +
> +#ifdef DEBUG
> +	if (vq->last_add_time_valid) {
> +		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> +					      vq->last_add_time)) > 100);
> +	}
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	needs_kick = (flags != VRING_EVENT_F_DISABLE);
> +	END_USE(vq);
> +	return needs_kick;
> +}
> +
> +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> +			      unsigned int id, void **ctx)
> +{
> +	struct vring_packed_desc *desc;
> +	unsigned int i, j;
> +
> +	/* Clear data ptr. */
> +	vq->desc_state[id].data = NULL;
> +
> +	i = head;
> +
> +	for (j = 0; j < vq->desc_state[id].num; j++) {
> +		desc = &vq->vring_packed.desc[i];
> +		vring_unmap_one_packed(vq, desc);

As mentioned in previous discussion, this probably won't work for the 
case of out of order completion since it depends on the information in 
the descriptor ring. We probably need to extend ctx to record such 
information.

Thanks

> +		i++;
> +		if (i >= vq->vring_packed.num)
> +			i = 0;
> +	}
> +
> +	vq->vq.num_free += vq->desc_state[id].num;
> +
> +	if (vq->indirect) {
> +		u32 len;
> +
> +		/* Free the indirect table, if any, now that it's unmapped. */
> +		desc = vq->desc_state[id].indir_desc;
> +		if (!desc)
> +			goto out;
> +
> +		len = virtio32_to_cpu(vq->vq.vdev,
> +				      vq->vring_packed.desc[head].len);
> +
> +		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
> +			vring_unmap_one_packed(vq, &desc[j]);
> +
> +		kfree(desc);
> +		vq->desc_state[id].indir_desc = NULL;
> +	} else if (ctx) {
> +		*ctx = vq->desc_state[id].indir_desc;
> +	}
> +
> +out:
> +	free_id_packed(vq, id);
>   }
>   
>   static inline bool more_used_packed(const struct vring_virtqueue *vq)
>   {
> -	return false;
> +	u16 last_used, flags;
> +	bool avail, used;
> +
> +	if (vq->vq.num_free == vq->vring_packed.num)
> +		return false;
> +
> +	last_used = vq->last_used_idx;
> +	flags = virtio16_to_cpu(vq->vq.vdev,
> +				vq->vring_packed.desc[last_used].flags);
> +	avail = flags & VRING_DESC_F_AVAIL(1);
> +	used = flags & VRING_DESC_F_USED(1);
> +
> +	return avail == used;
>   }
>   
>   static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
>   					  unsigned int *len,
>   					  void **ctx)
>   {
> -	return NULL;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 last_used, id;
> +	void *ret;
> +
> +	START_USE(vq);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	if (!more_used_packed(vq)) {
> +		pr_debug("No more buffers in queue\n");
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	/* Only get used elements after they have been exposed by host. */
> +	virtio_rmb(vq->weak_barriers);
> +
> +	last_used = vq->last_used_idx;
> +	id = virtio16_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
> +	*len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
> +
> +	if (unlikely(id >= vq->vring_packed.num)) {
> +		BAD_RING(vq, "id %u out of range\n", id);
> +		return NULL;
> +	}
> +	if (unlikely(!vq->desc_state[id].data)) {
> +		BAD_RING(vq, "id %u is not a head!\n", id);
> +		return NULL;
> +	}
> +
> +	vq->last_used_idx += vq->desc_state[id].num;
> +	if (vq->last_used_idx >= vq->vring_packed.num)
> +		vq->last_used_idx -= vq->vring_packed.num;
> +
> +	/* detach_buf_packed clears data, so grab it now. */
> +	ret = vq->desc_state[id].data;
> +	detach_buf_packed(vq, last_used, id, ctx);
> +
> +#ifdef DEBUG
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	END_USE(vq);
> +	return ret;
>   }
>   
>   static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
>   {
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	if (vq->event_flags_shadow != VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}
>   }
>   
>   static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
>   {
> -	return 0;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> +		virtio_wmb(vq->weak_barriers);
> +		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}
> +
> +	END_USE(vq);
> +	return vq->last_used_idx;
>   }
>   
>   static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
>   {
> -	return false;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	bool avail, used;
> +	u16 flags;
> +
> +	virtio_mb(vq->weak_barriers);
> +	flags = virtio16_to_cpu(vq->vq.vdev,
> +			vq->vring_packed.desc[last_used_idx].flags);
> +	avail = flags & VRING_DESC_F_AVAIL(1);
> +	used = flags & VRING_DESC_F_USED(1);
> +	return avail == used;
>   }
>   
>   static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
>   {
> -	return false;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> +		virtio_wmb(vq->weak_barriers);
> +		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}
> +
> +	if (more_used_packed(vq)) {
> +		END_USE(vq);
> +		return false;
> +	}
> +
> +	END_USE(vq);
> +	return true;
>   }
>   
>   static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
>   {
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 flags, head, id, i;
> +	unsigned int len;
> +	void *buf;
> +
> +	START_USE(vq);
> +
> +	/* Detach the used descriptors. */
> +	if (more_used_packed(vq)) {
> +		buf = virtqueue_get_buf_ctx_packed(_vq, &len, NULL);
> +		END_USE(vq);
> +		return buf;
> +	}
> +
> +	/* Detach the available descriptors. */
> +	for (i = vq->last_used_idx; i != vq->next_avail_idx;
> +			i = (i + 1) % vq->vring_packed.num) {
> +		flags = virtio16_to_cpu(vq->vq.vdev,
> +				vq->vring_packed.desc[i].flags);
> +		while (flags & VRING_DESC_F_NEXT) {
> +			i = (i + 1) % vq->vring_packed.num;
> +			flags = virtio16_to_cpu(vq->vq.vdev,
> +					vq->vring_packed.desc[i].flags);
> +		}
> +		id = virtio16_to_cpu(_vq->vdev, vq->vring_packed.desc[i].id);
> +		if (!vq->desc_state[id].data)
> +			continue;
> +
> +		len = vq->desc_state[id].num - 1;
> +		head = (i < len ? i + vq->vring_packed.num : i) - len;
> +
> +		/* detach_buf clears data, so grab it now. */
> +		buf = vq->desc_state[id].data;
> +		detach_buf_packed(vq, head, id, NULL);
> +		END_USE(vq);
> +		return buf;
> +	}
> +	/* That should have freed everything. */
> +	BUG_ON(vq->vq.num_free != vq->vring_packed.num);
> +
> +	END_USE(vq);
>   	return NULL;
>   }
>   
> @@ -1198,6 +1666,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>   		vq->next_avail_idx = 0;
>   		vq->wrap_counter = 1;
>   		vq->event_flags_shadow = 0;
> +		idr_init(&vq->buffer_id);
>   	} else {
>   		vq->vring = vring.vring_split;
>   		vq->avail_flags_shadow = 0;
> @@ -1384,6 +1853,8 @@ void vring_del_virtqueue(struct virtqueue *_vq)
>   					      (void *)vq->vring.desc,
>   				 vq->queue_dma_addr);
>   	}
> +	if (vq->packed)
> +		idr_destroy(&vq->buffer_id);
>   	list_del(&_vq->list);
>   	kfree(vq);
>   }

^ permalink raw reply

* Re: [PATCH 12/14] net: sched: retry action check-insert on concurrent modification
From: Vlad Buslov @ 2018-05-16 11:55 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <20180516095953.GI1972@nanopsycho>


On Wed 16 May 2018 at 09:59, Jiri Pirko <jiri@resnulli.us> wrote:
> Mon, May 14, 2018 at 04:27:13PM CEST, vladbu@mellanox.com wrote:
>>Retry check-insert sequence in action init functions if action with same
>>index was inserted concurrently.
>>
>>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>>---
>> net/sched/act_bpf.c        | 8 +++++++-
>> net/sched/act_connmark.c   | 8 +++++++-
>> net/sched/act_csum.c       | 8 +++++++-
>> net/sched/act_gact.c       | 8 +++++++-
>> net/sched/act_ife.c        | 8 +++++++-
>> net/sched/act_ipt.c        | 8 +++++++-
>> net/sched/act_mirred.c     | 8 +++++++-
>> net/sched/act_nat.c        | 8 +++++++-
>> net/sched/act_pedit.c      | 8 +++++++-
>> net/sched/act_police.c     | 9 ++++++++-
>> net/sched/act_sample.c     | 8 +++++++-
>> net/sched/act_simple.c     | 9 ++++++++-
>> net/sched/act_skbedit.c    | 8 +++++++-
>> net/sched/act_skbmod.c     | 8 +++++++-
>> net/sched/act_tunnel_key.c | 9 ++++++++-
>> net/sched/act_vlan.c       | 9 ++++++++-
>> 16 files changed, 116 insertions(+), 16 deletions(-)
>>
>>diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
>>index 5554bf7..7e20fdc 100644
>>--- a/net/sched/act_bpf.c
>>+++ b/net/sched/act_bpf.c
>>@@ -299,10 +299,16 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
>> 
>> 	parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
>> 
>>+replay:
>> 	if (!tcf_idr_check(tn, parm->index, act, bind)) {
>> 		ret = tcf_idr_create(tn, parm->index, est, act,
>> 				     &act_bpf_ops, bind, true);
>>-		if (ret < 0)
>>+		/* Action with specified index was created concurrently.
>>+		 * Check again.
>>+		 */
>>+		if (parm->index && ret == -ENOSPC)
>>+			goto replay;
>>+		else if (ret)
>
> Hmm, looks like you are doing the same/very similar thing in every act
> code. I think it would make sense to introduce a helper function for
> this purpose.

This code uses goto so it can't be easily refactored into standalone
function. Could you specify which part of this code you suggest to
extract?

>
> [...]

^ permalink raw reply

* [PATCH ghak81 V3 0/3] audit: group task params
From: Richard Guy Briggs @ 2018-05-16 11:55 UTC (permalink / raw)
  To: Linux-Audit Mailing List, LKML,
	Linux NetDev Upstream Mailing List, Netfilter Devel List,
	Linux Security Module list, Integrity Measurement Architecture,
	SElinux list
  Cc: Richard Guy Briggs, David Howells, Ingo Molnar

Group the audit parameters for each task into one structure.
In particular, remove the loginuid and sessionid values and the audit
context pointer from the task structure, replacing them with an audit
task information structure to contain them.  Use access functions to
access audit values.

Use dynamic allocation of the audit task information structure employing
kmem_cache.  Static allocation has the limitation that future audit task
information structure changes would cause a visible change to the rest
of the kernel, whereas dynamic allocation would mostly hide any future
changes.

Passes audit-testsuite.

Changelog:
v3
- drop patches 2, 3, 4 already merged.
- fix for previous v2 patch 3 (seccomp get audit_context)
- dynamic audit_task_info allocation from kmem_cache
- fix assignment in if statement v2 patch 1 (normalize loginuid read)
- fix a number of merge conflicts/checkpatch
v2
- p2/5: add audit header to init/init_task.c to quiet kbuildbot
- audit_signal_info(): fetch loginuid once
- remove task_struct from audit_context() param list
- remove extra task_struct local vars
- do nothing on request to set audit context when audit is disabled

Richard Guy Briggs (3):
  audit: use new audit_context access funciton for
    seccomp_actions_logged
  audit: normalize loginuid read access
  audit: collect audit task parameters

 include/linux/audit.h | 34 ++++++++++++++++-------
 include/linux/sched.h |  5 +---
 init/init_task.c      |  3 +-
 init/main.c           |  2 ++
 kernel/auditsc.c      | 77 ++++++++++++++++++++++++++++++++++++++-------------
 kernel/fork.c         |  2 +-
 6 files changed, 87 insertions(+), 36 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH ghak81 V3 1/3] audit: use new audit_context access funciton for seccomp_actions_logged
From: Richard Guy Briggs @ 2018-05-16 11:55 UTC (permalink / raw)
  To: Linux-Audit Mailing List, LKML,
	Linux NetDev Upstream Mailing List, Netfilter Devel List,
	Linux Security Module list, Integrity Measurement Architecture,
	SElinux list
  Cc: Eric Paris, Paul Moore, Steve Grubb, Ingo Molnar, David Howells,
	Richard Guy Briggs
In-Reply-To: <cover.1526430313.git.rgb@redhat.com>

On the rebase of the following commit on the new seccomp actions_logged
function, one audit_context access was missed.

commit cdfb6b341f0f2409aba24b84f3b4b2bba50be5c5
("audit: use inline function to get audit context")

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cbab0da..f3d3dc6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2497,7 +2497,7 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names,
 	if (!audit_enabled)
 		return;
 
-	ab = audit_log_start(current->audit_context, GFP_KERNEL,
+	ab = audit_log_start(audit_context(), GFP_KERNEL,
 			     AUDIT_CONFIG_CHANGE);
 	if (unlikely(!ab))
 		return;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH ghak81 V3 2/3] audit: normalize loginuid read access
From: Richard Guy Briggs @ 2018-05-16 11:55 UTC (permalink / raw)
  To: Linux-Audit Mailing List, LKML,
	Linux NetDev Upstream Mailing List, Netfilter Devel List,
	Linux Security Module list, Integrity Measurement Architecture,
	SElinux list
  Cc: Richard Guy Briggs, David Howells, Ingo Molnar
In-Reply-To: <cover.1526430313.git.rgb@redhat.com>

Recognizing that the loginuid is an internal audit value, use an access
function to retrieve the audit loginuid value for the task rather than
reaching directly into the task struct to get it.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 kernel/auditsc.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f3d3dc6..ef3e189 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -374,7 +374,7 @@ static int audit_field_compare(struct task_struct *tsk,
 	case AUDIT_COMPARE_EGID_TO_OBJ_GID:
 		return audit_compare_gid(cred->egid, name, f, ctx);
 	case AUDIT_COMPARE_AUID_TO_OBJ_UID:
-		return audit_compare_uid(tsk->loginuid, name, f, ctx);
+		return audit_compare_uid(audit_get_loginuid(tsk), name, f, ctx);
 	case AUDIT_COMPARE_SUID_TO_OBJ_UID:
 		return audit_compare_uid(cred->suid, name, f, ctx);
 	case AUDIT_COMPARE_SGID_TO_OBJ_GID:
@@ -385,7 +385,8 @@ static int audit_field_compare(struct task_struct *tsk,
 		return audit_compare_gid(cred->fsgid, name, f, ctx);
 	/* uid comparisons */
 	case AUDIT_COMPARE_UID_TO_AUID:
-		return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
+		return audit_uid_comparator(cred->uid, f->op,
+					    audit_get_loginuid(tsk));
 	case AUDIT_COMPARE_UID_TO_EUID:
 		return audit_uid_comparator(cred->uid, f->op, cred->euid);
 	case AUDIT_COMPARE_UID_TO_SUID:
@@ -394,11 +395,14 @@ static int audit_field_compare(struct task_struct *tsk,
 		return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
 	/* auid comparisons */
 	case AUDIT_COMPARE_AUID_TO_EUID:
-		return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
+		return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+					    cred->euid);
 	case AUDIT_COMPARE_AUID_TO_SUID:
-		return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
+		return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+					    cred->suid);
 	case AUDIT_COMPARE_AUID_TO_FSUID:
-		return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
+		return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+					    cred->fsuid);
 	/* euid comparisons */
 	case AUDIT_COMPARE_EUID_TO_SUID:
 		return audit_uid_comparator(cred->euid, f->op, cred->suid);
@@ -611,7 +615,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 				result = match_tree_refs(ctx, rule->tree);
 			break;
 		case AUDIT_LOGINUID:
-			result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
+			result = audit_uid_comparator(audit_get_loginuid(tsk),
+						      f->op, f->uid);
 			break;
 		case AUDIT_LOGINUID_SET:
 			result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
@@ -2278,14 +2283,15 @@ int audit_signal_info(int sig, struct task_struct *t)
 {
 	struct audit_aux_data_pids *axp;
 	struct audit_context *ctx = audit_context();
-	kuid_t uid = current_uid(), t_uid = task_uid(t);
+	kuid_t uid = current_uid(), auid, t_uid = task_uid(t);
 
 	if (auditd_test_task(t) &&
 	    (sig == SIGTERM || sig == SIGHUP ||
 	     sig == SIGUSR1 || sig == SIGUSR2)) {
 		audit_sig_pid = task_tgid_nr(current);
-		if (uid_valid(current->loginuid))
-			audit_sig_uid = current->loginuid;
+		auid = audit_get_loginuid(current);
+		if (uid_valid(auid))
+			audit_sig_uid = auid;
 		else
 			audit_sig_uid = uid;
 		security_task_getsecid(current, &audit_sig_sid);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH ghak81 V3 3/3] audit: collect audit task parameters
From: Richard Guy Briggs @ 2018-05-16 11:55 UTC (permalink / raw)
  To: Linux-Audit Mailing List, LKML,
	Linux NetDev Upstream Mailing List, Netfilter Devel List,
	Linux Security Module list, Integrity Measurement Architecture,
	SElinux list
  Cc: Richard Guy Briggs, David Howells, Ingo Molnar
In-Reply-To: <cover.1526430313.git.rgb@redhat.com>

The audit-related parameters in struct task_struct should ideally be
collected together and accessed through a standard audit API.

Collect the existing loginuid, sessionid and audit_context together in a
new struct audit_task_info called "audit" in struct task_struct.

Use kmem_cache to manage this pool of memory.
Un-inline audit_free() to be able to always recover that memory.

See: https://github.com/linux-audit/audit-kernel/issues/81

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 include/linux/audit.h | 34 ++++++++++++++++++++++++----------
 include/linux/sched.h |  5 +----
 init/init_task.c      |  3 +--
 init/main.c           |  2 ++
 kernel/auditsc.c      | 51 ++++++++++++++++++++++++++++++++++++++++++---------
 kernel/fork.c         |  2 +-
 6 files changed, 71 insertions(+), 26 deletions(-)

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 69c7847..4f824c4 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -216,8 +216,15 @@ static inline void audit_log_task_info(struct audit_buffer *ab,
 
 /* These are defined in auditsc.c */
 				/* Public API */
+struct audit_task_info {
+	kuid_t			loginuid;
+	unsigned int		sessionid;
+	struct audit_context	*ctx;
+};
+extern struct audit_task_info init_struct_audit;
+extern void __init audit_task_init(void);
 extern int  audit_alloc(struct task_struct *task);
-extern void __audit_free(struct task_struct *task);
+extern void audit_free(struct task_struct *task);
 extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
 				  unsigned long a2, unsigned long a3);
 extern void __audit_syscall_exit(int ret_success, long ret_value);
@@ -239,12 +246,15 @@ extern void audit_seccomp_actions_logged(const char *names,
 
 static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
 {
-	task->audit_context = ctx;
+	task->audit->ctx = ctx;
 }
 
 static inline struct audit_context *audit_context(void)
 {
-	return current->audit_context;
+	if (current->audit)
+		return current->audit->ctx;
+	else
+		return NULL;
 }
 
 static inline bool audit_dummy_context(void)
@@ -252,11 +262,7 @@ static inline bool audit_dummy_context(void)
 	void *p = audit_context();
 	return !p || *(int *)p;
 }
-static inline void audit_free(struct task_struct *task)
-{
-	if (unlikely(task->audit_context))
-		__audit_free(task);
-}
+
 static inline void audit_syscall_entry(int major, unsigned long a0,
 				       unsigned long a1, unsigned long a2,
 				       unsigned long a3)
@@ -328,12 +334,18 @@ extern int auditsc_get_stamp(struct audit_context *ctx,
 
 static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
 {
-	return tsk->loginuid;
+	if (tsk->audit)
+		return tsk->audit->loginuid;
+	else
+		return INVALID_UID;
 }
 
 static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 {
-	return tsk->sessionid;
+	if (tsk->audit)
+		return tsk->audit->sessionid;
+	else
+		return AUDIT_SID_UNSET;
 }
 
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
@@ -458,6 +470,8 @@ static inline void audit_fanotify(unsigned int response)
 extern int audit_n_rules;
 extern int audit_signals;
 #else /* CONFIG_AUDITSYSCALL */
+static inline void __init audit_task_init(void)
+{ }
 static inline int audit_alloc(struct task_struct *task)
 {
 	return 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b3d697f..6a5db0e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -29,7 +29,6 @@
 #include <linux/task_io_accounting.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
-struct audit_context;
 struct backing_dev_info;
 struct bio_list;
 struct blk_plug;
@@ -832,10 +831,8 @@ struct task_struct {
 
 	struct callback_head		*task_works;
 
-	struct audit_context		*audit_context;
 #ifdef CONFIG_AUDITSYSCALL
-	kuid_t				loginuid;
-	unsigned int			sessionid;
+	struct audit_task_info		*audit;
 #endif
 	struct seccomp			seccomp;
 
diff --git a/init/init_task.c b/init/init_task.c
index 74f60ba..4058840 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -119,8 +119,7 @@ struct task_struct init_task
 	.thread_group	= LIST_HEAD_INIT(init_task.thread_group),
 	.thread_node	= LIST_HEAD_INIT(init_signals.thread_head),
 #ifdef CONFIG_AUDITSYSCALL
-	.loginuid	= INVALID_UID,
-	.sessionid	= AUDIT_SID_UNSET,
+	.audit		= &init_struct_audit,
 #endif
 #ifdef CONFIG_PERF_EVENTS
 	.perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex),
diff --git a/init/main.c b/init/main.c
index b795aa3..7ca3dfb 100644
--- a/init/main.c
+++ b/init/main.c
@@ -91,6 +91,7 @@
 #include <linux/cache.h>
 #include <linux/rodata_test.h>
 #include <linux/jump_label.h>
+#include <linux/audit.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -720,6 +721,7 @@ asmlinkage __visible void __init start_kernel(void)
 	nsfs_init();
 	cpuset_init();
 	cgroup_init();
+	audit_task_init();
 	taskstats_init_early();
 	delayacct_init();
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ef3e189..4b1138a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -841,7 +841,7 @@ static inline struct audit_context *audit_take_context(struct task_struct *tsk,
 						      int return_valid,
 						      long return_code)
 {
-	struct audit_context *context = tsk->audit_context;
+	struct audit_context *context = tsk->audit->ctx;
 
 	if (!context)
 		return NULL;
@@ -926,6 +926,15 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
 	return context;
 }
 
+static struct kmem_cache *audit_task_cache;
+
+void __init audit_task_init(void)
+{
+	audit_task_cache = kmem_cache_create("audit_task",
+					     sizeof(struct audit_task_info),
+					     0, SLAB_PANIC, NULL);
+}
+
 /**
  * audit_alloc - allocate an audit context block for a task
  * @tsk: task
@@ -940,17 +949,28 @@ int audit_alloc(struct task_struct *tsk)
 	struct audit_context *context;
 	enum audit_state     state;
 	char *key = NULL;
+	struct audit_task_info *info;
+
+	info = kmem_cache_zalloc(audit_task_cache, GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+	info->loginuid = audit_get_loginuid(current);
+	info->sessionid = audit_get_sessionid(current);
+	tsk->audit = info;
 
 	if (likely(!audit_ever_enabled))
 		return 0; /* Return if not auditing. */
 
 	state = audit_filter_task(tsk, &key);
 	if (state == AUDIT_DISABLED) {
+		audit_set_context(tsk, NULL);
 		clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
 		return 0;
 	}
 
 	if (!(context = audit_alloc_context(state))) {
+		tsk->audit = NULL;
+		kmem_cache_free(audit_task_cache, info);
 		kfree(key);
 		audit_log_lost("out of memory in audit_alloc");
 		return -ENOMEM;
@@ -962,6 +982,12 @@ int audit_alloc(struct task_struct *tsk)
 	return 0;
 }
 
+struct audit_task_info init_struct_audit = {
+	.loginuid = INVALID_UID,
+	.sessionid = AUDIT_SID_UNSET,
+	.ctx = NULL,
+};
+
 static inline void audit_free_context(struct audit_context *context)
 {
 	audit_free_names(context);
@@ -1469,26 +1495,33 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 }
 
 /**
- * __audit_free - free a per-task audit context
+ * audit_free - free a per-task audit context
  * @tsk: task whose audit context block to free
  *
  * Called from copy_process and do_exit
  */
-void __audit_free(struct task_struct *tsk)
+void audit_free(struct task_struct *tsk)
 {
 	struct audit_context *context;
+	struct audit_task_info *info;
 
 	context = audit_take_context(tsk, 0, 0);
-	if (!context)
-		return;
-
 	/* Check for system calls that do not go through the exit
 	 * function (e.g., exit_group), then free context block.
 	 * We use GFP_ATOMIC here because we might be doing this
 	 * in the context of the idle thread */
 	/* that can happen only if we are called from do_exit() */
-	if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
+	if (context && context->in_syscall &&
+	    context->current_state == AUDIT_RECORD_CONTEXT)
 		audit_log_exit(context, tsk);
+	/* Freeing the audit_task_info struct must be performed after
+	 * audit_log_exit() due to need for loginuid and sessionid.
+	 */
+	info = tsk->audit;
+	tsk->audit = NULL;
+	kmem_cache_free(audit_task_cache, info);
+	if (!context)
+		return;
 	if (!list_empty(&context->killed_trees))
 		audit_kill_trees(&context->killed_trees);
 
@@ -2071,8 +2104,8 @@ int audit_set_loginuid(kuid_t loginuid)
 			sessionid = (unsigned int)atomic_inc_return(&session_id);
 	}
 
-	task->sessionid = sessionid;
-	task->loginuid = loginuid;
+	task->audit->sessionid = sessionid;
+	task->audit->loginuid = loginuid;
 out:
 	audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
 	return rc;
diff --git a/kernel/fork.c b/kernel/fork.c
index cd18448..92ab849 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1713,7 +1713,7 @@ static __latent_entropy struct task_struct *copy_process(
 	p->start_time = ktime_get_ns();
 	p->real_start_time = ktime_get_boot_ns();
 	p->io_context = NULL;
-	audit_set_context(p, NULL);
+	p->audit = NULL;
 	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_dup(p->mempolicy);
-- 
1.8.3.1

^ permalink raw reply related

* RE: Hangs in r8152 connected to power management in kernels at least up v4.17-rc4
From: Hayes Wang @ 2018-05-16 12:07 UTC (permalink / raw)
  To: Oliver Neukum; +Cc: netdev@vger.kernel.org, jslaby@suse.com
In-Reply-To: <1526465391.25281.7.camel@suse.com>

Oliver Neukum [mailto:oneukum@suse.com]
> Sent: Wednesday, May 16, 2018 6:10 PM
[...]
> > Besides, I find a similar issue as following.
> > https://www.spinics.net/lists/netdev/msg493512.html
> 
> Well, if we have an imbalance in NAPI it should strike whereever
> it is used, not just in suspend(). Is there debugging for NAPI
> we could activate?

No. The driver doesn't embed such debugging about it.
And I don't find an imbalance between napi_disable() and napi_enable().


Best Regards,
Hayes




^ permalink raw reply

* Re: [PATCH net-next] ath10k: Remove useless test before clk_disable_unprepare
From: YueHaibing @ 2018-05-16 12:10 UTC (permalink / raw)
  To: Kalle Valo; +Cc: davem, netdev, linux-wireless
In-Reply-To: <87d0xvyggc.fsf@purkki.adurom.net>

On 2018/5/16 19:42, Kalle Valo wrote:
> YueHaibing <yuehaibing@huawei.com> writes:
> 
>> clk_disable_unprepare() already checks that the clock pointer is valid.
>> No need to test it before calling it.
>>
>> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
> 
> Just a note that ath10k patches are applied to my ath.git tree, not to
> net-next. So adding net-next to the subject is wrong, but no need to
> resend just because of that.
> 

OK, got it, thank you

^ permalink raw reply

* Re: INFO: rcu detected stall in sctp_packet_transmit
From: Dmitry Vyukov @ 2018-05-16 12:12 UTC (permalink / raw)
  To: Xin Long
  Cc: syzbot, davem, LKML, linux-sctp, Marcelo Ricardo Leitner,
	network dev, Neil Horman, syzkaller-bugs, Vlad Yasevich
In-Reply-To: <CADvbK_ffAG5GC-UBX+615P9nz43aBHsy3s0rH3NYs+FhHEwT9w@mail.gmail.com>

On Wed, May 16, 2018 at 1:02 PM, Xin Long <lucien.xin@gmail.com> wrote:
>>> <syzbot+ff0b569fb5111dcd1a36@syzkaller.appspotmail.com> wrote:
>>>> Hello,
>>>>
>>>> syzbot found the following crash on:
>>>>
>>>> HEAD commit:    961423f9fcbc Merge branch 'sctp-Introduce-sctp_flush_ctx'
>>>> git tree:       net-next
>>>> console output: https://syzkaller.appspot.com/x/log.txt?x=1366aea7800000
>>>> kernel config:  https://syzkaller.appspot.com/x/.config?x=51fb0a6913f757db
>>>> dashboard link: https://syzkaller.appspot.com/bug?extid=ff0b569fb5111dcd1a36
>>>> compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
>>>>
>>>> Unfortunately, I don't have any reproducer for this crash yet.
>>>>
>>>> IMPORTANT: if you fix the bug, please add the following tag to the commit:
>>>> Reported-by: syzbot+ff0b569fb5111dcd1a36@syzkaller.appspotmail.com
>>>>
>>>> INFO: rcu_sched self-detected stall on CPU
>>>>         0-....: (1 GPs behind) idle=dae/1/4611686018427387908
>>>> softirq=93090/93091 fqs=30902
>>>>          (t=125000 jiffies g=51107 c=51106 q=972)
>>>> NMI backtrace for cpu 0
>>>> CPU: 0 PID: 24668 Comm: syz-executor6 Not tainted 4.17.0-rc4+ #44
>>>> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
>>>> Google 01/01/2011
>>>> Call Trace:
>>>>  <IRQ>
>>>>  __dump_stack lib/dump_stack.c:77 [inline]
>>>>  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
>>>>  nmi_cpu_backtrace.cold.4+0x19/0xce lib/nmi_backtrace.c:103
>>>>  nmi_trigger_cpumask_backtrace+0x151/0x192 lib/nmi_backtrace.c:62
>>>>  arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
>>>>  trigger_single_cpu_backtrace include/linux/nmi.h:156 [inline]
>>>>  rcu_dump_cpu_stacks+0x175/0x1c2 kernel/rcu/tree.c:1376
>>>>  print_cpu_stall kernel/rcu/tree.c:1525 [inline]
>>>>  check_cpu_stall.isra.61.cold.80+0x36c/0x59a kernel/rcu/tree.c:1593
>>>>  __rcu_pending kernel/rcu/tree.c:3356 [inline]
>>>>  rcu_pending kernel/rcu/tree.c:3401 [inline]
>>>>  rcu_check_callbacks+0x21b/0xad0 kernel/rcu/tree.c:2763
>>>>  update_process_times+0x2d/0x70 kernel/time/timer.c:1636
>>>>  tick_sched_handle+0x9f/0x180 kernel/time/tick-sched.c:164
>>>>  tick_sched_timer+0x45/0x130 kernel/time/tick-sched.c:1274
>>>>  __run_hrtimer kernel/time/hrtimer.c:1398 [inline]
>>>>  __hrtimer_run_queues+0x3e3/0x10a0 kernel/time/hrtimer.c:1460
>>>>  hrtimer_interrupt+0x2f3/0x750 kernel/time/hrtimer.c:1518
>>>>  local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1025 [inline]
>>>>  smp_apic_timer_interrupt+0x15d/0x710 arch/x86/kernel/apic/apic.c:1050
>>>>  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863
>>>> RIP: 0010:sctp_v6_xmit+0x259/0x6b0 net/sctp/ipv6.c:219
>>>> RSP: 0018:ffff8801dae068e8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
>>>> RAX: 0000000000000007 RBX: ffff8801bb7ec800 RCX: ffffffff86f1b345
>>>> RDX: 0000000000000000 RSI: ffffffff86f1b381 RDI: ffff8801b73d97c4
>>>> RBP: ffff8801dae06988 R08: ffff88019505c300 R09: ffffed003b5c46c2
>>>> R10: ffffed003b5c46c2 R11: ffff8801dae23613 R12: ffff88011fd57300
>>>> R13: ffff8801bb7ecec8 R14: 0000000000000029 R15: 0000000000000002
>>>>  sctp_packet_transmit+0x26f6/0x3ba0 net/sctp/output.c:642
>>>>  sctp_outq_flush_transports net/sctp/outqueue.c:1164 [inline]
>>>>  sctp_outq_flush+0x5f5/0x3430 net/sctp/outqueue.c:1212
>>>>  sctp_outq_uncork+0x6a/0x80 net/sctp/outqueue.c:776
>>>>  sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
>>>>  sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
>>>>  sctp_do_sm+0x596/0x7160 net/sctp/sm_sideeffect.c:1191
>>>>  sctp_generate_heartbeat_event+0x218/0x450 net/sctp/sm_sideeffect.c:406
>>> Shocks, this timer event again. Can we try to minimize the repo.syz and
>>> get a short script, not neccessary to reproduce the issue 100%. we need
>>> to know what it was doing when this happened.
>>>
>>> Thanks.
>>
>> It's possible to reply the whole log from console output following
>> these instructions:
>> https://github.com/google/syzkaller/blob/master/docs/executing_syzkaller_programs.md
> Thanks, it's running now.
> Usually how long will it take to finish running this 5000-line log?

If you run with -repeat=0 then it will run infinitely repeating the
log again and again. If you see:

parsed 1000 programs
...
executed 5000 programs

then it looped 5 times already. You can run with -repeat=10.

syzbot has tried replaying the log, but for some reason it wasn't able
to reproduce the crash (maybe accumulated state, or maybe it crashed
in a different way). You can also try logs from other sctp hangs.

^ permalink raw reply

* Re: [RFC v4 4/5] virtio_ring: add event idx support in packed ring
From: Jason Wang @ 2018-05-16 12:17 UTC (permalink / raw)
  To: Tiwei Bie, mst, virtualization, linux-kernel, netdev; +Cc: wexu, jfreimann
In-Reply-To: <20180516083737.26504-5-tiwei.bie@intel.com>



On 2018年05月16日 16:37, Tiwei Bie wrote:
> This commit introduces the event idx support in
> packed ring.
>
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>   drivers/virtio/virtio_ring.c | 75 +++++++++++++++++++++++++++++++++---
>   1 file changed, 70 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index c6c5deb0e3ae..de3839f3621a 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -1006,7 +1006,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
>   static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> -	u16 flags;
> +	u16 new, old, off_wrap, flags, wrap_counter, event_idx;
>   	bool needs_kick;
>   	u32 snapshot;
>   
> @@ -1015,9 +1015,19 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>   	 * suppressions. */
>   	virtio_mb(vq->weak_barriers);
>   
> +	old = vq->next_avail_idx - vq->num_added;
> +	new = vq->next_avail_idx;
> +	vq->num_added = 0;
> +
>   	snapshot = *(u32 *)vq->vring_packed.device;
> +	off_wrap = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot & 0xffff));
>   	flags = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot >> 16)) & 0x3;
>   
> +	wrap_counter = off_wrap >> 15;
> +	event_idx = off_wrap & ~(1<<15);
> +	if (wrap_counter != vq->wrap_counter)
> +		event_idx -= vq->vring_packed.num;
> +
>   #ifdef DEBUG
>   	if (vq->last_add_time_valid) {
>   		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> @@ -1026,7 +1036,10 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>   	vq->last_add_time_valid = false;
>   #endif
>   
> -	needs_kick = (flags != VRING_EVENT_F_DISABLE);
> +	if (flags == VRING_EVENT_F_DESC)
> +		needs_kick = vring_need_event(event_idx, new, old);
> +	else
> +		needs_kick = (flags != VRING_EVENT_F_DISABLE);
>   	END_USE(vq);
>   	return needs_kick;
>   }
> @@ -1098,7 +1111,7 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
>   					  void **ctx)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> -	u16 last_used, id;
> +	u16 wrap_counter, last_used, id;
>   	void *ret;
>   
>   	START_USE(vq);
> @@ -1138,6 +1151,19 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
>   	ret = vq->desc_state[id].data;
>   	detach_buf_packed(vq, last_used, id, ctx);
>   
> +	wrap_counter = vq->wrap_counter;
> +	if (vq->last_used_idx > vq->next_avail_idx)
> +		wrap_counter ^= 1;
> +
> +	/* If we expect an interrupt for the next entry, tell host
> +	 * by writing event index and flush out the write before
> +	 * the read in the next get_buf call. */
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
> +		virtio_store_mb(vq->weak_barriers,
> +				&vq->vring_packed.driver->off_wrap,
> +				cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
> +						(wrap_counter << 15)));
> +
>   #ifdef DEBUG
>   	vq->last_add_time_valid = false;
>   #endif
> @@ -1160,15 +1186,27 @@ static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
>   static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 wrap_counter;
>   
>   	START_USE(vq);
>   
>   	/* We optimistically turn back on interrupts, then check if there was
>   	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always update the event index to keep code simple. */
> +
> +	wrap_counter = vq->wrap_counter;
> +	if (vq->last_used_idx > vq->next_avail_idx)

Should this be ">=" consider rx refill may try to completely fill the ring?

> +		wrap_counter ^= 1;
> +
> +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> +			vq->last_used_idx | (wrap_counter << 15));
>   
>   	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
>   		virtio_wmb(vq->weak_barriers);
> -		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> +						     VRING_EVENT_F_ENABLE;
>   		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
>   							vq->event_flags_shadow);
>   	}
> @@ -1194,15 +1232,40 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
>   static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 bufs, used_idx, wrap_counter;
>   
>   	START_USE(vq);
>   
>   	/* We optimistically turn back on interrupts, then check if there was
>   	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always update the event index to keep code simple. */
> +
> +	/* TODO: tune this threshold */
> +	if (vq->next_avail_idx < vq->last_used_idx)
> +		bufs = (vq->vring_packed.num + vq->next_avail_idx -
> +				vq->last_used_idx) * 3 / 4;
> +	else
> +		bufs = (vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
> +
> +	wrap_counter = vq->wrap_counter;
> +	if (vq->last_used_idx > vq->next_avail_idx)
> +		wrap_counter ^= 1;
> +
> +	used_idx = vq->last_used_idx + bufs;
> +	if (used_idx >= vq->vring_packed.num) {
> +		used_idx -= vq->vring_packed.num;
> +		wrap_counter ^= 1;
> +	}

Looks correct but maybe it's better to add some comments for such logic.

> +
> +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> +			used_idx | (wrap_counter << 15));
>   
>   	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
>   		virtio_wmb(vq->weak_barriers);
> -		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> +						     VRING_EVENT_F_ENABLE;
>   		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
>   							vq->event_flags_shadow);
>   	}
> @@ -1869,8 +1932,10 @@ void vring_transport_features(struct virtio_device *vdev)
>   		switch (i) {
>   		case VIRTIO_RING_F_INDIRECT_DESC:
>   			break;
> +#if 0
>   		case VIRTIO_RING_F_EVENT_IDX:
>   			break;
> +#endif

Maybe it's time to remove this #if 0.

Thanks

>   		case VIRTIO_F_VERSION_1:
>   			break;
>   		case VIRTIO_F_IOMMU_PLATFORM:

^ permalink raw reply

* Re: [PATCH net-next] net:sched: add gkprio scheduler
From: Jamal Hadi Salim @ 2018-05-16 12:18 UTC (permalink / raw)
  To: Michel Machado, Cong Wang
  Cc: Nishanth Devarajan, Jiri Pirko, David Miller,
	Linux Kernel Network Developers, Cody Doucette
In-Reply-To: <d98eacbb-6fec-c8cf-dd88-92081c550e2f@digirati.com.br>

Sorry I dropped this.

On 14/05/18 10:08 AM, Michel Machado wrote:
>> On 09/05/18 01:37 PM, Michel Machado wrote:

> 
> A simplified description of what DSprio is meant to do is as follows: 
> when a link is overloaded at a router, DSprio makes this router drop the 
> packets of lower priority.

Makes sense. Any priority based work-conserving scheduler will work
fine. The only small difference you have with prio qdisc is you
drop an enqueued low prio packet to make room for a new higher prio
queue. Can you look at pfifo_head_drop qdisc to see if it suffices? It
may not be: In such a case, I would suggest a hybrid between
pfifo_head_drop and pfifo_fast for the new qdisc.
[Cong has suggested to write a classful qdisc but it may be sufficient
to just replicate what pfifo_fast does since it tracks virtual queues]

> These priorities are assigned by Gatekeeper 
> in such a way that well behaving sources are favored (Theorem 4.1 of the 
> Portcullis paper pointed out in my previous email). Moreover, attackers 
> cannot do much better than well behaving sources (Theorem 4.2). This 
> description is simplified because it omits many other components of 
> Gatekeeper that affects the packets that goes to DSprio.
> 

I am sorry - I have no access to this document so dont know what these
theorems are. I understand your requirements. 1) You are looking to use
priority identifiers to select queues. 2) You want to prioritize
treatment of favorably tagged packets. The enqueueing will drop
lower priority packets to make space for higher priority under
congestion. Did i miss anything?
For #1 my suggestion is to use skbmod to set the priority tag.
For #2 if you didnt have to drop at enqueue time you could have
used any of the existing priority favoring qdiscs which recognize
skb->priority. Otherwise as i suggested above look at
pfifo_fast/pfifo_head_drop

> Like you, I'm all in for less code. If someone can instruct us on how to 
> accomplish the same thing that our patch is doing, we would be happy to 
> withdraw it. We have submitted this patch because we want to lower the 
> bar to deploy Gatekeeper as much as possible, and requiring network 
> operators willing to deploy Gatekeeper to keep patching the kernel is an 
> operational burden.
> 

So I would suggest you keep this real simple - especially if you want to
go backwards in kernels. For existing kernels you can implement the
basic policies of what you need by using prio qdisc with a combination
of a classifier that knows how to match on dsfield (trivial to do with
u32) and skbedit action to tag the skb->priority. Then let prio qdisc
use the priomap to select the queue.
If you must drop enqueued low prio packets then you may need the new
qdisc. And to optimize, you will need the skbmod change.
I really think it is a bad idea to encapsulate the classifier in the
qdisc.

>> Look at the priomap or prio2band arrangement on prio qdisc
>> or pfifo_fast qdisc. You take an skbprio as an index into the array
>> and retrieve a queue to enqueue to. The size of the array is 16.
>> In the past this was based IIRC on ip precedence + 1 bit. Those map
>> similarly to DS fields (calls selectors, assured forwarding etc). So
>> no need to even increase the array beyond current 16.
> 
> What application is this change supposed to enable or help? I think this 
> change should be left for when one can explain the need for it.
> 

I meant to take a look at the prio map. It is an array of size 16 which
holds the skb->priority implicit classifier (prio, pfifo_fast etc).
A packets skb priority is used as an index into this array and from the 
result a queue is selected to put the packet onto.
The map of this array can be configured from user space. I was saying
earlier that it may be tempting to make a size 64 array to map the
possible dsfields - in practise that has never been pragmatic (so 16 was
sufficient).

cheers,
jamal

^ permalink raw reply

* Re: [PATCH 12/14] net: sched: retry action check-insert on concurrent modification
From: Jiri Pirko @ 2018-05-16 12:26 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <vbf4lj724th.fsf@reg-r-vrt-018-180.mtr.labs.mlnx>

Wed, May 16, 2018 at 01:55:06PM CEST, vladbu@mellanox.com wrote:
>
>On Wed 16 May 2018 at 09:59, Jiri Pirko <jiri@resnulli.us> wrote:
>> Mon, May 14, 2018 at 04:27:13PM CEST, vladbu@mellanox.com wrote:
>>>Retry check-insert sequence in action init functions if action with same
>>>index was inserted concurrently.
>>>
>>>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>>>---
>>> net/sched/act_bpf.c        | 8 +++++++-
>>> net/sched/act_connmark.c   | 8 +++++++-
>>> net/sched/act_csum.c       | 8 +++++++-
>>> net/sched/act_gact.c       | 8 +++++++-
>>> net/sched/act_ife.c        | 8 +++++++-
>>> net/sched/act_ipt.c        | 8 +++++++-
>>> net/sched/act_mirred.c     | 8 +++++++-
>>> net/sched/act_nat.c        | 8 +++++++-
>>> net/sched/act_pedit.c      | 8 +++++++-
>>> net/sched/act_police.c     | 9 ++++++++-
>>> net/sched/act_sample.c     | 8 +++++++-
>>> net/sched/act_simple.c     | 9 ++++++++-
>>> net/sched/act_skbedit.c    | 8 +++++++-
>>> net/sched/act_skbmod.c     | 8 +++++++-
>>> net/sched/act_tunnel_key.c | 9 ++++++++-
>>> net/sched/act_vlan.c       | 9 ++++++++-
>>> 16 files changed, 116 insertions(+), 16 deletions(-)
>>>
>>>diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
>>>index 5554bf7..7e20fdc 100644
>>>--- a/net/sched/act_bpf.c
>>>+++ b/net/sched/act_bpf.c
>>>@@ -299,10 +299,16 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
>>> 
>>> 	parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
>>> 
>>>+replay:
>>> 	if (!tcf_idr_check(tn, parm->index, act, bind)) {
>>> 		ret = tcf_idr_create(tn, parm->index, est, act,
>>> 				     &act_bpf_ops, bind, true);
>>>-		if (ret < 0)
>>>+		/* Action with specified index was created concurrently.
>>>+		 * Check again.
>>>+		 */
>>>+		if (parm->index && ret == -ENOSPC)
>>>+			goto replay;
>>>+		else if (ret)
>>
>> Hmm, looks like you are doing the same/very similar thing in every act
>> code. I think it would make sense to introduce a helper function for
>> this purpose.
>
>This code uses goto so it can't be easily refactored into standalone
>function. Could you specify which part of this code you suggest to
>extract?

Hmm, looking at the code, I think that what would help is to have a
helper that would atomically check if index exists and if not, it would
allocate one. Something like:


int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
			struct tc_action **a, int bind)
{
	struct tcf_idrinfo *idrinfo = tn->idrinfo;
	struct tc_action *p;
	int err;

	spin_lock(&idrinfo->lock);
	if (*index) {
		p = idr_find(&idrinfo->action_idr, *index);
		if (p) {
			if (bind)
	   			p->tcfa_bindcnt++;
			p->tcfa_refcnt++;
			*a = p;
			err = 0;
		} else {
			*a = NULL;
			err = idr_alloc_u32(idr, NULL, index,
					    *index, GFP_ATOMIC);
		}
	} else {
		*index = 1;
		*a = NULL;
		err = idr_alloc_u32(idr, NULL, index, UINT_MAX, GFP_ATOMIC);
	}
	spin_unlock(&idrinfo->lock);
	return err;
}

The act code would just check if "a" is NULL and if so, it would call
tcf_idr_create() with allocated index as arg.


>
>>
>> [...]
>

^ permalink raw reply

* Re: [RFC v4 5/5] virtio_ring: enable packed ring
From: Tiwei Bie @ 2018-05-16 12:26 UTC (permalink / raw)
  To: Sergei Shtylyov
  Cc: mst, jasowang, virtualization, linux-kernel, netdev, wexu,
	jfreimann
In-Reply-To: <dc6f6d01-7ca5-5264-6a58-1d5327da08cb@cogentembedded.com>

On Wed, May 16, 2018 at 02:42:53PM +0300, Sergei Shtylyov wrote:
> On 05/16/2018 01:21 PM, Tiwei Bie wrote:
> 
> >>> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> >>> ---
> >>>   drivers/virtio/virtio_ring.c | 2 ++
> >>>   1 file changed, 2 insertions(+)
> >>>
> >>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> >>> index de3839f3621a..b158692263b0 100644
> >>> --- a/drivers/virtio/virtio_ring.c
> >>> +++ b/drivers/virtio/virtio_ring.c
> >>> @@ -1940,6 +1940,8 @@ void vring_transport_features(struct virtio_device *vdev)
> >>>   			break;
> >>>   		case VIRTIO_F_IOMMU_PLATFORM:
> >>>   			break;
> >>> +		case VIRTIO_F_RING_PACKED:
> >>> +			break;
> >>
> >>    Why not just add this *case* under the previous *case*?
> > 
> > Do you mean fallthrough? Something like:
> > 
> > 		case VIRTIO_F_IOMMU_PLATFORM:
> > 		case VIRTIO_F_RING_PACKED:
> > 			break;
> 
>    Yes, exactly. :-)

Using fallthrough in this case will make the code more
compact. I like such coding style. But unfortunately,
it's not consistent with the existing code. :(

The whole function will become something like this:

void vring_transport_features(struct virtio_device *vdev)
{
	unsigned int i;

	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
		switch (i) {
		case VIRTIO_RING_F_INDIRECT_DESC:
			break;
		case VIRTIO_RING_F_EVENT_IDX:
			break;
		case VIRTIO_F_VERSION_1:
			break;
		case VIRTIO_F_IOMMU_PLATFORM:
		case VIRTIO_F_RING_PACKED:
			break;
		default:
			/* We don't understand this bit. */
			__virtio_clear_bit(vdev, i);
		}
	}
}

Best regards,
Tiwei Bie

> 
> > Best regards,
> > Tiwei Bie
> 
> [...]
> 
> MBR, Sergei
> 

^ permalink raw reply

* Re: [PATCH net-next v2 0/2] of: mdio: Fall back to mdiobus_register() with NULL device_node
From: Andrew Lunn @ 2018-05-16 12:27 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Florian Fainelli, netdev, Vivien Didelot, David S. Miller,
	Nicolas Ferre, Fugang Duan, Sergei Shtylyov, Giuseppe Cavallaro,
	Alexandre Torgue, Jose Abreu, Grygorii Strashko, Woojung Huh,
	Microchip Linux Driver Support, Rob Herring, Frank Rowand,
	Antoine Tenart, Tobias Jordan, Russell 
In-Reply-To: <CAMuHMdXfOOgADe1my9Y3s3b9_OsXPjbqq1PPFR9izZWhtPuaFg@mail.gmail.com>

On Wed, May 16, 2018 at 10:54:12AM +0200, Geert Uytterhoeven wrote:
> Hi Florian,
> 
> Thanks for your series!
> I like the effect on simplifying drivers.
> 
> On Wed, May 16, 2018 at 1:56 AM, Florian Fainelli <f.fainelli@gmail.com> wrote:
> > This patch series updates of_mdiobus_register() such that when the device_node
> > argument is NULL, it calls mdiobus_register() directly. This is consistent with
> > the behavior of of_mdiobus_register() when CONFIG_OF=n.
> 
> IMHO the CONFIG_OF=n behavior of of_mdiobus_register() (which I wasn't
> aware of) is inconsistent with the behavior of other of_*() functions,
> which are just empty stubs.
> 
> So I'm wondering if you should do it the other way around, and let
> mdiobus_register() call of_mdiobus_register() if dev->of_node exists?

Hi Geert

dev->of_node is often not the correct OF node. The mdio properties are
often embedded inside a MAC driver, and use an 'mdio' container
node. This container node is needed, not the device node.

> I haven't looked at the ACPI handling, but perhaps this can be moved
> inside mdiobus_register() as well?

The ACPI binding for MDIO and PHYs has not been defined yet.

    Andrew

^ permalink raw reply

* [RFC V4 PATCH 0/8] Packed ring layout for vhost
From: Jason Wang @ 2018-05-16 12:32 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, wexu,
	tiwei.bie

Hi all:

This RFC implement packed ring layout. The code were tested with
Tiwei's RFC V3 ahttps://lkml.org/lkml/2018/4/25/34. Some fixups and
tweaks were needed on top of Tiwei's code to make it run for event
index.

Pktgen reports about 20% improvement on PPS (event index is off). More
testing is ongoing.

Notes for tester:

- Start from this version, vhost need qemu co-operation to work
  correctly. Or you can comment out the packed specific code for
  GET/SET_VRING_BASE.

Changes from V3:
- Fix math on event idx checking
- Sync last avail wrap counter through GET/SET_VRING_BASE
- remove desc_event prefix in the driver/device structure

Changes from V2:
- do not use & in checking desc_event_flags
- off should be most significant bit
- remove the workaround of mergeable buffer for dpdk prototype
- id should be in the last descriptor in the chain
- keep _F_WRITE for write descriptor when adding used
- device flags updating should use ADDR_USED type
- return error on unexpected unavail descriptor in a chain
- return false in vhost_ve_avail_empty is descriptor is available
- track last seen avail_wrap_counter
- correctly examine available descriptor in get_indirect_packed()
- vhost_idx_diff should return u16 instead of bool

Changes from V1:

- Refactor vhost used elem code to avoid open coding on used elem
- Event suppression support (compile test only).
- Indirect descriptor support (compile test only).
- Zerocopy support.
- vIOMMU support.
- SCSI/VSOCK support (compile test only).
- Fix several bugs

Jason Wang (8):
  vhost: move get_rx_bufs to vhost.c
  vhost: hide used ring layout from device
  vhost: do not use vring_used_elem
  vhost_net: do not explicitly manipulate vhost_used_elem
  vhost: vhost_put_user() can accept metadata type
  virtio: introduce packed ring defines
  vhost: packed ring support
  vhost: event suppression for packed ring

 drivers/vhost/net.c                | 136 ++----
 drivers/vhost/scsi.c               |  62 +--
 drivers/vhost/vhost.c              | 861 ++++++++++++++++++++++++++++++++-----
 drivers/vhost/vhost.h              |  47 +-
 drivers/vhost/vsock.c              |  42 +-
 include/uapi/linux/virtio_config.h |   9 +
 include/uapi/linux/virtio_ring.h   |  32 ++
 7 files changed, 928 insertions(+), 261 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [RFC V4 PATCH 1/8] vhost: move get_rx_bufs to vhost.c
From: Jason Wang @ 2018-05-16 12:32 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, wexu,
	tiwei.bie
In-Reply-To: <1526473941-16199-1-git-send-email-jasowang@redhat.com>

Move get_rx_bufs() to vhost.c and rename it to
vhost_get_bufs(). This helps to hide vring internal layout from
specific device implementation. Packed ring implementation will
benefit from this.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 83 ++-------------------------------------------------
 drivers/vhost/vhost.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |  7 +++++
 3 files changed, 88 insertions(+), 80 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 986058a..762aa81 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -664,83 +664,6 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 	return len;
 }
 
-/* This is a multi-buffer version of vhost_get_desc, that works if
- *	vq has read descriptors only.
- * @vq		- the relevant virtqueue
- * @datalen	- data length we'll be reading
- * @iovcount	- returned count of io vectors we fill
- * @log		- vhost log
- * @log_num	- log offset
- * @quota       - headcount quota, 1 for big buffer
- *	returns number of buffer heads allocated, negative on error
- */
-static int get_rx_bufs(struct vhost_virtqueue *vq,
-		       struct vring_used_elem *heads,
-		       int datalen,
-		       unsigned *iovcount,
-		       struct vhost_log *log,
-		       unsigned *log_num,
-		       unsigned int quota)
-{
-	unsigned int out, in;
-	int seg = 0;
-	int headcount = 0;
-	unsigned d;
-	int r, nlogs = 0;
-	/* len is always initialized before use since we are always called with
-	 * datalen > 0.
-	 */
-	u32 uninitialized_var(len);
-
-	while (datalen > 0 && headcount < quota) {
-		if (unlikely(seg >= UIO_MAXIOV)) {
-			r = -ENOBUFS;
-			goto err;
-		}
-		r = vhost_get_vq_desc(vq, vq->iov + seg,
-				      ARRAY_SIZE(vq->iov) - seg, &out,
-				      &in, log, log_num);
-		if (unlikely(r < 0))
-			goto err;
-
-		d = r;
-		if (d == vq->num) {
-			r = 0;
-			goto err;
-		}
-		if (unlikely(out || in <= 0)) {
-			vq_err(vq, "unexpected descriptor format for RX: "
-				"out %d, in %d\n", out, in);
-			r = -EINVAL;
-			goto err;
-		}
-		if (unlikely(log)) {
-			nlogs += *log_num;
-			log += *log_num;
-		}
-		heads[headcount].id = cpu_to_vhost32(vq, d);
-		len = iov_length(vq->iov + seg, in);
-		heads[headcount].len = cpu_to_vhost32(vq, len);
-		datalen -= len;
-		++headcount;
-		seg += in;
-	}
-	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
-	*iovcount = seg;
-	if (unlikely(log))
-		*log_num = nlogs;
-
-	/* Detect overrun */
-	if (unlikely(datalen > 0)) {
-		r = UIO_MAXIOV + 1;
-		goto err;
-	}
-	return headcount;
-err:
-	vhost_discard_vq_desc(vq, headcount);
-	return r;
-}
-
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
@@ -790,9 +713,9 @@ static void handle_rx(struct vhost_net *net)
 	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
-		headcount = get_rx_bufs(vq, vq->heads + nheads, vhost_len,
-					&in, vq_log, &log,
-					likely(mergeable) ? UIO_MAXIOV : 1);
+		headcount = vhost_get_bufs(vq, vq->heads + nheads, vhost_len,
+					   &in, vq_log, &log,
+					   likely(mergeable) ? UIO_MAXIOV : 1);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(headcount < 0))
 			goto out;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f3bd8e9..6b455f6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2097,6 +2097,84 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ *	vq has read descriptors only.
+ * @vq		- the relevant virtqueue
+ * @datalen	- data length we'll be reading
+ * @iovcount	- returned count of io vectors we fill
+ * @log		- vhost log
+ * @log_num	- log offset
+ * @quota       - headcount quota, 1 for big buffer
+ *	returns number of buffer heads allocated, negative on error
+ */
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *heads,
+		   int datalen,
+		   unsigned *iovcount,
+		   struct vhost_log *log,
+		   unsigned *log_num,
+		   unsigned int quota)
+{
+	unsigned int out, in;
+	int seg = 0;
+	int headcount = 0;
+	unsigned d;
+	int r, nlogs = 0;
+	/* len is always initialized before use since we are always called with
+	 * datalen > 0.
+	 */
+	u32 uninitialized_var(len);
+
+	while (datalen > 0 && headcount < quota) {
+		if (unlikely(seg >= UIO_MAXIOV)) {
+			r = -ENOBUFS;
+			goto err;
+		}
+		r = vhost_get_vq_desc(vq, vq->iov + seg,
+				      ARRAY_SIZE(vq->iov) - seg, &out,
+				      &in, log, log_num);
+		if (unlikely(r < 0))
+			goto err;
+
+		d = r;
+		if (d == vq->num) {
+			r = 0;
+			goto err;
+		}
+		if (unlikely(out || in <= 0)) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+				"out %d, in %d\n", out, in);
+			r = -EINVAL;
+			goto err;
+		}
+		if (unlikely(log)) {
+			nlogs += *log_num;
+			log += *log_num;
+		}
+		heads[headcount].id = cpu_to_vhost32(vq, d);
+		len = iov_length(vq->iov + seg, in);
+		heads[headcount].len = cpu_to_vhost32(vq, len);
+		datalen -= len;
+		++headcount;
+		seg += in;
+	}
+	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+	*iovcount = seg;
+	if (unlikely(log))
+		*log_num = nlogs;
+
+	/* Detect overrun */
+	if (unlikely(datalen > 0)) {
+		r = UIO_MAXIOV + 1;
+		goto err;
+	}
+	return headcount;
+err:
+	vhost_discard_vq_desc(vq, headcount);
+	return r;
+}
+EXPORT_SYMBOL_GPL(vhost_get_bufs);
+
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
 void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6c844b9..52edd242 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -185,6 +185,13 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *heads,
+		   int datalen,
+		   unsigned *iovcount,
+		   struct vhost_log *log,
+		   unsigned *log_num,
+		   unsigned int quota);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.7.4

^ permalink raw reply related

* [RFC V4 PATCH 2/8] vhost: hide used ring layout from device
From: Jason Wang @ 2018-05-16 12:32 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, wexu,
	tiwei.bie
In-Reply-To: <1526473941-16199-1-git-send-email-jasowang@redhat.com>

We used to return descriptor head by vhost_get_vq_desc() to device and
pass it back to vhost_add_used() and its friends. This exposes the
internal used ring layout to device which makes it hard to be extended for
e.g packed ring layout.

So this patch tries to hide the used ring layout by

- letting vhost_get_vq_desc() return pointer to struct vring_used_elem
- accepting pointer to struct vring_used_elem in vhost_add_used() and
  vhost_add_used_and_signal()

This could help to hide used ring layout and make it easier to
implement packed ring on top.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 46 +++++++++++++++++++++-----------------
 drivers/vhost/scsi.c  | 62 +++++++++++++++++++++++++++------------------------
 drivers/vhost/vhost.c | 52 +++++++++++++++++++++---------------------
 drivers/vhost/vhost.h |  9 +++++---
 drivers/vhost/vsock.c | 42 +++++++++++++++++-----------------
 5 files changed, 112 insertions(+), 99 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 762aa81..826489c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -426,22 +426,24 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_virtqueue *vq,
+				    struct vring_used_elem *used_elem,
 				    struct iovec iov[], unsigned int iov_size,
 				    unsigned int *out_num, unsigned int *in_num)
 {
 	unsigned long uninitialized_var(endtime);
-	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+	int r = vhost_get_vq_desc(vq, used_elem, vq->iov, ARRAY_SIZE(vq->iov),
 				  out_num, in_num, NULL, NULL);
 
-	if (r == vq->num && vq->busyloop_timeout) {
+	if (r == -ENOSPC && vq->busyloop_timeout) {
 		preempt_disable();
 		endtime = busy_clock() + vq->busyloop_timeout;
 		while (vhost_can_busy_poll(vq->dev, endtime) &&
 		       vhost_vq_avail_empty(vq->dev, vq))
 			cpu_relax();
 		preempt_enable();
-		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-				      out_num, in_num, NULL, NULL);
+		r = vhost_get_vq_desc(vq, used_elem, vq->iov,
+				      ARRAY_SIZE(vq->iov), out_num, in_num,
+				      NULL, NULL);
 	}
 
 	return r;
@@ -463,7 +465,6 @@ static void handle_tx(struct vhost_net *net)
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
 	unsigned out, in;
-	int head;
 	struct msghdr msg = {
 		.msg_name = NULL,
 		.msg_namelen = 0,
@@ -476,6 +477,7 @@ static void handle_tx(struct vhost_net *net)
 	size_t hdr_size;
 	struct socket *sock;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+	struct vring_used_elem used;
 	bool zcopy, zcopy_used;
 	int sent_pkts = 0;
 
@@ -499,20 +501,20 @@ static void handle_tx(struct vhost_net *net)
 			vhost_zerocopy_signal_used(net, vq);
 
 
-		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
-						ARRAY_SIZE(vq->iov),
-						&out, &in);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
-			break;
+		err = vhost_net_tx_get_vq_desc(net, vq, &used, vq->iov,
+					       ARRAY_SIZE(vq->iov),
+					       &out, &in);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		if (err == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				vhost_disable_notify(&net->dev, vq);
 				continue;
 			}
 			break;
 		}
+		/* On error, stop handling until the next kick. */
+		if (unlikely(err < 0))
+			break;
 		if (in) {
 			vq_err(vq, "Unexpected descriptor format for TX: "
 			       "out %d, int %d\n", out, in);
@@ -540,7 +542,8 @@ static void handle_tx(struct vhost_net *net)
 			struct ubuf_info *ubuf;
 			ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
+			vq->heads[nvq->upend_idx].id =
+				cpu_to_vhost32(vq, used.id);
 			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
@@ -581,7 +584,7 @@ static void handle_tx(struct vhost_net *net)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
 		if (!zcopy_used)
-			vhost_add_used_and_signal(&net->dev, vq, head, 0);
+			vhost_add_used_and_signal(&net->dev, vq, &used, 0);
 		else
 			vhost_zerocopy_signal_used(net, vq);
 		vhost_net_tx_packet(net);
@@ -713,14 +716,12 @@ static void handle_rx(struct vhost_net *net)
 	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
-		headcount = vhost_get_bufs(vq, vq->heads + nheads, vhost_len,
-					   &in, vq_log, &log,
-					   likely(mergeable) ? UIO_MAXIOV : 1);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(headcount < 0))
-			goto out;
+		err = vhost_get_bufs(vq, vq->heads + nheads, vhost_len,
+				     &in, vq_log, &log,
+				     likely(mergeable) ? UIO_MAXIOV : 1,
+				     &headcount);
 		/* OK, now we need to know about added descriptors. */
-		if (!headcount) {
+		if (err == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */
@@ -731,6 +732,9 @@ static void handle_rx(struct vhost_net *net)
 			 * they refilled. */
 			goto out;
 		}
+		/* On error, stop handling until the next kick. */
+		if (unlikely(err < 0))
+			goto out;
 		if (nvq->rx_ring)
 			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
 		/* On overrun, truncate and discard */
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 7ad5709..654c71f 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -67,7 +67,7 @@ struct vhost_scsi_inflight {
 
 struct vhost_scsi_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
-	int tvc_vq_desc;
+	struct vring_used_elem tvc_vq_used;
 	/* virtio-scsi initiator task attribute */
 	int tvc_task_attr;
 	/* virtio-scsi response incoming iovecs */
@@ -441,8 +441,9 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
+	struct vring_used_elem used;
 	unsigned out, in;
-	int head, ret;
+	int ret;
 
 	if (!vq->private_data) {
 		vs->vs_events_missed = true;
@@ -451,16 +452,16 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 
 again:
 	vhost_disable_notify(&vs->dev, vq);
-	head = vhost_get_vq_desc(vq, vq->iov,
+	ret = vhost_get_vq_desc(vq, &used, vq->iov,
 			ARRAY_SIZE(vq->iov), &out, &in,
 			NULL, NULL);
-	if (head < 0) {
+	if (ret == -ENOSPC) {
+		if (vhost_enable_notify(&vs->dev, vq))
+			goto again;
 		vs->vs_events_missed = true;
 		return;
 	}
-	if (head == vq->num) {
-		if (vhost_enable_notify(&vs->dev, vq))
-			goto again;
+	if (ret < 0) {
 		vs->vs_events_missed = true;
 		return;
 	}
@@ -480,7 +481,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	eventp = vq->iov[out].iov_base;
 	ret = __copy_to_user(eventp, event, sizeof(*event));
 	if (!ret)
-		vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+		vhost_add_used_and_signal(&vs->dev, vq, &used, 0);
 	else
 		vq_err(vq, "Faulted on vhost_scsi_send_event\n");
 }
@@ -541,7 +542,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 		ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
 		if (likely(ret == sizeof(v_rsp))) {
 			struct vhost_scsi_virtqueue *q;
-			vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
+			vhost_add_used(cmd->tvc_vq, &cmd->tvc_vq_used, 0);
 			q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
 			vq = q - vs->vqs;
 			__set_bit(vq, signal);
@@ -784,7 +785,7 @@ static void vhost_scsi_submission_work(struct work_struct *work)
 static void
 vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 			   struct vhost_virtqueue *vq,
-			   int head, unsigned out)
+			   struct vring_used_elem *used, unsigned out)
 {
 	struct virtio_scsi_cmd_resp __user *resp;
 	struct virtio_scsi_cmd_resp rsp;
@@ -795,7 +796,7 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 	resp = vq->iov[out].iov_base;
 	ret = __copy_to_user(resp, &rsp, sizeof(rsp));
 	if (!ret)
-		vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+		vhost_add_used_and_signal(&vs->dev, vq, used, 0);
 	else
 		pr_err("Faulted on virtio_scsi_cmd_resp\n");
 }
@@ -807,11 +808,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	struct virtio_scsi_cmd_req v_req;
 	struct virtio_scsi_cmd_req_pi v_req_pi;
 	struct vhost_scsi_cmd *cmd;
+	struct vring_used_elem used;
 	struct iov_iter out_iter, in_iter, prot_iter, data_iter;
 	u64 tag;
 	u32 exp_data_len, data_direction;
 	unsigned int out = 0, in = 0;
-	int head, ret, prot_bytes;
+	int ret, prot_bytes;
 	size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp);
 	size_t out_size, in_size;
 	u16 lun;
@@ -831,22 +833,22 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	vhost_disable_notify(&vs->dev, vq);
 
 	for (;;) {
-		head = vhost_get_vq_desc(vq, vq->iov,
-					 ARRAY_SIZE(vq->iov), &out, &in,
-					 NULL, NULL);
+		ret = vhost_get_vq_desc(vq, &used, vq->iov,
+					ARRAY_SIZE(vq->iov), &out, &in,
+					NULL, NULL);
 		pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
-			 head, out, in);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
-			break;
+			 used.id, out, in);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (head == vq->num) {
+		if (ret == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
 				vhost_disable_notify(&vs->dev, vq);
 				continue;
 			}
 			break;
 		}
+		/* On error, stop handling until the next kick. */
+		if (unlikely(ret < 0))
+			break;
 		/*
 		 * Check for a sane response buffer so we can report early
 		 * errors back to the guest.
@@ -891,20 +893,20 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 
 		if (unlikely(!copy_from_iter_full(req, req_size, &out_iter))) {
 			vq_err(vq, "Faulted on copy_from_iter\n");
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		/* virtio-scsi spec requires byte 0 of the lun to be 1 */
 		if (unlikely(*lunp != 1)) {
 			vq_err(vq, "Illegal virtio-scsi lun: %u\n", *lunp);
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 
 		tpg = READ_ONCE(vs_tpg[*target]);
 		if (unlikely(!tpg)) {
 			/* Target does not exist, fail the request */
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		/*
@@ -950,7 +952,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 				if (data_direction != DMA_TO_DEVICE) {
 					vq_err(vq, "Received non zero pi_bytesout,"
 						" but wrong data_direction\n");
-					vhost_scsi_send_bad_target(vs, vq, head, out);
+					vhost_scsi_send_bad_target(vs, vq,
+								   &used, out);
 					continue;
 				}
 				prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesout);
@@ -958,7 +961,8 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 				if (data_direction != DMA_FROM_DEVICE) {
 					vq_err(vq, "Received non zero pi_bytesin,"
 						" but wrong data_direction\n");
-					vhost_scsi_send_bad_target(vs, vq, head, out);
+					vhost_scsi_send_bad_target(vs, vq,
+								   &used, out);
 					continue;
 				}
 				prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesin);
@@ -996,7 +1000,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			vq_err(vq, "Received SCSI CDB with command_size: %d that"
 				" exceeds SCSI_MAX_VARLEN_CDB_SIZE: %d\n",
 				scsi_command_size(cdb), VHOST_SCSI_MAX_CDB_SIZE);
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		cmd = vhost_scsi_get_tag(vq, tpg, cdb, tag, lun, task_attr,
@@ -1005,7 +1009,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 		if (IS_ERR(cmd)) {
 			vq_err(vq, "vhost_scsi_get_tag failed %ld\n",
 			       PTR_ERR(cmd));
-			vhost_scsi_send_bad_target(vs, vq, head, out);
+			vhost_scsi_send_bad_target(vs, vq, &used, out);
 			continue;
 		}
 		cmd->tvc_vhost = vs;
@@ -1025,7 +1029,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			if (unlikely(ret)) {
 				vq_err(vq, "Failed to map iov to sgl\n");
 				vhost_scsi_release_cmd(&cmd->tvc_se_cmd);
-				vhost_scsi_send_bad_target(vs, vq, head, out);
+				vhost_scsi_send_bad_target(vs, vq, &used, out);
 				continue;
 			}
 		}
@@ -1034,7 +1038,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 		 * complete the virtio-scsi request in TCM callback context via
 		 * vhost_scsi_queue_data_in() and vhost_scsi_queue_status()
 		 */
-		cmd->tvc_vq_desc = head;
+		cmd->tvc_vq_used = used;
 		/*
 		 * Dispatch cmd descriptor for cmwq execution in process
 		 * context provided by vhost_scsi_workqueue.  This also ensures
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6b455f6..e069adc 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1955,6 +1955,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
  * never a valid descriptor number) if none was found.  A negative code is
  * returned on error. */
 int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+		      struct vring_used_elem *used,
 		      struct iovec iov[], unsigned int iov_size,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num)
@@ -1987,7 +1988,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		 * invalid.
 		 */
 		if (vq->avail_idx == last_avail_idx)
-			return vq->num;
+			return -ENOSPC;
 
 		/* Only get avail ring entries after they have been
 		 * exposed by guest.
@@ -2005,6 +2006,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		return -EFAULT;
 	}
 
+	used->id = ring_head;
 	head = vhost16_to_cpu(vq, ring_head);
 
 	/* If their number is silly, that's an error. */
@@ -2093,10 +2095,16 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	/* Assume notifications from guest are disabled at this point,
 	 * if they aren't we would need to update avail_event index. */
 	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
-	return head;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
+static void vhost_set_used_len(struct vhost_virtqueue *vq,
+			       struct vring_used_elem *used, int len)
+{
+	used->len = cpu_to_vhost32(vq, len);
+}
+
 /* This is a multi-buffer version of vhost_get_desc, that works if
  *	vq has read descriptors only.
  * @vq		- the relevant virtqueue
@@ -2113,13 +2121,13 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
 		   unsigned *log_num,
-		   unsigned int quota)
+		   unsigned int quota,
+		   s16 *count)
 {
 	unsigned int out, in;
 	int seg = 0;
 	int headcount = 0;
-	unsigned d;
-	int r, nlogs = 0;
+	int r = 0, nlogs = 0;
 	/* len is always initialized before use since we are always called with
 	 * datalen > 0.
 	 */
@@ -2130,17 +2138,12 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 			r = -ENOBUFS;
 			goto err;
 		}
-		r = vhost_get_vq_desc(vq, vq->iov + seg,
+		r = vhost_get_vq_desc(vq, &heads[headcount], vq->iov + seg,
 				      ARRAY_SIZE(vq->iov) - seg, &out,
 				      &in, log, log_num);
 		if (unlikely(r < 0))
 			goto err;
 
-		d = r;
-		if (d == vq->num) {
-			r = 0;
-			goto err;
-		}
 		if (unlikely(out || in <= 0)) {
 			vq_err(vq, "unexpected descriptor format for RX: "
 				"out %d, in %d\n", out, in);
@@ -2151,24 +2154,26 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 			nlogs += *log_num;
 			log += *log_num;
 		}
-		heads[headcount].id = cpu_to_vhost32(vq, d);
+
 		len = iov_length(vq->iov + seg, in);
-		heads[headcount].len = cpu_to_vhost32(vq, len);
+		vhost_set_used_len(vq, &heads[headcount], len);
 		datalen -= len;
 		++headcount;
 		seg += in;
 	}
-	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+	vhost_set_used_len(vq, &heads[headcount - 1], len + datalen);
 	*iovcount = seg;
 	if (unlikely(log))
 		*log_num = nlogs;
 
 	/* Detect overrun */
 	if (unlikely(datalen > 0)) {
-		r = UIO_MAXIOV + 1;
+		headcount = UIO_MAXIOV + 1;
 		goto err;
 	}
-	return headcount;
+
+	*count = headcount;
+	return 0;
 err:
 	vhost_discard_vq_desc(vq, headcount);
 	return r;
@@ -2184,14 +2189,11 @@ EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
+int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
+		   int len)
 {
-	struct vring_used_elem heads = {
-		cpu_to_vhost32(vq, head),
-		cpu_to_vhost32(vq, len)
-	};
-
-	return vhost_add_used_n(vq, &heads, 1);
+	vhost_set_used_len(vq, used, len);
+	return vhost_add_used_n(vq, used, 1);
 }
 EXPORT_SYMBOL_GPL(vhost_add_used);
 
@@ -2324,9 +2326,9 @@ EXPORT_SYMBOL_GPL(vhost_signal);
 /* And here's the combo meal deal.  Supersize me! */
 void vhost_add_used_and_signal(struct vhost_dev *dev,
 			       struct vhost_virtqueue *vq,
-			       unsigned int head, int len)
+			       struct vring_used_elem *used, int len)
 {
-	vhost_add_used(vq, head, len);
+	vhost_add_used(vq, used, len);
 	vhost_signal(dev, vq);
 }
 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 52edd242..a7cc7e7 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -182,6 +182,7 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
 bool vhost_log_access_ok(struct vhost_dev *);
 
 int vhost_get_vq_desc(struct vhost_virtqueue *,
+		      struct vring_used_elem *used_elem,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
@@ -191,15 +192,17 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
 		   unsigned *log_num,
-		   unsigned int quota);
+		   unsigned int quota,
+		   s16 *count);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
+int vhost_add_used(struct vhost_virtqueue *vq,
+		   struct vring_used_elem *elem, int len);
 int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
 		     unsigned count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       unsigned int id, int len);
+			       struct vring_used_elem *, int len);
 void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
 			       struct vring_used_elem *heads, unsigned count);
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 34bc3ab..59a01cd 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -98,11 +98,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 
 	for (;;) {
 		struct virtio_vsock_pkt *pkt;
+		struct vring_used_elem used;
 		struct iov_iter iov_iter;
 		unsigned out, in;
 		size_t nbytes;
 		size_t len;
-		int head;
+		int ret;
 
 		spin_lock_bh(&vsock->send_pkt_list_lock);
 		if (list_empty(&vsock->send_pkt_list)) {
@@ -116,16 +117,9 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		list_del_init(&pkt->list);
 		spin_unlock_bh(&vsock->send_pkt_list_lock);
 
-		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-					 &out, &in, NULL, NULL);
-		if (head < 0) {
-			spin_lock_bh(&vsock->send_pkt_list_lock);
-			list_add(&pkt->list, &vsock->send_pkt_list);
-			spin_unlock_bh(&vsock->send_pkt_list_lock);
-			break;
-		}
-
-		if (head == vq->num) {
+		ret = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
+					&out, &in, NULL, NULL);
+		if (ret == -ENOSPC) {
 			spin_lock_bh(&vsock->send_pkt_list_lock);
 			list_add(&pkt->list, &vsock->send_pkt_list);
 			spin_unlock_bh(&vsock->send_pkt_list_lock);
@@ -139,6 +133,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			}
 			break;
 		}
+		if (ret < 0) {
+			spin_lock_bh(&vsock->send_pkt_list_lock);
+			list_add(&pkt->list, &vsock->send_pkt_list);
+			spin_unlock_bh(&vsock->send_pkt_list_lock);
+			break;
+		}
 
 		if (out) {
 			virtio_transport_free_pkt(pkt);
@@ -146,7 +146,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		len = iov_length(&vq->iov[out], in);
+		len = vhost32_to_cpu(vq, used.len);
 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
 
 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
@@ -163,7 +163,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
+		vhost_add_used(vq, &used, sizeof(pkt->hdr) + pkt->len);
 		added = true;
 
 		if (pkt->reply) {
@@ -346,7 +346,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
 						 dev);
 	struct virtio_vsock_pkt *pkt;
-	int head;
+	struct vring_used_elem used;
+	int ret;
 	unsigned int out, in;
 	bool added = false;
 
@@ -367,18 +368,17 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 			goto no_more_replies;
 		}
 
-		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
-					 &out, &in, NULL, NULL);
-		if (head < 0)
-			break;
-
-		if (head == vq->num) {
+		ret = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
+					&out, &in, NULL, NULL);
+		if (ret == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
 				vhost_disable_notify(&vsock->dev, vq);
 				continue;
 			}
 			break;
 		}
+		if (ret < 0)
+			break;
 
 		pkt = vhost_vsock_alloc_pkt(vq, out, in);
 		if (!pkt) {
@@ -397,7 +397,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 		else
 			virtio_transport_free_pkt(pkt);
 
-		vhost_add_used(vq, head, sizeof(pkt->hdr) + len);
+		vhost_add_used(vq, &used, sizeof(pkt->hdr) + len);
 		added = true;
 	}
 
-- 
2.7.4

^ permalink raw reply related

* [RFC V4 PATCH 3/8] vhost: do not use vring_used_elem
From: Jason Wang @ 2018-05-16 12:32 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, wexu,
	tiwei.bie
In-Reply-To: <1526473941-16199-1-git-send-email-jasowang@redhat.com>

Instead of depending on the exported vring_used_elem, this patch
switches to use a new internal structure vhost_used_elem which embed
vring_used_elem in itself. This could be used to let vhost to record
extra metadata for the incoming packed ring layout.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 19 +++++++-------
 drivers/vhost/scsi.c  | 10 ++++----
 drivers/vhost/vhost.c | 68 ++++++++++++---------------------------------------
 drivers/vhost/vhost.h | 18 ++++++++------
 drivers/vhost/vsock.c |  6 ++---
 5 files changed, 45 insertions(+), 76 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 826489c..3826f1f 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -341,10 +341,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
 	int j = 0;
 
 	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
-		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
+		if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN)
 			vhost_net_tx_err(net);
-		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
-			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+		if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) {
+			vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN;
 			++j;
 		} else
 			break;
@@ -367,7 +367,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 	rcu_read_lock_bh();
 
 	/* set len to mark this desc buffers done DMA */
-	vq->heads[ubuf->desc].len = success ?
+	vq->heads[ubuf->desc].elem.len = success ?
 		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
 	cnt = vhost_net_ubuf_put(ubufs);
 
@@ -426,7 +426,7 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_virtqueue *vq,
-				    struct vring_used_elem *used_elem,
+				    struct vhost_used_elem *used_elem,
 				    struct iovec iov[], unsigned int iov_size,
 				    unsigned int *out_num, unsigned int *in_num)
 {
@@ -477,7 +477,7 @@ static void handle_tx(struct vhost_net *net)
 	size_t hdr_size;
 	struct socket *sock;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	bool zcopy, zcopy_used;
 	int sent_pkts = 0;
 
@@ -542,9 +542,10 @@ static void handle_tx(struct vhost_net *net)
 			struct ubuf_info *ubuf;
 			ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-			vq->heads[nvq->upend_idx].id =
-				cpu_to_vhost32(vq, used.id);
-			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
+			vq->heads[nvq->upend_idx].elem.id =
+				cpu_to_vhost32(vq, used.elem.id);
+			vq->heads[nvq->upend_idx].elem.len =
+				VHOST_DMA_IN_PROGRESS;
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 654c71f..ac11412 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -67,7 +67,7 @@ struct vhost_scsi_inflight {
 
 struct vhost_scsi_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
-	struct vring_used_elem tvc_vq_used;
+	struct vhost_used_elem tvc_vq_used;
 	/* virtio-scsi initiator task attribute */
 	int tvc_task_attr;
 	/* virtio-scsi response incoming iovecs */
@@ -441,7 +441,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	unsigned out, in;
 	int ret;
 
@@ -785,7 +785,7 @@ static void vhost_scsi_submission_work(struct work_struct *work)
 static void
 vhost_scsi_send_bad_target(struct vhost_scsi *vs,
 			   struct vhost_virtqueue *vq,
-			   struct vring_used_elem *used, unsigned out)
+			   struct vhost_used_elem *used, unsigned out)
 {
 	struct virtio_scsi_cmd_resp __user *resp;
 	struct virtio_scsi_cmd_resp rsp;
@@ -808,7 +808,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 	struct virtio_scsi_cmd_req v_req;
 	struct virtio_scsi_cmd_req_pi v_req_pi;
 	struct vhost_scsi_cmd *cmd;
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	struct iov_iter out_iter, in_iter, prot_iter, data_iter;
 	u64 tag;
 	u32 exp_data_len, data_direction;
@@ -837,7 +837,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 					ARRAY_SIZE(vq->iov), &out, &in,
 					NULL, NULL);
 		pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
-			 used.id, out, in);
+			 used.elem.id, out, in);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
 		if (ret == -ENOSPC) {
 			if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e069adc..afd4119 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -728,41 +728,6 @@ static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
 static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 			  struct iovec iov[], int iov_size, int access);
 
-static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
-			      const void *from, unsigned size)
-{
-	int ret;
-
-	if (!vq->iotlb)
-		return __copy_to_user(to, from, size);
-	else {
-		/* This function should be called after iotlb
-		 * prefetch, which means we're sure that all vq
-		 * could be access through iotlb. So -EAGAIN should
-		 * not happen in this case.
-		 */
-		struct iov_iter t;
-		void __user *uaddr = vhost_vq_meta_fetch(vq,
-				     (u64)(uintptr_t)to, size,
-				     VHOST_ADDR_USED);
-
-		if (uaddr)
-			return __copy_to_user(uaddr, from, size);
-
-		ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
-				     ARRAY_SIZE(vq->iotlb_iov),
-				     VHOST_ACCESS_WO);
-		if (ret < 0)
-			goto out;
-		iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size);
-		ret = copy_to_iter(from, size, &t);
-		if (ret == size)
-			ret = 0;
-	}
-out:
-	return ret;
-}
-
 static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
 				void __user *from, unsigned size)
 {
@@ -1955,7 +1920,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
  * never a valid descriptor number) if none was found.  A negative code is
  * returned on error. */
 int vhost_get_vq_desc(struct vhost_virtqueue *vq,
-		      struct vring_used_elem *used,
+		      struct vhost_used_elem *used,
 		      struct iovec iov[], unsigned int iov_size,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num)
@@ -2006,7 +1971,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 		return -EFAULT;
 	}
 
-	used->id = ring_head;
+	used->elem.id = ring_head;
 	head = vhost16_to_cpu(vq, ring_head);
 
 	/* If their number is silly, that's an error. */
@@ -2100,9 +2065,9 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
 static void vhost_set_used_len(struct vhost_virtqueue *vq,
-			       struct vring_used_elem *used, int len)
+			       struct vhost_used_elem *used, int len)
 {
-	used->len = cpu_to_vhost32(vq, len);
+	used->elem.len = cpu_to_vhost32(vq, len);
 }
 
 /* This is a multi-buffer version of vhost_get_desc, that works if
@@ -2116,7 +2081,7 @@ static void vhost_set_used_len(struct vhost_virtqueue *vq,
  *	returns number of buffer heads allocated, negative on error
  */
 int vhost_get_bufs(struct vhost_virtqueue *vq,
-		   struct vring_used_elem *heads,
+		   struct vhost_used_elem *heads,
 		   int datalen,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
@@ -2189,7 +2154,7 @@ EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
+int vhost_add_used(struct vhost_virtqueue *vq, struct vhost_used_elem *used,
 		   int len)
 {
 	vhost_set_used_len(vq, used, len);
@@ -2198,27 +2163,26 @@ int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem *used,
 EXPORT_SYMBOL_GPL(vhost_add_used);
 
 static int __vhost_add_used_n(struct vhost_virtqueue *vq,
-			    struct vring_used_elem *heads,
+			    struct vhost_used_elem *heads,
 			    unsigned count)
 {
 	struct vring_used_elem __user *used;
 	u16 old, new;
-	int start;
+	int start, i;
 
 	start = vq->last_used_idx & (vq->num - 1);
 	used = vq->used->ring + start;
-	if (count == 1) {
-		if (vhost_put_user(vq, heads[0].id, &used->id)) {
+	for (i = 0; i < count; i++) {
+		if (unlikely(vhost_put_user(vq, heads[i].elem.id,
+					    &used[i].id))) {
 			vq_err(vq, "Failed to write used id");
 			return -EFAULT;
 		}
-		if (vhost_put_user(vq, heads[0].len, &used->len)) {
+		if (unlikely(vhost_put_user(vq, heads[i].elem.len,
+					    &used[i].len))) {
 			vq_err(vq, "Failed to write used len");
 			return -EFAULT;
 		}
-	} else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
-		vq_err(vq, "Failed to write used");
-		return -EFAULT;
 	}
 	if (unlikely(vq->log_used)) {
 		/* Make sure data is seen before log. */
@@ -2242,7 +2206,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
 		     unsigned count)
 {
 	int start, n, r;
@@ -2326,7 +2290,7 @@ EXPORT_SYMBOL_GPL(vhost_signal);
 /* And here's the combo meal deal.  Supersize me! */
 void vhost_add_used_and_signal(struct vhost_dev *dev,
 			       struct vhost_virtqueue *vq,
-			       struct vring_used_elem *used, int len)
+			       struct vhost_used_elem *used, int len)
 {
 	vhost_add_used(vq, used, len);
 	vhost_signal(dev, vq);
@@ -2336,7 +2300,7 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
 /* multi-buffer version of vhost_add_used_and_signal */
 void vhost_add_used_and_signal_n(struct vhost_dev *dev,
 				 struct vhost_virtqueue *vq,
-				 struct vring_used_elem *heads, unsigned count)
+				 struct vhost_used_elem *heads, unsigned count)
 {
 	vhost_add_used_n(vq, heads, count);
 	vhost_signal(dev, vq);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a7cc7e7..8dea44b 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -34,6 +34,10 @@ struct vhost_poll {
 	struct vhost_dev	 *dev;
 };
 
+struct vhost_used_elem {
+	struct vring_used_elem elem;
+};
+
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 bool vhost_has_work(struct vhost_dev *dev);
@@ -126,7 +130,7 @@ struct vhost_virtqueue {
 	struct iovec iov[UIO_MAXIOV];
 	struct iovec iotlb_iov[64];
 	struct iovec *indirect;
-	struct vring_used_elem *heads;
+	struct vhost_used_elem *heads;
 	/* Protected by virtqueue mutex. */
 	struct vhost_umem *umem;
 	struct vhost_umem *iotlb;
@@ -182,12 +186,12 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
 bool vhost_log_access_ok(struct vhost_dev *);
 
 int vhost_get_vq_desc(struct vhost_virtqueue *,
-		      struct vring_used_elem *used_elem,
+		      struct vhost_used_elem *used_elem,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
 int vhost_get_bufs(struct vhost_virtqueue *vq,
-		   struct vring_used_elem *heads,
+		   struct vhost_used_elem *heads,
 		   int datalen,
 		   unsigned *iovcount,
 		   struct vhost_log *log,
@@ -198,13 +202,13 @@ void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
 int vhost_add_used(struct vhost_virtqueue *vq,
-		   struct vring_used_elem *elem, int len);
-int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
+		   struct vhost_used_elem *elem, int len);
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
 		     unsigned count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       struct vring_used_elem *, int len);
+			       struct vhost_used_elem *, int len);
 void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
-			       struct vring_used_elem *heads, unsigned count);
+			       struct vhost_used_elem *heads, unsigned count);
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
 void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 59a01cd..695694f 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -98,7 +98,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 
 	for (;;) {
 		struct virtio_vsock_pkt *pkt;
-		struct vring_used_elem used;
+		struct vhost_used_elem used;
 		struct iov_iter iov_iter;
 		unsigned out, in;
 		size_t nbytes;
@@ -146,7 +146,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		len = vhost32_to_cpu(vq, used.len);
+		len = vhost32_to_cpu(vq, used.elem.len);
 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
 
 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
@@ -346,7 +346,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
 						 dev);
 	struct virtio_vsock_pkt *pkt;
-	struct vring_used_elem used;
+	struct vhost_used_elem used;
 	int ret;
 	unsigned int out, in;
 	bool added = false;
-- 
2.7.4

^ permalink raw reply related

* [RFC V4 PATCH 4/8] vhost_net: do not explicitly manipulate vhost_used_elem
From: Jason Wang @ 2018-05-16 12:32 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, wexu,
	tiwei.bie
In-Reply-To: <1526473941-16199-1-git-send-email-jasowang@redhat.com>

Two helpers of setting/getting used len were introduced to avoid
explicitly manipulating vhost_used_elem in zerocopy code. This will be
used to hide used_elem internals and simplify packed ring
implementation.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   | 11 +++++------
 drivers/vhost/vhost.c | 12 ++++++++++--
 drivers/vhost/vhost.h |  5 +++++
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 3826f1f..30273ad 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -341,9 +341,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
 	int j = 0;
 
 	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
-		if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN)
+		if (vhost_get_used_len(vq, &vq->heads[i]) ==
+		    VHOST_DMA_FAILED_LEN)
 			vhost_net_tx_err(net);
-		if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) {
+		if (VHOST_DMA_IS_DONE(vhost_get_used_len(vq, &vq->heads[i]))) {
 			vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN;
 			++j;
 		} else
@@ -542,10 +543,8 @@ static void handle_tx(struct vhost_net *net)
 			struct ubuf_info *ubuf;
 			ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-			vq->heads[nvq->upend_idx].elem.id =
-				cpu_to_vhost32(vq, used.elem.id);
-			vq->heads[nvq->upend_idx].elem.len =
-				VHOST_DMA_IN_PROGRESS;
+			vhost_set_used_len(vq, &used, VHOST_DMA_IN_PROGRESS);
+			vq->heads[nvq->upend_idx] = used;
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index afd4119..2ef1859 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2064,11 +2064,19 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
-static void vhost_set_used_len(struct vhost_virtqueue *vq,
-			       struct vhost_used_elem *used, int len)
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+			struct vhost_used_elem *used, int len)
 {
 	used->elem.len = cpu_to_vhost32(vq, len);
 }
+EXPORT_SYMBOL_GPL(vhost_set_used_len);
+
+int vhost_get_used_len(struct vhost_virtqueue *vq,
+		       struct vhost_used_elem *used)
+{
+	return vhost32_to_cpu(vq, used->elem.len);
+}
+EXPORT_SYMBOL_GPL(vhost_get_used_len);
 
 /* This is a multi-buffer version of vhost_get_desc, that works if
  *	vq has read descriptors only.
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8dea44b..604821b 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -198,6 +198,11 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 		   unsigned *log_num,
 		   unsigned int quota,
 		   s16 *count);
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+			struct vhost_used_elem *used,
+			int len);
+int vhost_get_used_len(struct vhost_virtqueue *vq,
+		       struct vhost_used_elem *used);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.7.4

^ permalink raw reply related

* [RFC V4 PATCH 5/8] vhost: vhost_put_user() can accept metadata type
From: Jason Wang @ 2018-05-16 12:32 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, wexu,
	tiwei.bie
In-Reply-To: <1526473941-16199-1-git-send-email-jasowang@redhat.com>

We assumes used ring update is the only user for vhost_put_user() in
the past. This may not be the case for the incoming packed ring which
may update the descriptor ring for used. So introduce a new type
parameter.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ef1859..8304c30 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -811,7 +811,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 	return __vhost_get_user_slow(vq, addr, size, type);
 }
 
-#define vhost_put_user(vq, x, ptr)		\
+#define vhost_put_user(vq, x, ptr, type)		\
 ({ \
 	int ret = -EFAULT; \
 	if (!vq->iotlb) { \
@@ -819,7 +819,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 	} else { \
 		__typeof__(ptr) to = \
 			(__typeof__(ptr)) __vhost_get_user(vq, ptr,	\
-					  sizeof(*ptr), VHOST_ADDR_USED); \
+					  sizeof(*ptr), type); \
 		if (to != NULL) \
 			ret = __put_user(x, to); \
 		else \
@@ -1680,7 +1680,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 {
 	void __user *used;
 	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
-			   &vq->used->flags) < 0)
+			   &vq->used->flags, VHOST_ADDR_USED) < 0)
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		/* Make sure the flag is seen before log. */
@@ -1699,7 +1699,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
 {
 	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
-			   vhost_avail_event(vq)))
+			   vhost_avail_event(vq), VHOST_ADDR_USED))
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		void __user *used;
@@ -2182,12 +2182,12 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 	used = vq->used->ring + start;
 	for (i = 0; i < count; i++) {
 		if (unlikely(vhost_put_user(vq, heads[i].elem.id,
-					    &used[i].id))) {
+					    &used[i].id, VHOST_ADDR_USED))) {
 			vq_err(vq, "Failed to write used id");
 			return -EFAULT;
 		}
 		if (unlikely(vhost_put_user(vq, heads[i].elem.len,
-					    &used[i].len))) {
+					    &used[i].len, VHOST_ADDR_USED))) {
 			vq_err(vq, "Failed to write used len");
 			return -EFAULT;
 		}
@@ -2233,7 +2233,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
 	/* Make sure buffer is written before we update index. */
 	smp_wmb();
 	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
-			   &vq->used->idx)) {
+			   &vq->used->idx, VHOST_ADDR_USED)) {
 		vq_err(vq, "Failed to increment used idx");
 		return -EFAULT;
 	}
-- 
2.7.4

^ permalink raw reply related

* [RFC V4 PATCH 6/8] virtio: introduce packed ring defines
From: Jason Wang @ 2018-05-16 12:32 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, wexu,
	tiwei.bie
In-Reply-To: <1526473941-16199-1-git-send-email-jasowang@redhat.com>

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/uapi/linux/virtio_config.h |  9 +++++++++
 include/uapi/linux/virtio_ring.h   | 13 +++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 308e209..5903d51 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -71,4 +71,13 @@
  * this is for compatibility with legacy systems.
  */
 #define VIRTIO_F_IOMMU_PLATFORM		33
+
+#define VIRTIO_F_RING_PACKED		34
+
+/*
+ * This feature indicates that all buffers are used by the device in
+ * the same order in which they have been made available.
+ */
+#define VIRTIO_F_IN_ORDER		35
+
 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5fa..e297580 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -43,6 +43,8 @@
 #define VRING_DESC_F_WRITE	2
 /* This means the buffer contains a list of buffer descriptors. */
 #define VRING_DESC_F_INDIRECT	4
+#define VRING_DESC_F_AVAIL      7
+#define VRING_DESC_F_USED	15
 
 /* The Host uses this in used->flags to advise the Guest: don't kick me when
  * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
@@ -62,6 +64,17 @@
  * at the end of the used ring. Guest should ignore the used->flags field. */
 #define VIRTIO_RING_F_EVENT_IDX		29
 
+struct vring_desc_packed {
+	/* Buffer Address. */
+	__virtio64 addr;
+	/* Buffer Length. */
+	__virtio32 len;
+	/* Buffer ID. */
+	__virtio16 id;
+	/* The flags depending on descriptor type. */
+	__virtio16 flags;
+};
+
 /* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
 struct vring_desc {
 	/* Address (guest-physical). */
-- 
2.7.4

^ permalink raw reply related

* [RFC V4 PATCH 7/8] vhost: packed ring support
From: Jason Wang @ 2018-05-16 12:32 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, wexu,
	tiwei.bie
In-Reply-To: <1526473941-16199-1-git-send-email-jasowang@redhat.com>

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c   |   3 +-
 drivers/vhost/vhost.c | 539 ++++++++++++++++++++++++++++++++++++++++++++++----
 drivers/vhost/vhost.h |   8 +-
 3 files changed, 513 insertions(+), 37 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 30273ad..f55c82f8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -71,7 +71,8 @@ enum {
 	VHOST_NET_FEATURES = VHOST_FEATURES |
 			 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
 			 (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
-			 (1ULL << VIRTIO_F_IOMMU_PLATFORM)
+			 (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+			 (1ULL << VIRTIO_F_RING_PACKED)
 };
 
 enum {
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8304c30..f2a0f5b 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -323,6 +323,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vhost_reset_is_le(vq);
 	vhost_disable_cross_endian(vq);
 	vq->busyloop_timeout = 0;
+	vq->used_wrap_counter = true;
+	vq->avail_wrap_counter = true;
 	vq->umem = NULL;
 	vq->iotlb = NULL;
 	__vhost_vq_meta_reset(vq);
@@ -1100,11 +1102,22 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
 	return 0;
 }
 
-static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
-			 struct vring_desc __user *desc,
-			 struct vring_avail __user *avail,
-			 struct vring_used __user *used)
+static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
+			       struct vring_desc __user *desc,
+			       struct vring_avail __user *avail,
+			       struct vring_used __user *used)
+{
+	struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
+
+	/* FIXME: check device area and driver area */
+	return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
+	       access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
+}
 
+static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
+			      struct vring_desc __user *desc,
+			      struct vring_avail __user *avail,
+			      struct vring_used __user *used)
 {
 	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 
@@ -1115,6 +1128,17 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
 			sizeof *used + num * sizeof *used->ring + s);
 }
 
+static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
+			struct vring_desc __user *desc,
+			struct vring_avail __user *avail,
+			struct vring_used __user *used)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vq_access_ok_packed(vq, num, desc, avail, used);
+	else
+		return vq_access_ok_split(vq, num, desc, avail, used);
+}
+
 static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
 				 const struct vhost_umem_node *node,
 				 int type)
@@ -1358,6 +1382,8 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
 			break;
 		}
 		vq->last_avail_idx = s.num;
+		if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+			vq->avail_wrap_counter = s.num >> 31;
 		/* Forget the cached index value. */
 		vq->avail_idx = vq->last_avail_idx;
 		break;
@@ -1366,6 +1392,8 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
 		s.num = vq->last_avail_idx;
 		if (copy_to_user(argp, &s, sizeof s))
 			r = -EFAULT;
+		if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+			s.num |= vq->avail_wrap_counter << 31;
 		break;
 	case VHOST_SET_VRING_ADDR:
 		if (copy_from_user(&a, argp, sizeof a)) {
@@ -1727,6 +1755,9 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq)
 
 	vhost_init_is_le(vq);
 
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return 0;
+
 	r = vhost_update_used_flags(vq);
 	if (r)
 		goto err;
@@ -1800,7 +1831,8 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 /* Each buffer in the virtqueues is actually a chain of descriptors.  This
  * function returns the next descriptor in the chain,
  * or -1U if we're at the end. */
-static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
+static unsigned next_desc_split(struct vhost_virtqueue *vq,
+				struct vring_desc *desc)
 {
 	unsigned int next;
 
@@ -1813,11 +1845,17 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
 	return next;
 }
 
-static int get_indirect(struct vhost_virtqueue *vq,
-			struct iovec iov[], unsigned int iov_size,
-			unsigned int *out_num, unsigned int *in_num,
-			struct vhost_log *log, unsigned int *log_num,
-			struct vring_desc *indirect)
+static unsigned next_desc_packed(struct vhost_virtqueue *vq,
+				 struct vring_desc_packed *desc)
+{
+	return desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT);
+}
+
+static int get_indirect_split(struct vhost_virtqueue *vq,
+			      struct iovec iov[], unsigned int iov_size,
+			      unsigned int *out_num, unsigned int *in_num,
+			      struct vhost_log *log, unsigned int *log_num,
+			      struct vring_desc *indirect)
 {
 	struct vring_desc desc;
 	unsigned int i = 0, count, found = 0;
@@ -1907,23 +1945,274 @@ static int get_indirect(struct vhost_virtqueue *vq,
 			}
 			*out_num += ret;
 		}
-	} while ((i = next_desc(vq, &desc)) != -1);
+	} while ((i = next_desc_split(vq, &desc)) != -1);
 	return 0;
 }
 
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found.  A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
-		      struct vhost_used_elem *used,
-		      struct iovec iov[], unsigned int iov_size,
-		      unsigned int *out_num, unsigned int *in_num,
-		      struct vhost_log *log, unsigned int *log_num)
+static int get_indirect_packed(struct vhost_virtqueue *vq,
+			       struct iovec iov[], unsigned int iov_size,
+			       unsigned int *out_num, unsigned int *in_num,
+			       struct vhost_log *log, unsigned int *log_num,
+			       struct vring_desc_packed *indirect)
+{
+	struct vring_desc_packed desc;
+	unsigned int i = 0, count, found = 0;
+	u32 len = vhost32_to_cpu(vq, indirect->len);
+	struct iov_iter from;
+	int ret, access;
+
+	/* Sanity check */
+	if (unlikely(len % sizeof(desc))) {
+		vq_err(vq, "Invalid length in indirect descriptor: "
+		       "len 0x%llx not multiple of 0x%zx\n",
+		       (unsigned long long)len,
+		       sizeof desc);
+		return -EINVAL;
+	}
+
+	ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr),
+			     len, vq->indirect,
+			     UIO_MAXIOV, VHOST_ACCESS_RO);
+	if (unlikely(ret < 0)) {
+		if (ret != -EAGAIN)
+			vq_err(vq, "Translation failure %d in indirect.\n",
+			       ret);
+		return ret;
+	}
+	iov_iter_init(&from, READ, vq->indirect, ret, len);
+
+	/* We will use the result as an address to read from, so most
+	 * architectures only need a compiler barrier here. */
+	read_barrier_depends();
+
+	count = len / sizeof desc;
+	/* Buffers are chained via a 16 bit next field, so
+	 * we can have at most 2^16 of these. */
+	if (unlikely(count > USHRT_MAX + 1)) {
+		vq_err(vq, "Indirect buffer length too big: %d\n",
+		       indirect->len);
+		return -E2BIG;
+	}
+
+	do {
+		unsigned iov_count = *in_num + *out_num;
+		if (unlikely(++found > count)) {
+			vq_err(vq, "Loop detected: last one at %u "
+			       "indirect size %u\n",
+			       i, count);
+			return -EINVAL;
+		}
+		if (unlikely(!copy_from_iter_full(&desc, sizeof(desc),
+						  &from))) {
+			vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
+			       i, (size_t)vhost64_to_cpu(vq, indirect->addr)
+				  + i * sizeof desc);
+			return -EINVAL;
+		}
+		if (unlikely(desc.flags &
+			     cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
+			vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
+			       i, (size_t)vhost64_to_cpu(vq, indirect->addr)
+				  + i * sizeof desc);
+			return -EINVAL;
+		}
+
+		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
+			access = VHOST_ACCESS_WO;
+		else
+			access = VHOST_ACCESS_RO;
+
+		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
+				     vhost32_to_cpu(vq, desc.len),
+				     iov + iov_count,
+				     iov_size - iov_count, access);
+		if (unlikely(ret < 0)) {
+			if (ret != -EAGAIN)
+				vq_err(vq, "Translation failure %d "
+					   "indirect idx %d\n",
+				       ret, i);
+			return ret;
+		}
+		/* If this is an input descriptor, increment that count. */
+		if (access == VHOST_ACCESS_WO) {
+			*in_num += ret;
+			if (unlikely(log)) {
+				log[*log_num].addr =
+					vhost64_to_cpu(vq, desc.addr);
+				log[*log_num].len =
+					vhost32_to_cpu(vq, desc.len);
+				++*log_num;
+			}
+		} else {
+			/* If it's an output descriptor, they're all supposed
+			 * to come before any input descriptors. */
+			if (unlikely(*in_num)) {
+				vq_err(vq, "Indirect descriptor "
+				       "has out after in: idx %d\n", i);
+				return -EINVAL;
+			}
+			*out_num += ret;
+		}
+		i++;
+	} while (next_desc_packed(vq, &desc));
+	return 0;
+}
+
+#define DESC_AVAIL (1 << VRING_DESC_F_AVAIL)
+#define DESC_USED  (1 << VRING_DESC_F_USED)
+static bool desc_is_avail(struct vhost_virtqueue *vq, __virtio16 flags)
+{
+	bool avail = flags & cpu_to_vhost16(vq, DESC_AVAIL);
+
+	return avail == vq->avail_wrap_counter;
+}
+
+static __virtio16 get_desc_flags(struct vhost_virtqueue *vq, bool write)
+{
+	__virtio16 flags = 0;
+
+	if (vq->used_wrap_counter) {
+		flags |= cpu_to_vhost16(vq, DESC_AVAIL);
+		flags |= cpu_to_vhost16(vq, DESC_USED);
+	} else {
+		flags &= ~cpu_to_vhost16(vq, DESC_AVAIL);
+		flags &= ~cpu_to_vhost16(vq, DESC_USED);
+	}
+
+	if (write)
+		flags |= cpu_to_vhost16(vq, VRING_DESC_F_WRITE);
+
+	return flags;
+}
+
+static int vhost_get_vq_desc_packed(struct vhost_virtqueue *vq,
+				    struct vhost_used_elem *used,
+				    struct iovec iov[], unsigned int iov_size,
+				    unsigned int *out_num, unsigned int *in_num,
+				    struct vhost_log *log,
+				    unsigned int *log_num)
+{
+	struct vring_desc_packed desc;
+	int ret, access, i;
+
+	/* When we start there are none of either input nor output. */
+	*out_num = *in_num = 0;
+	if (unlikely(log))
+		*log_num = 0;
+
+	used->count = 0;
+
+	do {
+		struct vring_desc_packed *d = vq->desc_packed +
+					      vq->last_avail_idx;
+		unsigned int iov_count = *in_num + *out_num;
+
+		ret = vhost_get_user(vq, desc.flags, &d->flags,
+				     VHOST_ADDR_DESC);
+		if (unlikely(ret)) {
+			vq_err(vq, "Failed to get flags: idx %d addr %p\n",
+			       vq->last_avail_idx, &d->flags);
+			return -EFAULT;
+		}
+
+		if (!desc_is_avail(vq, desc.flags)) {
+			/* If there's nothing new since last we looked, return
+			 * invalid.
+			 */
+			if (!used->count)
+				return -ENOSPC;
+			vq_err(vq, "Unexpected unavail descriptor: idx %d\n",
+			       vq->last_avail_idx);
+			return -EFAULT;
+		}
+
+		/* Read desc content after we're sure it was available. */
+		smp_rmb();
+
+		ret = vhost_copy_from_user(vq, &desc, d, sizeof(desc));
+		if (unlikely(ret)) {
+			vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
+				vq->last_avail_idx, d);
+			return -EFAULT;
+		}
+
+		used->elem.id = desc.id;
+
+		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
+			ret = get_indirect_packed(vq, iov, iov_size,
+						  out_num, in_num, log,
+						  log_num, &desc);
+			if (unlikely(ret < 0)) {
+				if (ret != -EAGAIN)
+					vq_err(vq, "Failure detected "
+						   "in indirect descriptor "
+						   "at idx %d\n", i);
+				return ret;
+			}
+			goto next;
+		}
+
+		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
+			access = VHOST_ACCESS_WO;
+		else
+			access = VHOST_ACCESS_RO;
+		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
+				     vhost32_to_cpu(vq, desc.len),
+				     iov + iov_count, iov_size - iov_count,
+				     access);
+		if (unlikely(ret < 0)) {
+			if (ret != -EAGAIN)
+				vq_err(vq, "Translation failure %d idx %d\n",
+					ret, i);
+			return ret;
+		}
+
+		if (access == VHOST_ACCESS_WO) {
+			/* If this is an input descriptor,
+			 * increment that count.
+			 */
+			*in_num += ret;
+			if (unlikely(log)) {
+				log[*log_num].addr =
+					vhost64_to_cpu(vq, desc.addr);
+				log[*log_num].len =
+					vhost32_to_cpu(vq, desc.len);
+				++*log_num;
+			}
+		} else {
+			/* If it's an output descriptor, they're all supposed
+			 * to come before any input descriptors.
+			 */
+			if (unlikely(*in_num)) {
+				vq_err(vq, "Desc out after in: idx %d\n",
+				       i);
+				return -EINVAL;
+			}
+			*out_num += ret;
+		}
+
+next:
+		if (unlikely(++used->count > vq->num)) {
+			vq_err(vq, "Loop detected: last one at %u "
+			       "vq size %u head %u\n",
+			       i, vq->num, used->elem.id);
+			return -EINVAL;
+		}
+		if (++vq->last_avail_idx >= vq->num) {
+			vq->last_avail_idx = 0;
+			vq->avail_wrap_counter ^= 1;
+		}
+	/* If this descriptor says it doesn't chain, we're done. */
+	} while (next_desc_packed(vq, &desc));
+
+	return 0;
+}
+
+static int vhost_get_vq_desc_split(struct vhost_virtqueue *vq,
+				   struct vhost_used_elem *used,
+				   struct iovec iov[], unsigned int iov_size,
+				   unsigned int *out_num, unsigned int *in_num,
+				   struct vhost_log *log, unsigned int *log_num)
 {
 	struct vring_desc desc;
 	unsigned int i, head, found = 0;
@@ -2008,9 +2297,9 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 			return -EFAULT;
 		}
 		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
-			ret = get_indirect(vq, iov, iov_size,
-					   out_num, in_num,
-					   log, log_num, &desc);
+			ret = get_indirect_split(vq, iov, iov_size,
+						 out_num, in_num,
+						 log, log_num, &desc);
 			if (unlikely(ret < 0)) {
 				if (ret != -EAGAIN)
 					vq_err(vq, "Failure detected "
@@ -2052,7 +2341,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 			}
 			*out_num += ret;
 		}
-	} while ((i = next_desc(vq, &desc)) != -1);
+	} while ((i = next_desc_split(vq, &desc)) != -1);
 
 	/* On success, increment avail index. */
 	vq->last_avail_idx++;
@@ -2062,6 +2351,31 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
 	return 0;
 }
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access.  Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found.  A negative code is
+ * returned on error.
+ */
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+		      struct vhost_used_elem *used,
+		      struct iovec iov[], unsigned int iov_size,
+		      unsigned int *out_num, unsigned int *in_num,
+		      struct vhost_log *log, unsigned int *log_num)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_get_vq_desc_packed(vq, used, iov, iov_size,
+						out_num, in_num,
+						log, log_num);
+	else
+		return vhost_get_vq_desc_split(vq, used, iov, iov_size,
+					       out_num, in_num,
+					       log, log_num);
+}
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
 void vhost_set_used_len(struct vhost_virtqueue *vq,
@@ -2157,6 +2471,11 @@ EXPORT_SYMBOL_GPL(vhost_get_bufs);
 void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
 {
 	vq->last_avail_idx -= n;
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED) &&
+	    vq->last_avail_idx >= vq->num) {
+		vq->avail_wrap_counter ^= 1;
+		vq->last_avail_idx += vq->num;
+	}
 }
 EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
 
@@ -2212,10 +2531,69 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 	return 0;
 }
 
+static int vhost_add_used_n_packed(struct vhost_virtqueue *vq,
+				   struct vhost_used_elem *heads,
+				   unsigned int count)
+{
+	struct vring_desc_packed __user *desc;
+	int i, ret;
+
+	for (i = 0; i < count; i++) {
+		desc = vq->desc_packed + vq->last_used_idx;
+
+		ret = vhost_put_user(vq, heads[i].elem.id, &desc->id,
+				     VHOST_ADDR_DESC);
+		if (unlikely(ret)) {
+			vq_err(vq, "Failed to update id: idx %d addr %p\n",
+			       vq->last_used_idx, desc);
+			return -EFAULT;
+		}
+		ret = vhost_put_user(vq, heads[i].elem.len, &desc->len,
+				     VHOST_ADDR_DESC);
+		if (unlikely(ret)) {
+			vq_err(vq, "Failed to update len: idx %d addr %p\n",
+			       vq->last_used_idx, desc);
+			return -EFAULT;
+		}
+
+		/* Update flags after descriptor id and len is wrote,
+		 * TODO: Update head flags at last for saving barriers */
+		smp_wmb();
+
+		ret = vhost_put_user(vq, get_desc_flags(vq, heads[i].elem.len),
+				     &desc->flags, VHOST_ADDR_DESC);
+		if (unlikely(ret)) {
+			vq_err(vq, "Failed to update flags: idx %d addr %p\n",
+			       vq->last_used_idx, desc);
+			return -EFAULT;
+		}
+
+		if (unlikely(vq->log_used)) {
+			/* Make sure desc is written before update log. */
+			smp_wmb();
+			log_write(vq->log_base, vq->log_addr +
+				  vq->last_used_idx * sizeof(*desc),
+				  sizeof(*desc));
+			if (vq->log_ctx)
+				eventfd_signal(vq->log_ctx, 1);
+		}
+
+		vq->last_used_idx += heads[i].count;
+		if (vq->last_used_idx >= vq->num) {
+			vq->used_wrap_counter ^= 1;
+			vq->last_used_idx -= vq->num;
+		}
+	}
+
+	return 0;
+}
+
 /* After we've used one of their buffers, we tell them about it.  We'll then
  * want to notify the guest, using eventfd. */
-int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
-		     unsigned count)
+static int vhost_add_used_n_split(struct vhost_virtqueue *vq,
+				  struct vhost_used_elem *heads,
+				  unsigned count)
+
 {
 	int start, n, r;
 
@@ -2247,6 +2625,19 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
 	}
 	return r;
 }
+
+/* After we've used one of their buffers, we tell them about it.  We'll then
+ * want to notify the guest, using eventfd.
+ */
+int vhost_add_used_n(struct vhost_virtqueue *vq,
+		     struct vhost_used_elem *heads,
+		     unsigned int count)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_add_used_n_packed(vq, heads, count);
+	else
+		return vhost_add_used_n_split(vq, heads, count);
+}
 EXPORT_SYMBOL_GPL(vhost_add_used_n);
 
 static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
@@ -2254,6 +2645,11 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	__u16 old, new;
 	__virtio16 event;
 	bool v;
+
+	/* FIXME: check driver area */
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return false;
+
 	/* Flush out used index updates. This is paired
 	 * with the barrier that the Guest executes when enabling
 	 * interrupts. */
@@ -2316,7 +2712,8 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev,
 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
 
 /* return true if we're sure that avaiable ring is empty */
-bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_vq_avail_empty_split(struct vhost_dev *dev,
+				       struct vhost_virtqueue *vq)
 {
 	__virtio16 avail_idx;
 	int r;
@@ -2331,10 +2728,58 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 
 	return vq->avail_idx == vq->last_avail_idx;
 }
+
+static bool vhost_vq_avail_empty_packed(struct vhost_dev *dev,
+					struct vhost_virtqueue *vq)
+{
+	struct vring_desc_packed *d = vq->desc_packed + vq->last_avail_idx;
+	__virtio16 flags;
+	int ret;
+
+	ret = vhost_get_user(vq, flags, &d->flags, VHOST_ADDR_DESC);
+	if (unlikely(ret)) {
+		vq_err(vq, "Failed to get flags: idx %d addr %p\n",
+			vq->last_avail_idx, d);
+		return -EFAULT;
+	}
+
+	return !desc_is_avail(vq, flags);
+}
+
+bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_vq_avail_empty_packed(dev, vq);
+	else
+		return vhost_vq_avail_empty_split(dev, vq);
+}
 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
 
-/* OK, now we need to know about added descriptors. */
-bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_enable_notify_packed(struct vhost_dev *dev,
+				       struct vhost_virtqueue *vq)
+{
+	struct vring_desc_packed *d = vq->desc_packed + vq->last_avail_idx;
+	__virtio16 flags;
+	int ret;
+
+	/* FIXME: disable notification through device area */
+
+	/* They could have slipped one in as we were doing that: make
+	 * sure it's written, then check again. */
+	smp_mb();
+
+	ret = vhost_get_user(vq, flags, &d->flags, VHOST_ADDR_DESC);
+	if (unlikely(ret)) {
+		vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
+			vq->last_avail_idx, &d->flags);
+		return -EFAULT;
+	}
+
+	return desc_is_avail(vq, flags);
+}
+
+static bool vhost_enable_notify_split(struct vhost_dev *dev,
+				      struct vhost_virtqueue *vq)
 {
 	__virtio16 avail_idx;
 	int r;
@@ -2369,10 +2814,25 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 
 	return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx;
 }
+
+/* OK, now we need to know about added descriptors. */
+bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_enable_notify_packed(dev, vq);
+	else
+		return vhost_enable_notify_split(dev, vq);
+}
 EXPORT_SYMBOL_GPL(vhost_enable_notify);
 
-/* We don't need to be notified again. */
-void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static void vhost_disable_notify_packed(struct vhost_dev *dev,
+					struct vhost_virtqueue *vq)
+{
+	/* FIXME: disable notification through device area */
+}
+
+static void vhost_disable_notify_split(struct vhost_dev *dev,
+				       struct vhost_virtqueue *vq)
 {
 	int r;
 
@@ -2386,6 +2846,15 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 			       &vq->used->flags, r);
 	}
 }
+
+/* We don't need to be notified again. */
+void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_disable_notify_packed(dev, vq);
+	else
+		return vhost_disable_notify_split(dev, vq);
+}
 EXPORT_SYMBOL_GPL(vhost_disable_notify);
 
 /* Create a new message. */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 604821b..286b470 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -36,6 +36,7 @@ struct vhost_poll {
 
 struct vhost_used_elem {
 	struct vring_used_elem elem;
+	int count;
 };
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
@@ -91,7 +92,10 @@ struct vhost_virtqueue {
 	/* The actual ring of buffers. */
 	struct mutex mutex;
 	unsigned int num;
-	struct vring_desc __user *desc;
+	union {
+		struct vring_desc __user *desc;
+		struct vring_desc_packed __user *desc_packed;
+	};
 	struct vring_avail __user *avail;
 	struct vring_used __user *used;
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
@@ -148,6 +152,8 @@ struct vhost_virtqueue {
 	bool user_be;
 #endif
 	u32 busyloop_timeout;
+	bool used_wrap_counter;
+	bool avail_wrap_counter;
 };
 
 struct vhost_msg_node {
-- 
2.7.4

^ permalink raw reply related

* [RFC V4 PATCH 8/8] vhost: event suppression for packed ring
From: Jason Wang @ 2018-05-16 12:32 UTC (permalink / raw)
  To: mst, jasowang
  Cc: kvm, virtualization, netdev, linux-kernel, jfreimann, wexu,
	tiwei.bie
In-Reply-To: <1526473941-16199-1-git-send-email-jasowang@redhat.com>

This patch introduces basic support for event suppression aka driver
and device area.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c            | 168 ++++++++++++++++++++++++++++++++++++---
 drivers/vhost/vhost.h            |  10 ++-
 include/uapi/linux/virtio_ring.h |  19 +++++
 3 files changed, 182 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f2a0f5b..afdf4c1 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1108,10 +1108,15 @@ static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
 			       struct vring_used __user *used)
 {
 	struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
+	struct vring_packed_desc_event *driver_event =
+		(struct vring_packed_desc_event *)avail;
+	struct vring_packed_desc_event *device_event =
+		(struct vring_packed_desc_event *)used;
 
-	/* FIXME: check device area and driver area */
 	return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
-	       access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
+	       access_ok(VERIFY_WRITE, packed, num * sizeof(*packed)) &&
+	       access_ok(VERIFY_READ, driver_event, sizeof(*driver_event)) &&
+	       access_ok(VERIFY_WRITE, device_event, sizeof(*device_event));
 }
 
 static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
@@ -1186,14 +1191,27 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
 	return true;
 }
 
-int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+int vq_iotlb_prefetch_packed(struct vhost_virtqueue *vq)
+{
+	int num = vq->num;
+
+	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+			       num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+	       iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->desc,
+			       num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+	       iotlb_access_ok(vq, VHOST_ACCESS_RO,
+			       (u64)(uintptr_t)vq->driver_event,
+			       sizeof(*vq->driver_event), VHOST_ADDR_AVAIL) &&
+	       iotlb_access_ok(vq, VHOST_ACCESS_WO,
+			       (u64)(uintptr_t)vq->device_event,
+			       sizeof(*vq->device_event), VHOST_ADDR_USED);
+}
+
+int vq_iotlb_prefetch_split(struct vhost_virtqueue *vq)
 {
 	size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 	unsigned int num = vq->num;
 
-	if (!vq->iotlb)
-		return 1;
-
 	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
 			       num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
 	       iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
@@ -1205,6 +1223,17 @@ int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
 			       num * sizeof(*vq->used->ring) + s,
 			       VHOST_ADDR_USED);
 }
+
+int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+{
+	if (!vq->iotlb)
+		return 1;
+
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vq_iotlb_prefetch_packed(vq);
+	else
+		return vq_iotlb_prefetch_split(vq);
+}
 EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
 
 /* Can we log writes? */
@@ -1724,6 +1753,29 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 	return 0;
 }
 
+static int vhost_update_device_flags(struct vhost_virtqueue *vq,
+				     __virtio16 device_flags)
+{
+	void __user *flags;
+
+	if (vhost_put_user(vq, cpu_to_vhost16(vq, device_flags),
+			   &vq->device_event->flags,
+			   VHOST_ADDR_USED) < 0)
+		return -EFAULT;
+	if (unlikely(vq->log_used)) {
+		/* Make sure the flag is seen before log. */
+		smp_wmb();
+		/* Log used flag write. */
+		flags = &vq->device_event->flags;
+		log_write(vq->log_base, vq->log_addr +
+			  (flags - (void __user *)vq->device_event),
+			  sizeof(vq->used->flags));
+		if (vq->log_ctx)
+			eventfd_signal(vq->log_ctx, 1);
+	}
+	return 0;
+}
+
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
 {
 	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
@@ -2640,16 +2692,13 @@ int vhost_add_used_n(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_add_used_n);
 
-static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_notify_split(struct vhost_dev *dev,
+			       struct vhost_virtqueue *vq)
 {
 	__u16 old, new;
 	__virtio16 event;
 	bool v;
 
-	/* FIXME: check driver area */
-	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
-		return false;
-
 	/* Flush out used index updates. This is paired
 	 * with the barrier that the Guest executes when enabling
 	 * interrupts. */
@@ -2682,6 +2731,78 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	return vring_need_event(vhost16_to_cpu(vq, event), new, old);
 }
 
+static bool vhost_vring_packed_need_event(struct vhost_virtqueue *vq,
+					  __u16 off_wrap, __u16 new,
+					  __u16 old)
+{
+    bool wrap = vq->used_wrap_counter;
+    int off = off_wrap & ~(1 << 15);
+
+    if (new < old) {
+	    new += vq->num;
+	    wrap ^= 1;
+    }
+
+    if (wrap != off_wrap >> 15)
+	    off += vq->num;
+
+    return vring_need_event(off, new, old);
+}
+
+static bool vhost_notify_packed(struct vhost_dev *dev,
+				struct vhost_virtqueue *vq)
+{
+	__virtio16 event_off_wrap, event_flags;
+	__u16 old, new, off_wrap;
+	bool v;
+
+	/* Flush out used descriptors updates. This is paired
+	 * with the barrier that the Guest executes when enabling
+	 * interrupts.
+	 */
+	smp_mb();
+
+	if (vhost_get_avail(vq, event_flags,
+			   &vq->driver_event->flags) < 0) {
+		vq_err(vq, "Failed to get driver desc_event_flags");
+		return true;
+	}
+
+	if (event_flags == cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE))
+		return false;
+	else if (event_flags == cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE))
+		return true;
+
+	/* Read desc event flags before event_off and event_wrap */
+	smp_rmb();
+
+	if (vhost_get_avail(vq, event_off_wrap,
+			    &vq->driver_event->off_warp) < 0) {
+		vq_err(vq, "Failed to get driver desc_event_off/wrap");
+		return true;
+	}
+
+	off_wrap = vhost16_to_cpu(vq, event_off_wrap);
+
+	old = vq->signalled_used;
+	v = vq->signalled_used_valid;
+	new = vq->signalled_used = vq->last_used_idx;
+	vq->signalled_used_valid = true;
+
+	if (unlikely(!v))
+		return true;
+
+	return vhost_vring_packed_need_event(vq, off_wrap, new, old);
+}
+
+static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+	if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+		return vhost_notify_packed(dev, vq);
+	else
+		return vhost_notify_split(dev, vq);
+}
+
 /* This actually signals the guest, using eventfd. */
 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
@@ -2762,7 +2883,17 @@ static bool vhost_enable_notify_packed(struct vhost_dev *dev,
 	__virtio16 flags;
 	int ret;
 
-	/* FIXME: disable notification through device area */
+	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
+		return false;
+	vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
+
+	flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);
+	ret = vhost_update_device_flags(vq, flags);
+	if (ret) {
+		vq_err(vq, "Failed to enable notification at %p: %d\n",
+		       &vq->device_event->flags, ret);
+		return false;
+	}
 
 	/* They could have slipped one in as we were doing that: make
 	 * sure it's written, then check again. */
@@ -2828,7 +2959,18 @@ EXPORT_SYMBOL_GPL(vhost_enable_notify);
 static void vhost_disable_notify_packed(struct vhost_dev *dev,
 					struct vhost_virtqueue *vq)
 {
-	/* FIXME: disable notification through device area */
+	__virtio16 flags;
+	int r;
+
+	if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
+		return;
+	vq->used_flags |= VRING_USED_F_NO_NOTIFY;
+
+	flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE);
+	r = vhost_update_device_flags(vq, flags);
+	if (r)
+		vq_err(vq, "Failed to enable notification at %p: %d\n",
+		       &vq->device_event->flags, r);
 }
 
 static void vhost_disable_notify_split(struct vhost_dev *dev,
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 286b470..0750659 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -96,8 +96,14 @@ struct vhost_virtqueue {
 		struct vring_desc __user *desc;
 		struct vring_desc_packed __user *desc_packed;
 	};
-	struct vring_avail __user *avail;
-	struct vring_used __user *used;
+	union {
+		struct vring_avail __user *avail;
+		struct vring_packed_desc_event __user *driver_event;
+	};
+	union {
+		struct vring_used __user *used;
+		struct vring_packed_desc_event __user *device_event;
+	};
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
 	struct file *kick;
 	struct eventfd_ctx *call_ctx;
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index e297580..b3f4b2c 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -75,6 +75,25 @@ struct vring_desc_packed {
 	__virtio16 flags;
 };
 
+/* Enable events */
+#define RING_EVENT_FLAGS_ENABLE 0x0
+/* Disable events */
+#define RING_EVENT_FLAGS_DISABLE 0x1
+/*
+ * Enable events for a specific descriptor
+ * (as specified by Descriptor Ring Change Event Offset/Wrap Counter).
+ * Only valid if VIRTIO_F_RING_EVENT_IDX has been negotiated.
+ */
+#define RING_EVENT_FLAGS_DESC 0x2
+/* The value 0x3 is reserved */
+
+struct vring_packed_desc_event {
+	/* Descriptor Ring Change Event Offset and Wrap Counter */
+	__virtio16 off_warp;
+	/* Descriptor Ring Change Event Flags */
+	__virtio16 flags;
+};
+
 /* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
 struct vring_desc {
 	/* Address (guest-physical). */
-- 
2.7.4

^ permalink raw reply related

* Re: [RFC v4 3/5] virtio_ring: add packed ring support
From: Tiwei Bie @ 2018-05-16 12:39 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, virtualization, linux-kernel, netdev, wexu, jfreimann
In-Reply-To: <2000f635-bc34-71ff-ff51-a711c2e9726d@redhat.com>

On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
> On 2018年05月16日 16:37, Tiwei Bie wrote:
[...]
> >   struct vring_virtqueue {
> > @@ -116,6 +117,9 @@ struct vring_virtqueue {
> >   			/* Last written value to driver->flags in
> >   			 * guest byte order. */
> >   			u16 event_flags_shadow;
> > +
> > +			/* ID allocation. */
> > +			struct idr buffer_id;
> 
> I'm not sure idr is fit for the performance critical case here. Need to
> measure its performance impact, especially if we have few unused slots.

I'm also not sure.. But fortunately, it should be quite easy
to replace it with something else without changing other code.
If it will really hurt the performance, I'll change it.

> 
> >   		};
> >   	};
[...]
> > +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > +			      unsigned int id, void **ctx)
> > +{
> > +	struct vring_packed_desc *desc;
> > +	unsigned int i, j;
> > +
> > +	/* Clear data ptr. */
> > +	vq->desc_state[id].data = NULL;
> > +
> > +	i = head;
> > +
> > +	for (j = 0; j < vq->desc_state[id].num; j++) {
> > +		desc = &vq->vring_packed.desc[i];
> > +		vring_unmap_one_packed(vq, desc);
> 
> As mentioned in previous discussion, this probably won't work for the case
> of out of order completion since it depends on the information in the
> descriptor ring. We probably need to extend ctx to record such information.

Above code doesn't depend on the information in the descriptor
ring. The vq->desc_state[] is the extended ctx.

Best regards,
Tiwei Bie

^ permalink raw reply

* [PATCH net] tuntap: fix use after free during release
From: Jason Wang @ 2018-05-16 12:39 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: xiyou.wangcong, avagin, mst, Jason Wang

After commit b196d88aba8a ("tun: fix use after free for ptr_ring") we
need clean up tx ring during release(). But unfortunately, it tries to
do the cleanup blindly after socket were destroyed which will lead
another use-after-free. Fix this by doing the cleanup before dropping
the last reference of the socket in __tun_detach().

Reported-by: Andrei Vagin <avagin@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Fixes: b196d88aba8a ("tun: fix use after free for ptr_ring")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9fbbb32..d45ac37 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -729,6 +729,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 		}
 		if (tun)
 			xdp_rxq_info_unreg(&tfile->xdp_rxq);
+		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 		sock_put(&tfile->sk);
 	}
 }
@@ -3245,7 +3246,6 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 	struct tun_file *tfile = file->private_data;
 
 	tun_detach(tfile, true);
-	ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox