Linux virtualization list

Linux virtualization list
 help / color / mirror / Atom feed

* Re: [PATCH] virtio_ring: fix num_free handling in error case
From: Cornelia Huck @ 2018-02-23 12:47 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: mst, linux-kernel, stable, virtualization, Andy Lutomirski
In-Reply-To: <20180223114130.16332-1-tiwei.bie@intel.com>

On Fri, 23 Feb 2018 19:41:30 +0800
Tiwei Bie <tiwei.bie@intel.com> wrote:

> The vq->vq.num_free hasn't been changed when error happens,
> so it shouldn't be changed when handling the error.
> 
> Fixes: 780bc7903a32 ("virtio_ring: Support DMA APIs")
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: stable@vger.kernel.org
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>  drivers/virtio/virtio_ring.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index eb30f3e09a47..71458f493cf8 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -428,8 +428,6 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>  		i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
>  	}
>  
> -	vq->vq.num_free += total_sg;
> -
>  	if (indirect)
>  		kfree(desc);
>  

Reviewed-by: Cornelia Huck <cohuck@redhat.com>

^ permalink raw reply

* [PATCH] virtio_ring: fix num_free handling in error case
From: Tiwei Bie @ 2018-02-23 11:41 UTC (permalink / raw)
  To: mst, jasowang, virtualization, linux-kernel; +Cc: stable, Andy Lutomirski

The vq->vq.num_free hasn't been changed when error happens,
so it shouldn't be changed when handling the error.

Fixes: 780bc7903a32 ("virtio_ring: Support DMA APIs")
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 drivers/virtio/virtio_ring.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index eb30f3e09a47..71458f493cf8 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -428,8 +428,6 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 		i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
 	}
 
-	vq->vq.num_free += total_sg;
-
 	if (indirect)
 		kfree(desc);
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC 2/2] virtio_ring: support packed ring
From: Tiwei Bie @ 2018-02-23 11:18 UTC (permalink / raw)
  To: mst, virtualization, linux-kernel, netdev; +Cc: wexu
In-Reply-To: <20180223111801.15240-1-tiwei.bie@intel.com>

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 drivers/virtio/virtio_ring.c | 699 +++++++++++++++++++++++++++++++++++++------
 include/linux/virtio_ring.h  |   8 +-
 2 files changed, 618 insertions(+), 89 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index eb30f3e09a47..393778a2f809 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -58,14 +58,14 @@
 
 struct vring_desc_state {
 	void *data;			/* Data for callback. */
-	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
+	void *indir_desc;		/* Indirect descriptor, if any. */
+	int num;			/* Descriptor list length. */
 };
 
 struct vring_virtqueue {
 	struct virtqueue vq;
 
-	/* Actual memory layout for this queue */
-	struct vring vring;
+	bool packed;
 
 	/* Can we use weak barriers? */
 	bool weak_barriers;
@@ -87,11 +87,28 @@ struct vring_virtqueue {
 	/* Last used index we've seen. */
 	u16 last_used_idx;
 
-	/* Last written value to avail->flags */
-	u16 avail_flags_shadow;
-
-	/* Last written value to avail->idx in guest byte order */
-	u16 avail_idx_shadow;
+	union {
+		/* Available for split ring */
+		struct {
+			/* Actual memory layout for this queue */
+			struct vring vring;
+
+			/* Last written value to avail->flags */
+			u16 avail_flags_shadow;
+
+			/* Last written value to avail->idx in
+			 * guest byte order */
+			u16 avail_idx_shadow;
+		};
+
+		/* Available for packed ring */
+		struct {
+			/* Actual memory layout for this queue */
+			struct vring_packed vring_packed;
+			u8 wrap_counter : 1;
+			bool chaining;
+		};
+	};
 
 	/* How to notify other side. FIXME: commonalize hcalls! */
 	bool (*notify)(struct virtqueue *vq);
@@ -201,26 +218,37 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
 			      cpu_addr, size, direction);
 }
 
-static void vring_unmap_one(const struct vring_virtqueue *vq,
-			    struct vring_desc *desc)
+static void vring_unmap_one(const struct vring_virtqueue *vq, void *_desc)
 {
+	u64 addr;
+	u32 len;
 	u16 flags;
 
 	if (!vring_use_dma_api(vq->vq.vdev))
 		return;
 
-	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+	if (vq->packed) {
+		struct vring_packed_desc *desc = _desc;
+
+		addr = virtio64_to_cpu(vq->vq.vdev, desc->addr);
+		len = virtio32_to_cpu(vq->vq.vdev, desc->len);
+		flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+	} else {
+		struct vring_desc *desc = _desc;
+
+		addr = virtio64_to_cpu(vq->vq.vdev, desc->addr);
+		len = virtio32_to_cpu(vq->vq.vdev, desc->len);
+		flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+	}
 
 	if (flags & VRING_DESC_F_INDIRECT) {
 		dma_unmap_single(vring_dma_dev(vq),
-				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
-				 virtio32_to_cpu(vq->vq.vdev, desc->len),
+				 addr, len,
 				 (flags & VRING_DESC_F_WRITE) ?
 				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
 	} else {
 		dma_unmap_page(vring_dma_dev(vq),
-			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
-			       virtio32_to_cpu(vq->vq.vdev, desc->len),
+			       addr, len,
 			       (flags & VRING_DESC_F_WRITE) ?
 			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
 	}
@@ -235,8 +263,9 @@ static int vring_mapping_error(const struct vring_virtqueue *vq,
 	return dma_mapping_error(vring_dma_dev(vq), addr);
 }
 
-static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
-					 unsigned int total_sg, gfp_t gfp)
+static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
+					       unsigned int total_sg,
+					       gfp_t gfp)
 {
 	struct vring_desc *desc;
 	unsigned int i;
@@ -257,14 +286,32 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
 	return desc;
 }
 
-static inline int virtqueue_add(struct virtqueue *_vq,
-				struct scatterlist *sgs[],
-				unsigned int total_sg,
-				unsigned int out_sgs,
-				unsigned int in_sgs,
-				void *data,
-				void *ctx,
-				gfp_t gfp)
+static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
+						       unsigned int total_sg,
+						       gfp_t gfp)
+{
+	struct vring_packed_desc *desc;
+
+	/*
+	 * We require lowmem mappings for the descriptors because
+	 * otherwise virt_to_phys will give us bogus addresses in the
+	 * virtqueue.
+	 */
+	gfp &= ~__GFP_HIGHMEM;
+
+	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
+
+	return desc;
+}
+
+static inline int virtqueue_add_split(struct virtqueue *_vq,
+				      struct scatterlist *sgs[],
+				      unsigned int total_sg,
+				      unsigned int out_sgs,
+				      unsigned int in_sgs,
+				      void *data,
+				      void *ctx,
+				      gfp_t gfp)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	struct scatterlist *sg;
@@ -303,7 +350,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	/* If the host supports indirect descriptor tables, and we have multiple
 	 * buffers, then go indirect. FIXME: tune this threshold */
 	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
-		desc = alloc_indirect(_vq, total_sg, gfp);
+		desc = alloc_indirect_split(_vq, total_sg, gfp);
 	else {
 		desc = NULL;
 		WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
@@ -437,6 +484,243 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 	return -EIO;
 }
 
+static inline int virtqueue_add_packed(struct virtqueue *_vq,
+				       struct scatterlist *sgs[],
+				       unsigned int total_sg,
+				       unsigned int out_sgs,
+				       unsigned int in_sgs,
+				       void *data,
+				       void *ctx,
+				       gfp_t gfp)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct vring_packed_desc *desc;
+	struct scatterlist *sg;
+	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
+	__virtio16 uninitialized_var(head_flags), flags;
+	int head, wrap_counter;
+	bool indirect;
+
+	START_USE(vq);
+
+	BUG_ON(data == NULL);
+	BUG_ON(ctx && vq->indirect);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return -EIO;
+	}
+
+	if (total_sg > 1 && !vq->chaining && !vq->indirect) {
+		END_USE(vq);
+		return -ENOTSUPP;
+	}
+
+#ifdef DEBUG
+	{
+		ktime_t now = ktime_get();
+
+		/* No kick or get, with .1 second between?  Warn. */
+		if (vq->last_add_time_valid)
+			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
+					    > 100);
+		vq->last_add_time = now;
+		vq->last_add_time_valid = true;
+	}
+#endif
+
+	BUG_ON(total_sg == 0);
+
+	head = vq->free_head;
+	wrap_counter = vq->wrap_counter;
+
+	/* If the host supports indirect descriptor tables, and we have multiple
+	 * buffers, then go indirect. FIXME: tune this threshold */
+	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
+		desc = alloc_indirect_packed(_vq, total_sg, gfp);
+	else {
+		desc = NULL;
+		WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
+	}
+
+	if (desc) {
+		/* Use a single buffer which doesn't continue */
+		indirect = true;
+		/* Set up rest to use this indirect table. */
+		i = 0;
+		descs_used = 1;
+	} else {
+		indirect = false;
+		desc = vq->vring_packed.desc;
+		i = head;
+		descs_used = total_sg;
+
+		if (total_sg > 1 && !vq->chaining) {
+			END_USE(vq);
+			return -ENOTSUPP;
+		}
+	}
+
+	if (vq->vq.num_free < descs_used) {
+		pr_debug("Can't add buf len %i - avail = %i\n",
+			 descs_used, vq->vq.num_free);
+		/* FIXME: for historical reasons, we force a notify here if
+		 * there are outgoing parts to the buffer.  Presumably the
+		 * host should service the ring ASAP. */
+		if (out_sgs)
+			vq->notify(&vq->vq);
+		if (indirect)
+			kfree(desc);
+		END_USE(vq);
+		return -ENOSPC;
+	}
+
+	for (n = 0; n < out_sgs; n++) {
+		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
+			if (vring_mapping_error(vq, addr))
+				goto unmap_release;
+
+			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
+					VRING_DESC_F_AVAIL(vq->wrap_counter) |
+					VRING_DESC_F_USED(!vq->wrap_counter));
+			if (!indirect && i == head)
+				head_flags = flags;
+			else
+				desc[i].flags = flags;
+
+			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
+			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
+			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
+			prev = i;
+			i++;
+			if (!indirect && i >= vq->vring_packed.num) {
+				i = 0;
+				vq->wrap_counter ^= 1;
+			}
+		}
+	}
+	for (; n < (out_sgs + in_sgs); n++) {
+		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
+			if (vring_mapping_error(vq, addr))
+				goto unmap_release;
+
+			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
+					VRING_DESC_F_WRITE |
+					VRING_DESC_F_AVAIL(vq->wrap_counter) |
+					VRING_DESC_F_USED(!vq->wrap_counter));
+			if (!indirect && i == head)
+				head_flags = flags;
+			else
+				desc[i].flags = flags;
+
+			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
+			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
+			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
+			prev = i;
+			i++;
+			if (!indirect && i >= vq->vring_packed.num) {
+				i = 0;
+				vq->wrap_counter ^= 1;
+			}
+		}
+	}
+	/* Last one doesn't continue. */
+	if (!indirect && (head + 1) % vq->vring_packed.num == i)
+		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
+	else
+		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
+
+	if (indirect) {
+		/* FIXME: to be implemented */
+
+		/* Now that the indirect table is filled in, map it. */
+		dma_addr_t addr = vring_map_single(
+			vq, desc, total_sg * sizeof(struct vring_packed_desc),
+			DMA_TO_DEVICE);
+		if (vring_mapping_error(vq, addr))
+			goto unmap_release;
+
+		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
+					     VRING_DESC_F_AVAIL(wrap_counter) |
+					     VRING_DESC_F_USED(!wrap_counter));
+		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
+		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
+				total_sg * sizeof(struct vring_packed_desc));
+		vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
+	}
+
+	/* We're using some buffers from the free list. */
+	vq->vq.num_free -= descs_used;
+
+	/* Update free pointer */
+	if (indirect) {
+		n = head + 1;
+		if (n >= vq->vring_packed.num) {
+			n = 0;
+			vq->wrap_counter ^= 1;
+		}
+		vq->free_head = n;
+	} else
+		vq->free_head = i;
+
+	/* Store token and indirect buffer state. */
+	vq->desc_state[head].num = descs_used;
+	vq->desc_state[head].data = data;
+	if (indirect)
+		vq->desc_state[head].indir_desc = desc;
+	else
+		vq->desc_state[head].indir_desc = ctx;
+
+	virtio_wmb(vq->weak_barriers);
+	vq->vring_packed.desc[head].flags = head_flags;
+	vq->num_added++;
+
+	pr_debug("Added buffer head %i to %p\n", head, vq);
+	END_USE(vq);
+
+	return 0;
+
+unmap_release:
+	err_idx = i;
+	i = head;
+
+	for (n = 0; n < total_sg; n++) {
+		if (i == err_idx)
+			break;
+		vring_unmap_one(vq, &desc[i]);
+		i++;
+		if (!indirect && i >= vq->vring_packed.num)
+			i = 0;
+	}
+
+	vq->wrap_counter = wrap_counter;
+
+	if (indirect)
+		kfree(desc);
+
+	END_USE(vq);
+	return -EIO;
+}
+
+static inline int virtqueue_add(struct virtqueue *_vq,
+				struct scatterlist *sgs[],
+				unsigned int total_sg,
+				unsigned int out_sgs,
+				unsigned int in_sgs,
+				void *data,
+				void *ctx,
+				gfp_t gfp)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
+						 in_sgs, data, ctx, gfp) :
+			    virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
+						in_sgs, data, ctx, gfp);
+}
+
 /**
  * virtqueue_add_sgs - expose buffers to other end
  * @vq: the struct virtqueue we're talking about.
@@ -561,6 +845,12 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
 	 * event. */
 	virtio_mb(vq->weak_barriers);
 
+	if (vq->packed) {
+		/* FIXME: to be implemented */
+		needs_kick = true;
+		goto out;
+	}
+
 	old = vq->avail_idx_shadow - vq->num_added;
 	new = vq->avail_idx_shadow;
 	vq->num_added = 0;
@@ -579,6 +869,8 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
 	} else {
 		needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
 	}
+
+out:
 	END_USE(vq);
 	return needs_kick;
 }
@@ -628,8 +920,8 @@ bool virtqueue_kick(struct virtqueue *vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_kick);
 
-static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
-		       void **ctx)
+static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
+			     void **ctx)
 {
 	unsigned int i, j;
 	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
@@ -677,29 +969,81 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
 	}
 }
 
-static inline bool more_used(const struct vring_virtqueue *vq)
+static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
+			      void **ctx)
+{
+	struct vring_packed_desc *desc;
+	unsigned int i, j;
+
+	/* Clear data ptr. */
+	vq->desc_state[head].data = NULL;
+
+	i = head;
+
+	for (j = 0; j < vq->desc_state[head].num; j++) {
+		desc = &vq->vring_packed.desc[i];
+		vring_unmap_one(vq, desc);
+		i++;
+		if (i >= vq->vring_packed.num)
+			i = 0;
+	}
+
+	vq->vq.num_free += vq->desc_state[head].num;
+
+	if (vq->indirect) {
+		u32 len;
+
+		desc = vq->desc_state[head].indir_desc;
+		/* Free the indirect table, if any, now that it's unmapped. */
+		if (!desc)
+			return;
+
+		len = virtio32_to_cpu(vq->vq.vdev,
+				      vq->vring_packed.desc[head].len);
+
+		BUG_ON(!(vq->vring_packed.desc[head].flags &
+			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
+		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
+
+		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
+			vring_unmap_one(vq, &desc[j]);
+
+		kfree(desc);
+		vq->desc_state[head].indir_desc = NULL;
+	} else if (ctx) {
+		*ctx = vq->desc_state[head].indir_desc;
+	}
+}
+
+static inline bool more_used_split(const struct vring_virtqueue *vq)
 {
 	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
 }
 
-/**
- * virtqueue_get_buf - get the next used buffer
- * @vq: the struct virtqueue we're talking about.
- * @len: the length written into the buffer
- *
- * If the device wrote data into the buffer, @len will be set to the
- * amount written.  This means you don't need to clear the buffer
- * beforehand to ensure there's no data leakage in the case of short
- * writes.
- *
- * Caller must ensure we don't call this with other virtqueue
- * operations at the same time (except where noted).
- *
- * Returns NULL if there are no used buffers, or the "data" token
- * handed to virtqueue_add_*().
- */
-void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
-			    void **ctx)
+static inline bool more_used_packed(const struct vring_virtqueue *vq)
+{
+	u16 last_used, flags;
+	bool avail, used;
+
+	if (vq->vq.num_free == vq->vring.num)
+		return false;
+
+	last_used = vq->last_used_idx;
+	flags = virtio16_to_cpu(vq->vq.vdev,
+				vq->vring_packed.desc[last_used].flags);
+	avail = flags & VRING_DESC_F_AVAIL(1);
+	used = flags & VRING_DESC_F_USED(1);
+
+	return avail == used;
+}
+
+static inline bool more_used(const struct vring_virtqueue *vq)
+{
+	return vq->packed ? more_used_packed(vq) : more_used_split(vq);
+}
+
+void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len,
+				  void **ctx)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	void *ret;
@@ -735,9 +1079,9 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
 		return NULL;
 	}
 
-	/* detach_buf clears data, so grab it now. */
+	/* detach_buf_split clears data, so grab it now. */
 	ret = vq->desc_state[i].data;
-	detach_buf(vq, i, ctx);
+	detach_buf_split(vq, i, ctx);
 	vq->last_used_idx++;
 	/* If we expect an interrupt for the next entry, tell host
 	 * by writing event index and flush out the write before
@@ -754,6 +1098,87 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
 	END_USE(vq);
 	return ret;
 }
+
+void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, unsigned int *len,
+				   void **ctx)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	void *ret;
+	unsigned int i;
+	u16 last_used;
+
+	START_USE(vq);
+
+	if (unlikely(vq->broken)) {
+		END_USE(vq);
+		return NULL;
+	}
+
+	if (!more_used(vq)) {
+		pr_debug("No more buffers in queue\n");
+		END_USE(vq);
+		return NULL;
+	}
+
+	/* Only get used array entries after they have been exposed by host. */
+	virtio_rmb(vq->weak_barriers);
+
+	last_used = vq->last_used_idx;
+
+	i = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
+	*len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
+
+	if (unlikely(i >= vq->vring_packed.num)) {
+		BAD_RING(vq, "id %u out of range\n", i);
+		return NULL;
+	}
+	if (unlikely(!vq->desc_state[i].data)) {
+		BAD_RING(vq, "id %u is not a head!\n", i);
+		return NULL;
+	}
+
+	/* detach_buf_packed clears data, so grab it now. */
+	ret = vq->desc_state[i].data;
+	detach_buf_packed(vq, i, ctx);
+
+	vq->last_used_idx += vq->desc_state[i].num;
+	if (vq->last_used_idx >= vq->vring_packed.num)
+		vq->last_used_idx %= vq->vring_packed.num;
+
+	// FIXME: implement the desc event support
+
+#ifdef DEBUG
+	vq->last_add_time_valid = false;
+#endif
+
+	END_USE(vq);
+	return ret;
+}
+
+/**
+ * virtqueue_get_buf - get the next used buffer
+ * @vq: the struct virtqueue we're talking about.
+ * @len: the length written into the buffer
+ *
+ * If the device wrote data into the buffer, @len will be set to the
+ * amount written.  This means you don't need to clear the buffer
+ * beforehand to ensure there's no data leakage in the case of short
+ * writes.
+ *
+ * Caller must ensure we don't call this with other virtqueue
+ * operations at the same time (except where noted).
+ *
+ * Returns NULL if there are no used buffers, or the "data" token
+ * handed to virtqueue_add_*().
+ */
+void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
+			    void **ctx)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	return vq->packed ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) :
+			    virtqueue_get_buf_ctx_split(_vq, len, ctx);
+}
 EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);
 
 void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
@@ -761,6 +1186,24 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 	return virtqueue_get_buf_ctx(_vq, len, NULL);
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_buf);
+
+static void virtqueue_disable_cb_split(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
+		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+		if (!vq->event)
+			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev,
+							vq->avail_flags_shadow);
+	}
+}
+
+static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
+{
+	// FIXME: to be implemented
+}
+
 /**
  * virtqueue_disable_cb - disable callbacks
  * @vq: the struct virtqueue we're talking about.
@@ -774,12 +1217,10 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
-	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
-		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
-	}
-
+	if (vq->packed)
+		virtqueue_disable_cb_packed(_vq);
+	else
+		virtqueue_disable_cb_split(_vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
 
@@ -802,6 +1243,12 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
 
 	START_USE(vq);
 
+	if (vq->packed) {
+		// FIXME: to be implemented
+		last_used_idx = vq->last_used_idx;
+		goto out;
+	}
+
 	/* We optimistically turn back on interrupts, then check if there was
 	 * more to do. */
 	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
@@ -813,6 +1260,7 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
 			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
 	}
 	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
+out:
 	END_USE(vq);
 	return last_used_idx;
 }
@@ -832,6 +1280,12 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
 	virtio_mb(vq->weak_barriers);
+	if (vq->packed) {
+		u16 flags = virtio16_to_cpu(vq->vq.vdev,
+				vq->vring_packed.desc[last_used_idx].flags);
+		return !(flags & VRING_DESC_F_AVAIL(1)) ==
+		       !(flags & VRING_DESC_F_USED(1));
+	}
 	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
 }
 EXPORT_SYMBOL_GPL(virtqueue_poll);
@@ -874,6 +1328,11 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
 
 	START_USE(vq);
 
+	if (vq->packed) {
+		// FIXME: to be implemented
+		goto out;
+	}
+
 	/* We optimistically turn back on interrupts, then check if there was
 	 * more to do. */
 	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
@@ -896,6 +1355,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
 		return false;
 	}
 
+out:
 	END_USE(vq);
 	return true;
 }
@@ -922,14 +1382,20 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
 			continue;
 		/* detach_buf clears data, so grab it now. */
 		buf = vq->desc_state[i].data;
-		detach_buf(vq, i, NULL);
-		vq->avail_idx_shadow--;
-		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
+		if (vq->packed)
+			detach_buf_packed(vq, i, NULL);
+		else {
+			detach_buf_split(vq, i, NULL);
+			vq->avail_idx_shadow--;
+			vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev,
+							vq->avail_idx_shadow);
+		}
 		END_USE(vq);
 		return buf;
 	}
 	/* That should have freed everything. */
-	BUG_ON(vq->vq.num_free != vq->vring.num);
+	BUG_ON(vq->vq.num_free != (vq->packed ? vq->vring_packed.num :
+						vq->vring.num));
 
 	END_USE(vq);
 	return NULL;
@@ -957,7 +1423,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
 EXPORT_SYMBOL_GPL(vring_interrupt);
 
 struct virtqueue *__vring_new_virtqueue(unsigned int index,
-					struct vring vring,
+					union vring_union vring,
+					bool packed,
 					struct virtio_device *vdev,
 					bool weak_barriers,
 					bool context,
@@ -965,19 +1432,20 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 					void (*callback)(struct virtqueue *),
 					const char *name)
 {
-	unsigned int i;
+	unsigned int num, i;
 	struct vring_virtqueue *vq;
 
-	vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
+	num = packed ? vring.vring_packed.num : vring.vring_split.num;
+
+	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
 		     GFP_KERNEL);
 	if (!vq)
 		return NULL;
 
-	vq->vring = vring;
 	vq->vq.callback = callback;
 	vq->vq.vdev = vdev;
 	vq->vq.name = name;
-	vq->vq.num_free = vring.num;
+	vq->vq.num_free = num;
 	vq->vq.index = index;
 	vq->we_own_ring = false;
 	vq->queue_dma_addr = 0;
@@ -986,9 +1454,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 	vq->weak_barriers = weak_barriers;
 	vq->broken = false;
 	vq->last_used_idx = 0;
-	vq->avail_flags_shadow = 0;
-	vq->avail_idx_shadow = 0;
 	vq->num_added = 0;
+	vq->packed = packed;
 	list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
 	vq->in_use = false;
@@ -999,18 +1466,41 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 		!context;
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
+	if (vq->packed) {
+		vq->vring_packed = vring.vring_packed;
+		vq->free_head = 0;
+		vq->wrap_counter = 1;
+
+#if 0
+		vq->chaining = virtio_has_feature(vdev,
+						  VIRTIO_RING_F_LIST_DESC);
+#else
+		vq->chaining = true;
+#endif
+	} else {
+		vq->vring = vring.vring_split;
+		vq->avail_flags_shadow = 0;
+		vq->avail_idx_shadow = 0;
+
+		/* Put everything in free lists. */
+		vq->free_head = 0;
+		for (i = 0; i < num-1; i++)
+			vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
+	}
+
 	/* No callback?  Tell other side not to bother us. */
 	if (!callback) {
-		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
-		if (!vq->event)
-			vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
+		if (packed) {
+			// FIXME: to be implemented
+		} else {
+			vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+			if (!vq->event)
+				vq->vring.avail->flags = cpu_to_virtio16(vdev,
+						vq->avail_flags_shadow);
+		}
 	}
 
-	/* Put everything in free lists. */
-	vq->free_head = 0;
-	for (i = 0; i < vring.num-1; i++)
-		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
-	memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
+	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
 
 	return &vq->vq;
 }
@@ -1058,6 +1548,14 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
 	}
 }
 
+static inline int
+__vring_size(unsigned int num, unsigned long align, bool packed)
+{
+	if (packed)
+		return vring_packed_size(num, align);
+	return vring_size(num, align);
+}
+
 struct virtqueue *vring_create_virtqueue(
 	unsigned int index,
 	unsigned int num,
@@ -1074,7 +1572,8 @@ struct virtqueue *vring_create_virtqueue(
 	void *queue = NULL;
 	dma_addr_t dma_addr;
 	size_t queue_size_in_bytes;
-	struct vring vring;
+	union vring_union vring;
+	bool packed;
 
 	/* We assume num is a power of 2. */
 	if (num & (num - 1)) {
@@ -1082,9 +1581,13 @@ struct virtqueue *vring_create_virtqueue(
 		return NULL;
 	}
 
+	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
+
 	/* TODO: allocate each queue chunk individually */
-	for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
-		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+	for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
+			num /= 2) {
+		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
+							     packed),
 					  &dma_addr,
 					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
 		if (queue)
@@ -1096,17 +1599,21 @@ struct virtqueue *vring_create_virtqueue(
 
 	if (!queue) {
 		/* Try to get a single page. You are my only hope! */
-		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
+							     packed),
 					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
 	}
 	if (!queue)
 		return NULL;
 
-	queue_size_in_bytes = vring_size(num, vring_align);
-	vring_init(&vring, num, queue, vring_align);
+	queue_size_in_bytes = __vring_size(num, vring_align, packed);
+	if (packed)
+		vring_packed_init(&vring.vring_packed, num, queue, vring_align);
+	else
+		vring_init(&vring.vring_split, num, queue, vring_align);
 
-	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
-				   notify, callback, name);
+	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
+				   context, notify, callback, name);
 	if (!vq) {
 		vring_free_queue(vdev, queue_size_in_bytes, queue,
 				 dma_addr);
@@ -1132,10 +1639,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 				      void (*callback)(struct virtqueue *vq),
 				      const char *name)
 {
-	struct vring vring;
-	vring_init(&vring, num, pages, vring_align);
-	return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
-				     notify, callback, name);
+	union vring_union vring;
+	bool packed;
+
+	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
+	if (packed)
+		vring_packed_init(&vring.vring_packed, num, pages, vring_align);
+	else
+		vring_init(&vring.vring_split, num, pages, vring_align);
+
+	return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
+				     context, notify, callback, name);
 }
 EXPORT_SYMBOL_GPL(vring_new_virtqueue);
 
@@ -1145,7 +1659,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
 
 	if (vq->we_own_ring) {
 		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
-				 vq->vring.desc, vq->queue_dma_addr);
+				 vq->packed ? (void *)vq->vring_packed.desc :
+					      (void *)vq->vring.desc,
+				 vq->queue_dma_addr);
 	}
 	list_del(&_vq->list);
 	kfree(vq);
@@ -1159,14 +1675,18 @@ void vring_transport_features(struct virtio_device *vdev)
 
 	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
 		switch (i) {
+#if 0 // FIXME: to be implemented
 		case VIRTIO_RING_F_INDIRECT_DESC:
 			break;
+#endif
 		case VIRTIO_RING_F_EVENT_IDX:
 			break;
 		case VIRTIO_F_VERSION_1:
 			break;
 		case VIRTIO_F_IOMMU_PLATFORM:
 			break;
+		case VIRTIO_F_RING_PACKED:
+			break;
 		default:
 			/* We don't understand this bit. */
 			__virtio_clear_bit(vdev, i);
@@ -1187,7 +1707,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
 
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
-	return vq->vring.num;
+	return vq->packed ? vq->vring_packed.num : vq->vring.num;
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
 
@@ -1224,6 +1744,7 @@ dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);
 
+/* Only available for split ring */
 dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
@@ -1235,6 +1756,7 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);
 
+/* Only available for split ring */
 dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
@@ -1246,6 +1768,7 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
 
+/* Only available for split ring */
 const struct vring *virtqueue_get_vring(struct virtqueue *vq)
 {
 	return &to_vvq(vq)->vring;
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index bbf32524ab27..a0075894ad16 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
 struct virtio_device;
 struct virtqueue;
 
+union vring_union {
+	struct vring vring_split;
+	struct vring_packed vring_packed;
+};
+
 /*
  * Creates a virtqueue and allocates the descriptor ring.  If
  * may_reduce_num is set, then this may allocate a smaller ring than
@@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
 
 /* Creates a virtqueue with a custom layout. */
 struct virtqueue *__vring_new_virtqueue(unsigned int index,
-					struct vring vring,
+					union vring_union vring,
+					bool packed,
 					struct virtio_device *vdev,
 					bool weak_barriers,
 					bool ctx,
-- 
2.14.1

^ permalink raw reply related

* [PATCH RFC 1/2] virtio: introduce packed ring defines
From: Tiwei Bie @ 2018-02-23 11:18 UTC (permalink / raw)
  To: mst, virtualization, linux-kernel, netdev; +Cc: wexu
In-Reply-To: <20180223111801.15240-1-tiwei.bie@intel.com>

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 include/uapi/linux/virtio_config.h | 18 +++++++++-
 include/uapi/linux/virtio_ring.h   | 68 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 308e2096291f..e3d077ef5207 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -49,7 +49,7 @@
  * transport being used (eg. virtio_ring), the rest are per-device feature
  * bits. */
 #define VIRTIO_TRANSPORT_F_START	28
-#define VIRTIO_TRANSPORT_F_END		34
+#define VIRTIO_TRANSPORT_F_END		37
 
 #ifndef VIRTIO_CONFIG_NO_LEGACY
 /* Do we get callbacks when the ring is completely used, even if we've
@@ -71,4 +71,20 @@
  * this is for compatibility with legacy systems.
  */
 #define VIRTIO_F_IOMMU_PLATFORM		33
+
+/* This feature indicates support for the packed virtqueue layout. */
+#define VIRTIO_F_RING_PACKED		34
+
+/*
+ * This feature indicates that all buffers are used by the device
+ * in the same order in which they have been made available.
+ */
+#define VIRTIO_F_IN_ORDER		35
+
+/*
+ * This feature indicates that drivers pass extra data (besides
+ * identifying the Virtqueue) in their device notifications.
+ */
+#define VIRTIO_F_NOTIFICATION_DATA	36
+
 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5faa989b..77b1d4aeef72 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -44,6 +44,9 @@
 /* This means the buffer contains a list of buffer descriptors. */
 #define VRING_DESC_F_INDIRECT	4
 
+#define VRING_DESC_F_AVAIL(b)	((b) << 7)
+#define VRING_DESC_F_USED(b)	((b) << 15)
+
 /* The Host uses this in used->flags to advise the Guest: don't kick me when
  * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
  * will still kick if it's out of buffers. */
@@ -104,6 +107,36 @@ struct vring {
 	struct vring_used *used;
 };
 
+struct vring_packed_desc_event {
+	/* Descriptor Event Offset */
+	__virtio16 desc_event_off   : 15,
+	/* Descriptor Event Wrap Counter */
+		   desc_event_wrap  : 1;
+	/* Descriptor Event Flags */
+	__virtio16 desc_event_flags : 2;
+};
+
+struct vring_packed_desc {
+	/* Buffer Address. */
+	__virtio64 addr;
+	/* Buffer Length. */
+	__virtio32 len;
+	/* Buffer ID. */
+	__virtio16 id;
+	/* The flags depending on descriptor type. */
+	__virtio16 flags;
+};
+
+struct vring_packed {
+	unsigned int num;
+
+	struct vring_packed_desc *desc;
+
+	struct vring_packed_desc_event *driver;
+
+	struct vring_packed_desc_event *device;
+};
+
 /* Alignment requirements for vring elements.
  * When using pre-virtio 1.0 layout, these fall out naturally.
  */
@@ -171,4 +204,39 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
 	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
 }
 
+/* The standard layout for the packed ring is a continuous chunk of memory
+ * which looks like this.
+ *
+ * struct vring_packed
+ * {
+ *	// The actual descriptors (16 bytes each)
+ *	struct vring_packed_desc desc[num];
+ *
+ *	// Padding to the next align boundary.
+ *	char pad[];
+ *
+ *	// Driver Event Suppression
+ *	struct vring_packed_desc_event driver;
+ *
+ *	// Device Event Suppression
+ *	struct vring_packed_desc_event device;
+ * };
+ */
+
+static inline void vring_packed_init(struct vring_packed *vr, unsigned int num,
+				     void *p, unsigned long align)
+{
+	vr->num = num;
+	vr->desc = p;
+	vr->driver = (void *)(((uintptr_t)p + sizeof(struct vring_packed_desc)
+		* num + align - 1) & ~(align - 1));
+	vr->device = vr->driver + 1;
+}
+
+static inline unsigned vring_packed_size(unsigned int num, unsigned long align)
+{
+	return ((sizeof(struct vring_packed_desc) * num + align - 1)
+		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
+}
+
 #endif /* _UAPI_LINUX_VIRTIO_RING_H */
-- 
2.14.1

^ permalink raw reply related

* [PATCH RFC 0/2] Packed ring for virtio
From: Tiwei Bie @ 2018-02-23 11:17 UTC (permalink / raw)
  To: mst, virtualization, linux-kernel, netdev; +Cc: wexu

Hello everyone,

This RFC implements a subset of packed ring which is described at
https://github.com/oasis-tcs/virtio-docs/blob/master/virtio-v1.1-packed-wd08.pdf

The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
Minor changes are needed for the vhost code, e.g. to kick the guest.

It's not a complete implementation, here is what's missing:

- Device area and driver area
- VIRTIO_RING_F_INDIRECT_DESC
- VIRTIO_F_NOTIFICATION_DATA
- Virtio devices except net are not tested
- See FIXME in the code for more details

Thanks!

Best regards,
Tiwei Bie

Tiwei Bie (2):
  virtio: introduce packed ring defines
  virtio_ring: support packed ring

 drivers/virtio/virtio_ring.c       | 699 ++++++++++++++++++++++++++++++++-----
 include/linux/virtio_ring.h        |   8 +-
 include/uapi/linux/virtio_config.h |  18 +-
 include/uapi/linux/virtio_ring.h   |  68 ++++
 4 files changed, 703 insertions(+), 90 deletions(-)

-- 
2.14.1

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Alexander Duyck @ 2018-02-22 21:30 UTC (permalink / raw)
  To: Jiri Pirko, Stephen Hemminger
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Samudrala, Sridhar, virtualization, Siwei Liu,
	Netdev, David Miller
In-Reply-To: <20180222081115.GC1994@nanopsycho>

On Thu, Feb 22, 2018 at 12:11 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Wed, Feb 21, 2018 at 09:57:09PM CET, alexander.duyck@gmail.com wrote:
>>On Wed, Feb 21, 2018 at 11:38 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>> Wed, Feb 21, 2018 at 06:56:35PM CET, alexander.duyck@gmail.com wrote:
>>>>On Wed, Feb 21, 2018 at 8:58 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>>> Wed, Feb 21, 2018 at 05:49:49PM CET, alexander.duyck@gmail.com wrote:
>>>>>>On Wed, Feb 21, 2018 at 8:11 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>>>>> Wed, Feb 21, 2018 at 04:56:48PM CET, alexander.duyck@gmail.com wrote:
>>>>>>>>On Wed, Feb 21, 2018 at 1:51 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>>>>>>> Tue, Feb 20, 2018 at 11:33:56PM CET, kubakici@wp.pl wrote:
>>>>>>>>>>On Tue, 20 Feb 2018 21:14:10 +0100, Jiri Pirko wrote:
>>>>>>>>>>> Yeah, I can see it now :( I guess that the ship has sailed and we are
>>>>>>>>>>> stuck with this ugly thing forever...
>>>>>>>>>>>
>>>>>>>>>>> Could you at least make some common code that is shared in between
>>>>>>>>>>> netvsc and virtio_net so this is handled in exacly the same way in both?
>>>>>>>>>>
>>>>>>>>>>IMHO netvsc is a vendor specific driver which made a mistake on what
>>>>>>>>>>behaviour it provides (or tried to align itself with Windows SR-IOV).
>>>>>>>>>>Let's not make a far, far more commonly deployed and important driver
>>>>>>>>>>(virtio) bug-compatible with netvsc.
>>>>>>>>>
>>>>>>>>> Yeah. netvsc solution is a dangerous precedent here and in my opinition
>>>>>>>>> it was a huge mistake to merge it. I personally would vote to unmerge it
>>>>>>>>> and make the solution based on team/bond.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>To Jiri's initial comments, I feel the same way, in fact I've talked to
>>>>>>>>>>the NetworkManager guys to get auto-bonding based on MACs handled in
>>>>>>>>>>user space.  I think it may very well get done in next versions of NM,
>>>>>>>>>>but isn't done yet.  Stephen also raised the point that not everybody is
>>>>>>>>>>using NM.
>>>>>>>>>
>>>>>>>>> Can be done in NM, networkd or other network management tools.
>>>>>>>>> Even easier to do this in teamd and let them all benefit.
>>>>>>>>>
>>>>>>>>> Actually, I took a stab to implement this in teamd. Took me like an hour
>>>>>>>>> and half.
>>>>>>>>>
>>>>>>>>> You can just run teamd with config option "kidnap" like this:
>>>>>>>>> # teamd/teamd -c '{"kidnap": true }'
>>>>>>>>>
>>>>>>>>> Whenever teamd sees another netdev to appear with the same mac as his,
>>>>>>>>> or whenever teamd sees another netdev to change mac to his,
>>>>>>>>> it enslaves it.
>>>>>>>>>
>>>>>>>>> Here's the patch (quick and dirty):
>>>>>>>>>
>>>>>>>>> Subject: [patch teamd] teamd: introduce kidnap feature
>>>>>>>>>
>>>>>>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>>>>>>
>>>>>>>>So this doesn't really address the original problem we were trying to
>>>>>>>>solve. You asked earlier why the netdev name mattered and it mostly
>>>>>>>>has to do with configuration. Specifically what our patch is
>>>>>>>>attempting to resolve is the issue of how to allow a cloud provider to
>>>>>>>>upgrade their customer to SR-IOV support and live migration without
>>>>>>>>requiring them to reconfigure their guest. So the general idea with
>>>>>>>>our patch is to take a VM that is running with virtio_net only and
>>>>>>>>allow it to instead spawn a virtio_bypass master using the same netdev
>>>>>>>>name as the original virtio, and then have the virtio_net and VF come
>>>>>>>>up and be enslaved by the bypass interface. Doing it this way we can
>>>>>>>>allow for multi-vendor SR-IOV live migration support using a guest
>>>>>>>>that was originally configured for virtio only.
>>>>>>>>
>>>>>>>>The problem with your solution is we already have teaming and bonding
>>>>>>>>as you said. There is already a write-up from Red Hat on how to do it
>>>>>>>>(https://access.redhat.com/documentation/en-us/red_hat_virtualization/4.1/html/virtual_machine_management_guide/sect-migrating_virtual_machines_between_hosts).
>>>>>>>>That is all well and good as long as you are willing to keep around
>>>>>>>>two VM images, one for virtio, and one for SR-IOV with live migration.
>>>>>>>
>>>>>>> You don't need 2 images. You need only one. The one with the team setup.
>>>>>>> That's it. If another netdev with the same mac appears, teamd will
>>>>>>> enslave it and run traffic on it. If not, ok, you'll go only through
>>>>>>> virtio_net.
>>>>>>
>>>>>>Isn't that going to cause the routing table to get messed up when we
>>>>>>rearrange the netdevs? We don't want to have an significant disruption
>>>>>> in traffic when we are adding/removing the VF. It seems like we would
>>>>>>need to invalidate any entries that were configured for the virtio_net
>>>>>>and reestablish them on the new team interface. Part of the criteria
>>>>>>we have been working with is that we should be able to transition from
>>>>>>having a VF to not or vice versa without seeing any significant
>>>>>>disruption in the traffic.
>>>>>
>>>>> What? You have routes on the team netdev. virtio_net and VF are only
>>>>> slaves. What are you talking about? I don't get it :/
>>>>
>>>>So lets walk though this by example. The general idea of the base case
>>>>for all this is somebody starting with virtio_net, we will call the
>>>>interface "ens1" for now. It comes up and is assigned a dhcp address
>>>>and everything works as expected. Now in order to get better
>>>>performance we want to add a VF "ens2", but we don't want a new IP
>>>>address. Now if I understand correctly what will happen is that when
>>>>"ens2" appears on the system teamd will then create a new team
>>>>interface "team0". Before teamd can enslave ens1 it has to down the
>>>
>>> No, you don't understand that correctly.
>>>
>>> There is always ens1 and team0. ens1 is a slave of team0. team0 is the
>>> interface to use, to set ip on etc.
>>>
>>> When ens2 appears, it gets enslaved to team0 as well.
>>>
>>>
>>>>interface if I understand things correctly. This means that we have to
>>>>disrupt network traffic in order for this to work.
>>>>
>>>>To give you an idea of where we were before this became about trying
>>>>to do this in the team or bonding driver, we were debating a 2 netdev
>>>>model versus a 3 netdev model. I will call out the model and the
>>>>advantages/disadvantages of those below.
>>>>
>>>>2 Netdev model, "ens1", enslaves "ens2".
>>>>- Requires dropping in-driver XDP in order to work (won't capture VF
>>>>traffic otherwise)
>>>>- VF takes performance hit for extra qdisc/Tx queue lock of virtio_net interface
>>>>- If you ass-u-me (I haven't been a fan of this model if you can't
>>>>tell) that it is okay to rip out in-driver XDP from virtio_net, then
>>>>you could transition between base virtio, virtio w/ backup bit set.
>>>>- Works for netvsc because they limit their features (no in-driver
>>>>XDP) to guarantee this works.
>>>>
>>>>3 Netdev model, "ens1", enslaves "ens1nbackup" and "ens2"
>>>>- Exposes 2 netdevs "ens1" and "ens1nbackup" when only virtio is present
>>>>- No extra qdisc or locking
>>>>- All virtio_net original functionality still present
>>>>- Not able to transition from virtio to virtio w/ backup without
>>>>disruption (requires hot-plug)
>>>>
>>>>The way I see it the only way your team setup could work would be
>>>>something closer to the 3 netdev model. Basically we would be
>>>>requiring the user to always have the team0 present in order to make
>>>>certain that anything like XDP would be run on the team interface
>>>>instead of assuming that the virtio_net could run by itself. I will
>>>>add it as a third option here to compare to the other 2.
>>>
>>> Yes.
>>>
>>>
>>>>
>>>>3 Netdev "team" model, "team0", enslaves "ens1" and "ens2"
>>>>- Requires guest to configure teamd
>>>>- Exposes "team0" and "ens1" when only virtio is present
>>>>- No extra qdisc or locking
>>>>- Doesn't require "backup" bit in virtio
>>>>
>>>>>>
>>>>>>Also how does this handle any static configuration? I am assuming that
>>>>>>everything here assumes the team will be brought up as soon as it is
>>>>>>seen and assigned a DHCP address.
>>>>>
>>>>> Again. You configure whatever you need on the team netdev.
>>>>
>>>>Just so we are clear, are you then saying that the team0 interface
>>>>will always be present with this configuration? You had made it sound
>>>
>>> Of course.
>>>
>>>
>>>>like it would disappear if you didn't have at least 2 interfaces.
>>>
>>> Where did I make it sound like that? No.
>>
>>I think it was a bit of misspeak/misread specifically I am thinking of:
>>  You don't need 2 images. You need only one. The one with the
>>  team setup. That's it. If another netdev with the same mac appears,
>>  teamd will enslave it and run traffic on it. If not, ok, you'll go only
>>  through virtio_net.
>>
>>I read that as there being no team if the VF wasn't present since you
>>would still be going through team and then virtio_net otherwise.
>
> team netdev is always there.
>
>
>>
>>>
>>>>
>>>>>>
>>>>>>The solution as you have proposed seems problematic at best. I don't
>>>>>>see how the team solution works without introducing some sort of
>>>>>>traffic disruption to either add/remove the VF and bring up/tear down
>>>>>>the team interface. At that point we might as well just give up on
>>>>>>this piece of live migration support entirely since the disruption was
>>>>>>what we were trying to avoid. We might as well just hotplug out the VF
>>>>>>and hotplug in a virtio at the same bus device and function number and
>>>>>>just let udev take care of renaming it for us. The idea was supposed
>>>>>>to be a seamless transition between the two interfaces.
>>>>>
>>>>> Alex. What you are trying to do in this patchset and what netvsc does it
>>>>> essentialy in-driver bonding. Same thing mechanism, rx_handler,
>>>>> everything. I don't really understand what are you talking about. With
>>>>> use of team you will get exactly the same behaviour.
>>>>
>>>>So the goal of the "in-driver bonding" is to make the bonding as
>>>>non-intrusive as possible and require as little user intervention as
>>>>possible. I agree that much of the handling is the same, however the
>>>>control structure and requirements are significantly different. That
>>>>has been what I have been trying to explain. You keep wanting to use
>>>>the existing structures, but they don't really apply cleanly because
>>>>they push control for the interface up into the guest, and that
>>>>doesn't make much sense in the case of virtualization. What is
>>>>happening here is that we are exposing a bond that the guest should
>>>>have no control over, or at least as little as possible. In addition
>>>>making the user have to add additional configuration in the guest
>>>>means that there is that much more that can go wrong if they screw it
>>>>up.
>>>>
>>>>The other problem here is that the transition needs to be as seamless
>>>>as possible between just a standard virtio_net setup and this new
>>>>setup. With either the team or bonding setup you end up essentially
>>>>forcing the guest to have the bond/team always there even if they are
>>>>running only a single interface. Only if they "upgrade" the VM by
>>>>adding a VF then it finally gets to do anything.
>>>
>>> Yeah. There is certainly a dilemma. We have to choose between
>>> 1) weird and hackish in-driver semi-bonding that would be simple
>>>    for user.
>>> 2) the standard way that would be perhaps slighly more complicated
>>>    for user.
>>
>>The problem is for us option 2 is quite a bit uglier. Basically it
>>means essentially telling all the distros and such that their cloud
>>images have to use team by default on all virtio_net interfaces. It
>>pretty much means we have to throw away this as a possible solution
>>since you are requiring guest changes that most customers/OS vendors
>>would ever accept.
>>
>>At least with our solution it was the driver making use of the
>>functionality if a given feature bit was set. The teaming solution as
>>proposed doesn't even give us that option.
>
> I understand your motivation.
>
>
>>
>>>>
>>>>What this comes down to for us is the following requirements:
>>>>1. The name of the interface cannot change when going from virtio_net,
>>>>to virtio_net being bypassed using a VF. We cannot create an interface
>>>>on top of the interface, if anything we need to push the original
>>>>virtio_net out of the way so that the new team interface takes its
>>>>place in the configuration of the system. Otherwise a VM with VF w/
>>>>live migration will require a different configuration than one that
>>>>just runs virtio_net.
>>>
>>> Team driver netdev is still the same, no name changes.
>>
>>Right. Basically we need to have the renaming occur so that any
>>existing config gets moved to the upper interface instead of having to
>>rely on configuration being adjusted for the team interface.
>
> The initial name of team netdevice is totally up to you.
>
>
>>
>>>>2. We need some way to signal if this VM should be running in an
>>>>"upgraded" mode or not. We have been using the backup bit in
>>>>virtio_net to do that. If it isn't "upgraded" then we don't need the
>>>>team/bond and we can just run with virtio_net.
>>>
>>> I don't see why the team cannot be there always.
>>
>>It is more the logistical nightmare. Part of the goal here was to work
>>with the cloud base images that are out there such as
>>https://alt.fedoraproject.org/cloud/. With just the kernel changes the
>>overhead for this stays fairly small and would be pulled in as just a
>>standard part of the kernel update process. The virtio bypass only
>>pops up if the backup bit is present. With the team solution it
>>requires that the base image use the team driver on virtio_net when it
>>sees one. I doubt the OSVs would want to do that just because SR-IOV
>>isn't that popular of a case.
>
> Again, I undertand your motivation. Yet I don't like your solution.
> But if the decision is made to do this in-driver bonding. I would like
> to see it baing done some generic way:
> 1) share the same "in-driver bonding core" code with netvsc
>    put to net/core.
> 2) the "in-driver bonding core" will strictly limit the functionality,
>    like active-backup mode only, one vf, one backup, vf netdev type
>    check (so noone could enslave a tap or anything else)
> If user would need something more, he should employ team/bond.

I'll have to do some research and get back to you with our final
decision on this. There was some internal resistance to splitting out
this code as a separate module, but I think it would need to happen in
order to support multiple drivers.

Also I would be curious how Stephen feels about this. Would the
sharing of the dev, and the use of the phys_port_name on the
base/backup netdev work for netvsc? It seems like it should get them
performance gains on the VF, but I am not sure if there are any
specific requirements that mandated that they had to have 2 netdevs.

Thanks.

- Alex

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Samudrala, Sridhar @ 2018-02-22  3:28 UTC (permalink / raw)
  To: Siwei Liu, Alexander Duyck
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Netdev, virtualization, David Miller
In-Reply-To: <0d158bf6-79b3-442b-2c61-3e900ff40922@intel.com>

On 2/21/2018 6:35 PM, Samudrala, Sridhar wrote:
> On 2/21/2018 5:59 PM, Siwei Liu wrote:
>> On Wed, Feb 21, 2018 at 4:17 PM, Alexander Duyck
>> <alexander.duyck@gmail.com> wrote:
>>> On Wed, Feb 21, 2018 at 3:50 PM, Siwei Liu <loseweigh@gmail.com> wrote:
>>>> I haven't checked emails for days and did not realize the new revision
>>>> had already came out. And thank you for the effort, this revision
>>>> really looks to be a step forward towards our use case and is close to
>>>> what we wanted to do. A few questions in line.
>>>>
>>>> On Sat, Feb 17, 2018 at 9:12 AM, Alexander Duyck
>>>> <alexander.duyck@gmail.com> wrote:
>>>>> On Fri, Feb 16, 2018 at 6:38 PM, Jakub Kicinski <kubakici@wp.pl> 
>>>>> wrote:
>>>>>> On Fri, 16 Feb 2018 10:11:19 -0800, Sridhar Samudrala wrote:
>>>>>>> Ppatch 2 is in response to the community request for a 3 netdev
>>>>>>> solution.  However, it creates some issues we'll get into in a 
>>>>>>> moment.
>>>>>>> It extends virtio_net to use alternate datapath when available and
>>>>>>> registered. When BACKUP feature is enabled, virtio_net driver 
>>>>>>> creates
>>>>>>> an additional 'bypass' netdev that acts as a master device and 
>>>>>>> controls
>>>>>>> 2 slave devices.  The original virtio_net netdev is registered as
>>>>>>> 'backup' netdev and a passthru/vf device with the same MAC gets
>>>>>>> registered as 'active' netdev. Both 'bypass' and 'backup' 
>>>>>>> netdevs are
>>>>>>> associated with the same 'pci' device.  The user accesses the 
>>>>>>> network
>>>>>>> interface via 'bypass' netdev. The 'bypass' netdev chooses 
>>>>>>> 'active' netdev
>>>>>>> as default for transmits when it is available with link up and 
>>>>>>> running.
>>>>>> Thank you do doing this.
>>>>>>
>>>>>>> We noticed a couple of issues with this approach during testing.
>>>>>>> - As both 'bypass' and 'backup' netdevs are associated with the 
>>>>>>> same
>>>>>>>    virtio pci device, udev tries to rename both of them with the 
>>>>>>> same name
>>>>>>>    and the 2nd rename will fail. This would be OK as long as the 
>>>>>>> first netdev
>>>>>>>    to be renamed is the 'bypass' netdev, but the order in which 
>>>>>>> udev gets
>>>>>>>    to rename the 2 netdevs is not reliable.
>>>>>> Out of curiosity - why do you link the master netdev to the virtio
>>>>>> struct device?
>>>>> The basic idea of all this is that we wanted this to work with an
>>>>> existing VM image that was using virtio. As such we were trying to
>>>>> make it so that the bypass interface takes the place of the original
>>>>> virtio and get udev to rename the bypass to what the original
>>>>> virtio_net was.
>>>> Could it made it also possible to take over the config from VF instead
>>>> of virtio on an existing VM image? And get udev rename the bypass
>>>> netdev to what the original VF was. I don't say tightly binding the
>>>> bypass master to only virtio or VF, but I think we should provide both
>>>> options to support different upgrade paths. Possibly we could tweak
>>>> the device tree layout to reuse the same PCI slot for the master
>>>> bypass netdev, such that udev would not get confused when renaming the
>>>> device. The VF needs to use a different function slot afterwards.
>>>> Perhaps we might need to a special multiseat like QEMU device for that
>>>> purpose?
>>>>
>>>> Our case we'll upgrade the config from VF to virtio-bypass directly.
>>> So if I am understanding what you are saying you are wanting to flip
>>> the backup interface from the virtio to a VF. The problem is that
>>> becomes a bit of a vendor lock-in solution since it would rely on a
>>> specific VF driver. I would agree with Jiri that we don't want to go
>>> down that path. We don't want every VF out there firing up its own
>>> separate bond. Ideally you want the hypervisor to be able to manage
>>> all of this which is why it makes sense to have virtio manage this and
>>> why this is associated with the virtio_net interface.
>> No, that's not what I was talking about of course. I thought you
>> mentioned the upgrade scenario this patch would like to address is to
>> use the bypass interface "to take the place of the original virtio,
>> and get udev to rename the bypass to what the original virtio_net
>> was". That is one of the possible upgrade paths for sure. However the
>> upgrade path I was seeking is to use the bypass interface to take the
>> place of original VF interface while retaining the name and network
>> configs, which generally can be done simply with kernel upgrade. It
>> would become limiting as this patch makes the bypass interface share
>> the same virtio pci device with virito backup. Can this bypass
>> interface be made general to take place of any pci device other than
>> virtio-net? This will be more helpful as the cloud users who has
>> existing setup on VF interface don't have to recreate it on virtio-net
>> and VF separately again.
>
> Yes. This sounds interesting. Looks like you want an existing VM image 
> with
> VF only configuration to get transparent live migration support by adding
> virtio_net with BACKUP feature.  We may need another feature bit to 
> switch
> between these 2 options.

After thinking some more, this may be more involved than adding a new
feature bit.  This requires a netdev created by virtio to take over the 
name of
a VF netdev associated with a PCI device that may not be plugged in when
the virtio driver is coming up. This definitely requires some new messages
exchanged across the virtio control queue to pass the PCI device info of the
VF netdev.

>
>
>>
>>> The other bits get into more complexity then we are ready to handle
>>> for now. I think I might have talked about something similar that I
>>> was referring to as a "virtio-bond" where you would have a PCI/PCIe
>>> tree topology that makes this easier to sort out, and the "virtio-bond
>>> would be used to handle coordination/configuration of a much more
>>> complex interface.
>> That was one way to solve this problem but I'd like to see simple ways
>> to sort it out.
>>
>>>>>> FWIW two solutions that immediately come to mind is to export 
>>>>>> "backup"
>>>>>> as phys_port_name of the backup virtio link and/or assign a name 
>>>>>> to the
>>>>>> master like you are doing already.  I think team uses team%d and 
>>>>>> bond
>>>>>> uses bond%d, soft naming of master devices seems quite natural in 
>>>>>> this
>>>>>> case.
>>>>> I figured I had overlooked something like that.. Thanks for pointing
>>>>> this out. Okay so I think the phys_port_name approach might resolve
>>>>> the original issue. If I am reading things correctly what we end up
>>>>> with is the master showing up as "ens1" for example and the backup
>>>>> showing up as "ens1nbackup". Am I understanding that right?
>>>>>
>>>>> The problem with the team/bond%d approach is that it creates a new
>>>>> netdevice and so it would require guest configuration changes.
>>>>>
>>>>>> IMHO phys_port_name == "backup" if BACKUP bit is set on slave virtio
>>>>>> link is quite neat.
>>>>> I agree. For non-"backup" virio_net devices would it be okay for 
>>>>> us to
>>>>> just return -EOPNOTSUPP? I assume it would be and that way the legacy
>>>>> behavior could be maintained although the function still exists.
>>>>>
>>>>>>> - When the 'active' netdev is unplugged OR not present on a 
>>>>>>> destination
>>>>>>>    system after live migration, the user will see 2 virtio_net 
>>>>>>> netdevs.
>>>>>> That's necessary and expected, all configuration applies to the 
>>>>>> master
>>>>>> so master must exist.
>>>>> With the naming issue resolved this is the only item left 
>>>>> outstanding.
>>>>> This becomes a matter of form vs function.
>>>>>
>>>>> The main complaint about the "3 netdev" solution is a bit 
>>>>> confusing to
>>>>> have the 2 netdevs present if the VF isn't there. The idea is that
>>>>> having the extra "master" netdev there if there isn't really a 
>>>>> bond is
>>>>> a bit ugly.
>>>> Is it this uglier in terms of user experience rather than
>>>> functionality? I don't want it dynamically changed between 2-netdev
>>>> and 3-netdev depending on the presence of VF. That gets back to my
>>>> original question and suggestion earlier: why not just hide the lower
>>>> netdevs from udev renaming and such? Which important observability
>>>> benefits users may get if exposing the lower netdevs?
>>>>
>>>> Thanks,
>>>> -Siwei
>>> The only real advantage to a 2 netdev solution is that it looks like
>>> the netvsc solution, however it doesn't behave like it since there are
>>> some features like XDP that may not function correctly if they are
>>> left enabled in the virtio_net interface.
>>>
>>> As far as functionality the advantage of not hiding the lower devices
>>> is that they are free to be managed. The problem with pushing all of
>>> the configuration into the upper device is that you are limited to the
>>> intersection of the features of the lower devices. This can be
>>> limiting for some setups as some VFs support things like more queues,
>>> or better interrupt moderation options than others so trying to make
>>> everything work with one config would be ugly.
>> It depends on how you build it and the way you expect it to work. IMHO
>> the lower devices don't need to be directly managed at all, otherwise
>> it ends up with loss of configuration across migration, and it really
>> does not bring much value than having a general team or bond device.
>> Users still have to reconfigure those queue settings and interrupt
>> moderation options after all. The new upper device could take the
>> assumption that the VF/PT lower device always has superior feature set
>> than virtio-net in order to apply advanced configuration. The upper
>> device should remember all configurations previously done and apply
>> supporting ones to active device automatically when switching the
>> datapath.
>>
> It should be possible to extend this patchset to support migration of 
> additional
> settings  by enabling additional ndo_ops and ethtool_ops on the upper dev
> and propagating them down the lower devices and replaying the settings 
> after
> the VF is replugged after migration.
>
> Thanks
> Sridhar

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Samudrala, Sridhar @ 2018-02-22  2:35 UTC (permalink / raw)
  To: Siwei Liu, Alexander Duyck
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Netdev, virtualization, David Miller
In-Reply-To: <CADGSJ23WFYEYo2DpSPVfZsOLjoSpgWWVCtEZi84wrUHN-noxag@mail.gmail.com>

On 2/21/2018 5:59 PM, Siwei Liu wrote:
> On Wed, Feb 21, 2018 at 4:17 PM, Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
>> On Wed, Feb 21, 2018 at 3:50 PM, Siwei Liu <loseweigh@gmail.com> wrote:
>>> I haven't checked emails for days and did not realize the new revision
>>> had already came out. And thank you for the effort, this revision
>>> really looks to be a step forward towards our use case and is close to
>>> what we wanted to do. A few questions in line.
>>>
>>> On Sat, Feb 17, 2018 at 9:12 AM, Alexander Duyck
>>> <alexander.duyck@gmail.com> wrote:
>>>> On Fri, Feb 16, 2018 at 6:38 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>>>>> On Fri, 16 Feb 2018 10:11:19 -0800, Sridhar Samudrala wrote:
>>>>>> Ppatch 2 is in response to the community request for a 3 netdev
>>>>>> solution.  However, it creates some issues we'll get into in a moment.
>>>>>> It extends virtio_net to use alternate datapath when available and
>>>>>> registered. When BACKUP feature is enabled, virtio_net driver creates
>>>>>> an additional 'bypass' netdev that acts as a master device and controls
>>>>>> 2 slave devices.  The original virtio_net netdev is registered as
>>>>>> 'backup' netdev and a passthru/vf device with the same MAC gets
>>>>>> registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>>>>>> associated with the same 'pci' device.  The user accesses the network
>>>>>> interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>>>>>> as default for transmits when it is available with link up and running.
>>>>> Thank you do doing this.
>>>>>
>>>>>> We noticed a couple of issues with this approach during testing.
>>>>>> - As both 'bypass' and 'backup' netdevs are associated with the same
>>>>>>    virtio pci device, udev tries to rename both of them with the same name
>>>>>>    and the 2nd rename will fail. This would be OK as long as the first netdev
>>>>>>    to be renamed is the 'bypass' netdev, but the order in which udev gets
>>>>>>    to rename the 2 netdevs is not reliable.
>>>>> Out of curiosity - why do you link the master netdev to the virtio
>>>>> struct device?
>>>> The basic idea of all this is that we wanted this to work with an
>>>> existing VM image that was using virtio. As such we were trying to
>>>> make it so that the bypass interface takes the place of the original
>>>> virtio and get udev to rename the bypass to what the original
>>>> virtio_net was.
>>> Could it made it also possible to take over the config from VF instead
>>> of virtio on an existing VM image? And get udev rename the bypass
>>> netdev to what the original VF was. I don't say tightly binding the
>>> bypass master to only virtio or VF, but I think we should provide both
>>> options to support different upgrade paths. Possibly we could tweak
>>> the device tree layout to reuse the same PCI slot for the master
>>> bypass netdev, such that udev would not get confused when renaming the
>>> device. The VF needs to use a different function slot afterwards.
>>> Perhaps we might need to a special multiseat like QEMU device for that
>>> purpose?
>>>
>>> Our case we'll upgrade the config from VF to virtio-bypass directly.
>> So if I am understanding what you are saying you are wanting to flip
>> the backup interface from the virtio to a VF. The problem is that
>> becomes a bit of a vendor lock-in solution since it would rely on a
>> specific VF driver. I would agree with Jiri that we don't want to go
>> down that path. We don't want every VF out there firing up its own
>> separate bond. Ideally you want the hypervisor to be able to manage
>> all of this which is why it makes sense to have virtio manage this and
>> why this is associated with the virtio_net interface.
> No, that's not what I was talking about of course. I thought you
> mentioned the upgrade scenario this patch would like to address is to
> use the bypass interface "to take the place of the original virtio,
> and get udev to rename the bypass to what the original virtio_net
> was". That is one of the possible upgrade paths for sure. However the
> upgrade path I was seeking is to use the bypass interface to take the
> place of original VF interface while retaining the name and network
> configs, which generally can be done simply with kernel upgrade. It
> would become limiting as this patch makes the bypass interface share
> the same virtio pci device with virito backup. Can this bypass
> interface be made general to take place of any pci device other than
> virtio-net? This will be more helpful as the cloud users who has
> existing setup on VF interface don't have to recreate it on virtio-net
> and VF separately again.

Yes. This sounds interesting. Looks like you want an existing VM image with
VF only configuration to get transparent live migration support by adding
virtio_net with BACKUP feature.  We may need another feature bit to switch
between these 2 options.


>
>> The other bits get into more complexity then we are ready to handle
>> for now. I think I might have talked about something similar that I
>> was referring to as a "virtio-bond" where you would have a PCI/PCIe
>> tree topology that makes this easier to sort out, and the "virtio-bond
>> would be used to handle coordination/configuration of a much more
>> complex interface.
> That was one way to solve this problem but I'd like to see simple ways
> to sort it out.
>
>>>>> FWIW two solutions that immediately come to mind is to export "backup"
>>>>> as phys_port_name of the backup virtio link and/or assign a name to the
>>>>> master like you are doing already.  I think team uses team%d and bond
>>>>> uses bond%d, soft naming of master devices seems quite natural in this
>>>>> case.
>>>> I figured I had overlooked something like that.. Thanks for pointing
>>>> this out. Okay so I think the phys_port_name approach might resolve
>>>> the original issue. If I am reading things correctly what we end up
>>>> with is the master showing up as "ens1" for example and the backup
>>>> showing up as "ens1nbackup". Am I understanding that right?
>>>>
>>>> The problem with the team/bond%d approach is that it creates a new
>>>> netdevice and so it would require guest configuration changes.
>>>>
>>>>> IMHO phys_port_name == "backup" if BACKUP bit is set on slave virtio
>>>>> link is quite neat.
>>>> I agree. For non-"backup" virio_net devices would it be okay for us to
>>>> just return -EOPNOTSUPP? I assume it would be and that way the legacy
>>>> behavior could be maintained although the function still exists.
>>>>
>>>>>> - When the 'active' netdev is unplugged OR not present on a destination
>>>>>>    system after live migration, the user will see 2 virtio_net netdevs.
>>>>> That's necessary and expected, all configuration applies to the master
>>>>> so master must exist.
>>>> With the naming issue resolved this is the only item left outstanding.
>>>> This becomes a matter of form vs function.
>>>>
>>>> The main complaint about the "3 netdev" solution is a bit confusing to
>>>> have the 2 netdevs present if the VF isn't there. The idea is that
>>>> having the extra "master" netdev there if there isn't really a bond is
>>>> a bit ugly.
>>> Is it this uglier in terms of user experience rather than
>>> functionality? I don't want it dynamically changed between 2-netdev
>>> and 3-netdev depending on the presence of VF. That gets back to my
>>> original question and suggestion earlier: why not just hide the lower
>>> netdevs from udev renaming and such? Which important observability
>>> benefits users may get if exposing the lower netdevs?
>>>
>>> Thanks,
>>> -Siwei
>> The only real advantage to a 2 netdev solution is that it looks like
>> the netvsc solution, however it doesn't behave like it since there are
>> some features like XDP that may not function correctly if they are
>> left enabled in the virtio_net interface.
>>
>> As far as functionality the advantage of not hiding the lower devices
>> is that they are free to be managed. The problem with pushing all of
>> the configuration into the upper device is that you are limited to the
>> intersection of the features of the lower devices. This can be
>> limiting for some setups as some VFs support things like more queues,
>> or better interrupt moderation options than others so trying to make
>> everything work with one config would be ugly.
> It depends on how you build it and the way you expect it to work. IMHO
> the lower devices don't need to be directly managed at all, otherwise
> it ends up with loss of configuration across migration, and it really
> does not bring much value than having a general team or bond device.
> Users still have to reconfigure those queue settings and interrupt
> moderation options after all. The new upper device could take the
> assumption that the VF/PT lower device always has superior feature set
> than virtio-net in order to apply advanced configuration. The upper
> device should remember all configurations previously done and apply
> supporting ones to active device automatically when switching the
> datapath.
>
It should be possible to extend this patchset to support migration of 
additional
settings  by enabling additional ndo_ops and ethtool_ops on the upper dev
and propagating them down the lower devices and replaying the settings after
the VF is replugged after migration.

Thanks
Sridhar
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Samudrala, Sridhar @ 2018-02-22  2:15 UTC (permalink / raw)
  To: Jakub Kicinski, Alexander Duyck
  Cc: Duyck, Alexander H, virtio-dev, Jiri Pirko, Michael S. Tsirkin,
	Netdev, virtualization, Siwei Liu, David Miller
In-Reply-To: <20180221180213.30885283@cakuba.netronome.com>

On 2/21/2018 6:02 PM, Jakub Kicinski wrote:
> On Wed, 21 Feb 2018 12:57:09 -0800, Alexander Duyck wrote:
>>> I don't see why the team cannot be there always.
>> It is more the logistical nightmare. Part of the goal here was to work
>> with the cloud base images that are out there such as
>> https://alt.fedoraproject.org/cloud/. With just the kernel changes the
>> overhead for this stays fairly small and would be pulled in as just a
>> standard part of the kernel update process. The virtio bypass only
>> pops up if the backup bit is present. With the team solution it
>> requires that the base image use the team driver on virtio_net when it
>> sees one. I doubt the OSVs would want to do that just because SR-IOV
>> isn't that popular of a case.
> IIUC we need to monitor for a "backup hint", spawn the master, rename it
> to maintain backwards compatibility with no-VF setups and enslave the VF
> if it appears.
>
> All those sound possible from user space, the advantage of the kernel
> solution right now is that it has more complete code.
>
> Am I misunderstanding?

I think there is some misunderstanding about the exact requirement and 
the usecase
we are trying to solve.  If the Guest is allowed to do this 
configuration, we already have
a solution with either bond/team based user space configuration.

This is to enable cloud service providers to provide a accelerated 
datapath by simply
letting to tenants to get their own images with the only requirement to 
enable their
kernels with newer virtio_net driver with BACKUP support and the VF driver.

To recap from an earlier thread, here is a response from Stephen that 
talks about the
requirement for the netvsc solution and we would like to provide similar 
solution for
KVM based cloud deployments.

 > The requirement with Azure accelerated network was that a stock 
distribution image from the
 > store must be able to run unmodified and get accelerated networking.
 >  Not sure if other environments need to work the same, but it would 
be nice.
 >  That meant no additional setup scripts (aka no bonding) and also it must
 > work transparently with hot-plug. Also there are diverse set of 
environments:
 > openstack, cloudinit, network manager and systemd. The solution had 
to not depend
 > on any one of them, but also not break any of them.

Thanks
Sridhar
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jakub Kicinski @ 2018-02-22  2:02 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Duyck, Alexander H, virtio-dev, Jiri Pirko, Michael S. Tsirkin,
	Samudrala, Sridhar, virtualization, Siwei Liu, Netdev,
	David Miller
In-Reply-To: <CAKgT0Ud--R7Mw1_jFckskhGmGLZgZbTZ24TuvJ+UWvm_MAQR9w@mail.gmail.com>

On Wed, 21 Feb 2018 12:57:09 -0800, Alexander Duyck wrote:
> > I don't see why the team cannot be there always.  
> 
> It is more the logistical nightmare. Part of the goal here was to work
> with the cloud base images that are out there such as
> https://alt.fedoraproject.org/cloud/. With just the kernel changes the
> overhead for this stays fairly small and would be pulled in as just a
> standard part of the kernel update process. The virtio bypass only
> pops up if the backup bit is present. With the team solution it
> requires that the base image use the team driver on virtio_net when it
> sees one. I doubt the OSVs would want to do that just because SR-IOV
> isn't that popular of a case.

IIUC we need to monitor for a "backup hint", spawn the master, rename it
to maintain backwards compatibility with no-VF setups and enslave the VF
if it appears.

All those sound possible from user space, the advantage of the kernel
solution right now is that it has more complete code.

Am I misunderstanding?

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Siwei Liu @ 2018-02-22  1:59 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Sridhar Samudrala, virtualization, Netdev,
	David Miller
In-Reply-To: <CAKgT0Uda7Bmzf06RMmD7hCVLr8hHQxaY8Fk7w9Qy=7Vi6NODwQ@mail.gmail.com>

On Wed, Feb 21, 2018 at 4:17 PM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> On Wed, Feb 21, 2018 at 3:50 PM, Siwei Liu <loseweigh@gmail.com> wrote:
>> I haven't checked emails for days and did not realize the new revision
>> had already came out. And thank you for the effort, this revision
>> really looks to be a step forward towards our use case and is close to
>> what we wanted to do. A few questions in line.
>>
>> On Sat, Feb 17, 2018 at 9:12 AM, Alexander Duyck
>> <alexander.duyck@gmail.com> wrote:
>>> On Fri, Feb 16, 2018 at 6:38 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>>>> On Fri, 16 Feb 2018 10:11:19 -0800, Sridhar Samudrala wrote:
>>>>> Ppatch 2 is in response to the community request for a 3 netdev
>>>>> solution.  However, it creates some issues we'll get into in a moment.
>>>>> It extends virtio_net to use alternate datapath when available and
>>>>> registered. When BACKUP feature is enabled, virtio_net driver creates
>>>>> an additional 'bypass' netdev that acts as a master device and controls
>>>>> 2 slave devices.  The original virtio_net netdev is registered as
>>>>> 'backup' netdev and a passthru/vf device with the same MAC gets
>>>>> registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>>>>> associated with the same 'pci' device.  The user accesses the network
>>>>> interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>>>>> as default for transmits when it is available with link up and running.
>>>>
>>>> Thank you do doing this.
>>>>
>>>>> We noticed a couple of issues with this approach during testing.
>>>>> - As both 'bypass' and 'backup' netdevs are associated with the same
>>>>>   virtio pci device, udev tries to rename both of them with the same name
>>>>>   and the 2nd rename will fail. This would be OK as long as the first netdev
>>>>>   to be renamed is the 'bypass' netdev, but the order in which udev gets
>>>>>   to rename the 2 netdevs is not reliable.
>>>>
>>>> Out of curiosity - why do you link the master netdev to the virtio
>>>> struct device?
>>>
>>> The basic idea of all this is that we wanted this to work with an
>>> existing VM image that was using virtio. As such we were trying to
>>> make it so that the bypass interface takes the place of the original
>>> virtio and get udev to rename the bypass to what the original
>>> virtio_net was.
>>
>> Could it made it also possible to take over the config from VF instead
>> of virtio on an existing VM image? And get udev rename the bypass
>> netdev to what the original VF was. I don't say tightly binding the
>> bypass master to only virtio or VF, but I think we should provide both
>> options to support different upgrade paths. Possibly we could tweak
>> the device tree layout to reuse the same PCI slot for the master
>> bypass netdev, such that udev would not get confused when renaming the
>> device. The VF needs to use a different function slot afterwards.
>> Perhaps we might need to a special multiseat like QEMU device for that
>> purpose?
>>
>> Our case we'll upgrade the config from VF to virtio-bypass directly.
>
> So if I am understanding what you are saying you are wanting to flip
> the backup interface from the virtio to a VF. The problem is that
> becomes a bit of a vendor lock-in solution since it would rely on a
> specific VF driver. I would agree with Jiri that we don't want to go
> down that path. We don't want every VF out there firing up its own
> separate bond. Ideally you want the hypervisor to be able to manage
> all of this which is why it makes sense to have virtio manage this and
> why this is associated with the virtio_net interface.

No, that's not what I was talking about of course. I thought you
mentioned the upgrade scenario this patch would like to address is to
use the bypass interface "to take the place of the original virtio,
and get udev to rename the bypass to what the original virtio_net
was". That is one of the possible upgrade paths for sure. However the
upgrade path I was seeking is to use the bypass interface to take the
place of original VF interface while retaining the name and network
configs, which generally can be done simply with kernel upgrade. It
would become limiting as this patch makes the bypass interface share
the same virtio pci device with virito backup. Can this bypass
interface be made general to take place of any pci device other than
virtio-net? This will be more helpful as the cloud users who has
existing setup on VF interface don't have to recreate it on virtio-net
and VF separately again.

>
> The other bits get into more complexity then we are ready to handle
> for now. I think I might have talked about something similar that I
> was referring to as a "virtio-bond" where you would have a PCI/PCIe
> tree topology that makes this easier to sort out, and the "virtio-bond
> would be used to handle coordination/configuration of a much more
> complex interface.

That was one way to solve this problem but I'd like to see simple ways
to sort it out.

>
>>>
>>>> FWIW two solutions that immediately come to mind is to export "backup"
>>>> as phys_port_name of the backup virtio link and/or assign a name to the
>>>> master like you are doing already.  I think team uses team%d and bond
>>>> uses bond%d, soft naming of master devices seems quite natural in this
>>>> case.
>>>
>>> I figured I had overlooked something like that.. Thanks for pointing
>>> this out. Okay so I think the phys_port_name approach might resolve
>>> the original issue. If I am reading things correctly what we end up
>>> with is the master showing up as "ens1" for example and the backup
>>> showing up as "ens1nbackup". Am I understanding that right?
>>>
>>> The problem with the team/bond%d approach is that it creates a new
>>> netdevice and so it would require guest configuration changes.
>>>
>>>> IMHO phys_port_name == "backup" if BACKUP bit is set on slave virtio
>>>> link is quite neat.
>>>
>>> I agree. For non-"backup" virio_net devices would it be okay for us to
>>> just return -EOPNOTSUPP? I assume it would be and that way the legacy
>>> behavior could be maintained although the function still exists.
>>>
>>>>> - When the 'active' netdev is unplugged OR not present on a destination
>>>>>   system after live migration, the user will see 2 virtio_net netdevs.
>>>>
>>>> That's necessary and expected, all configuration applies to the master
>>>> so master must exist.
>>>
>>> With the naming issue resolved this is the only item left outstanding.
>>> This becomes a matter of form vs function.
>>>
>>> The main complaint about the "3 netdev" solution is a bit confusing to
>>> have the 2 netdevs present if the VF isn't there. The idea is that
>>> having the extra "master" netdev there if there isn't really a bond is
>>> a bit ugly.
>>
>> Is it this uglier in terms of user experience rather than
>> functionality? I don't want it dynamically changed between 2-netdev
>> and 3-netdev depending on the presence of VF. That gets back to my
>> original question and suggestion earlier: why not just hide the lower
>> netdevs from udev renaming and such? Which important observability
>> benefits users may get if exposing the lower netdevs?
>>
>> Thanks,
>> -Siwei
>
> The only real advantage to a 2 netdev solution is that it looks like
> the netvsc solution, however it doesn't behave like it since there are
> some features like XDP that may not function correctly if they are
> left enabled in the virtio_net interface.
>
> As far as functionality the advantage of not hiding the lower devices
> is that they are free to be managed. The problem with pushing all of
> the configuration into the upper device is that you are limited to the
> intersection of the features of the lower devices. This can be
> limiting for some setups as some VFs support things like more queues,
> or better interrupt moderation options than others so trying to make
> everything work with one config would be ugly.

It depends on how you build it and the way you expect it to work. IMHO
the lower devices don't need to be directly managed at all, otherwise
it ends up with loss of configuration across migration, and it really
does not bring much value than having a general team or bond device.
Users still have to reconfigure those queue settings and interrupt
moderation options after all. The new upper device could take the
assumption that the VF/PT lower device always has superior feature set
than virtio-net in order to apply advanced configuration. The upper
device should remember all configurations previously done and apply
supporting ones to active device automatically when switching the
datapath.

Regards,
-Siwei

>
> - Alex

^ permalink raw reply

* Re: [PATCH 3/4] iommu/virtio: Add event queue
From: kbuild test robot @ 2018-02-22  1:35 UTC (permalink / raw)
  To: Jean-Philippe Brucker
  Cc: tnowicki, mst, will.deacon, virtualization, jintack,
	eric.auger.pro, virtio-dev, jayachandran.nair, lorenzo.pieralisi,
	kvm, joro, kvmarm, marc.zyngier, eric.auger, iommu, kbuild-all,
	robin.murphy
In-Reply-To: <20180214145340.1223-4-jean-philippe.brucker@arm.com>

[-- Attachment #1: Type: text/plain, Size: 2426 bytes --]

Hi Jean-Philippe,

I love your patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.16-rc2 next-20180221]
[cannot apply to iommu/next]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Jean-Philippe-Brucker/Add-virtio-iommu-driver/20180217-075417
config: parisc-allmodconfig (attached as .config)
compiler: hppa-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=parisc 

All errors (new ones prefixed by >>):

   drivers/iommu/virtio-iommu.o: In function `virtio_iommu_drv_init':
   (.init.text+0x24): undefined reference to `register_virtio_driver'
   drivers/iommu/virtio-iommu.o: In function `viommu_send_reqs_sync':
   (.text.viommu_send_reqs_sync+0xdc): undefined reference to `virtqueue_add_sgs'
   (.text.viommu_send_reqs_sync+0x12c): undefined reference to `virtqueue_kick'
   (.text.viommu_send_reqs_sync+0x29c): undefined reference to `virtqueue_get_buf'
   drivers/iommu/virtio-iommu.o: In function `viommu_event_handler':
>> (.text.viommu_event_handler+0x288): undefined reference to `virtqueue_add_inbuf'
>> (.text.viommu_event_handler+0x2a8): undefined reference to `virtqueue_get_buf'
>> (.text.viommu_event_handler+0x2b8): undefined reference to `virtqueue_kick'
   drivers/iommu/virtio-iommu.o: In function `viommu_probe':
   (.text.viommu_probe+0x1a0): undefined reference to `virtio_check_driver_offered_feature'
   (.text.viommu_probe+0x248): undefined reference to `virtio_check_driver_offered_feature'
   (.text.viommu_probe+0x2ec): undefined reference to `virtio_check_driver_offered_feature'
   (.text.viommu_probe+0x328): undefined reference to `virtio_check_driver_offered_feature'
>> (.text.viommu_probe+0x428): undefined reference to `virtqueue_add_inbuf'
>> (.text.viommu_probe+0x440): undefined reference to `virtqueue_kick'
   drivers/iommu/virtio-iommu.o: In function `virtio_iommu_drv_exit':
   (.exit.text+0x18): undefined reference to `unregister_virtio_driver'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 52870 bytes --]

[-- Attachment #3: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Alexander Duyck @ 2018-02-22  0:17 UTC (permalink / raw)
  To: Siwei Liu
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Sridhar Samudrala, virtualization, Netdev,
	David Miller
In-Reply-To: <CADGSJ217HahFJsGmNXMD6SwWiK374+u_YET8QqdsmofCRFZr=Q@mail.gmail.com>

On Wed, Feb 21, 2018 at 3:50 PM, Siwei Liu <loseweigh@gmail.com> wrote:
> I haven't checked emails for days and did not realize the new revision
> had already came out. And thank you for the effort, this revision
> really looks to be a step forward towards our use case and is close to
> what we wanted to do. A few questions in line.
>
> On Sat, Feb 17, 2018 at 9:12 AM, Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
>> On Fri, Feb 16, 2018 at 6:38 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>>> On Fri, 16 Feb 2018 10:11:19 -0800, Sridhar Samudrala wrote:
>>>> Ppatch 2 is in response to the community request for a 3 netdev
>>>> solution.  However, it creates some issues we'll get into in a moment.
>>>> It extends virtio_net to use alternate datapath when available and
>>>> registered. When BACKUP feature is enabled, virtio_net driver creates
>>>> an additional 'bypass' netdev that acts as a master device and controls
>>>> 2 slave devices.  The original virtio_net netdev is registered as
>>>> 'backup' netdev and a passthru/vf device with the same MAC gets
>>>> registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>>>> associated with the same 'pci' device.  The user accesses the network
>>>> interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>>>> as default for transmits when it is available with link up and running.
>>>
>>> Thank you do doing this.
>>>
>>>> We noticed a couple of issues with this approach during testing.
>>>> - As both 'bypass' and 'backup' netdevs are associated with the same
>>>>   virtio pci device, udev tries to rename both of them with the same name
>>>>   and the 2nd rename will fail. This would be OK as long as the first netdev
>>>>   to be renamed is the 'bypass' netdev, but the order in which udev gets
>>>>   to rename the 2 netdevs is not reliable.
>>>
>>> Out of curiosity - why do you link the master netdev to the virtio
>>> struct device?
>>
>> The basic idea of all this is that we wanted this to work with an
>> existing VM image that was using virtio. As such we were trying to
>> make it so that the bypass interface takes the place of the original
>> virtio and get udev to rename the bypass to what the original
>> virtio_net was.
>
> Could it made it also possible to take over the config from VF instead
> of virtio on an existing VM image? And get udev rename the bypass
> netdev to what the original VF was. I don't say tightly binding the
> bypass master to only virtio or VF, but I think we should provide both
> options to support different upgrade paths. Possibly we could tweak
> the device tree layout to reuse the same PCI slot for the master
> bypass netdev, such that udev would not get confused when renaming the
> device. The VF needs to use a different function slot afterwards.
> Perhaps we might need to a special multiseat like QEMU device for that
> purpose?
>
> Our case we'll upgrade the config from VF to virtio-bypass directly.

So if I am understanding what you are saying you are wanting to flip
the backup interface from the virtio to a VF. The problem is that
becomes a bit of a vendor lock-in solution since it would rely on a
specific VF driver. I would agree with Jiri that we don't want to go
down that path. We don't want every VF out there firing up its own
separate bond. Ideally you want the hypervisor to be able to manage
all of this which is why it makes sense to have virtio manage this and
why this is associated with the virtio_net interface.

The other bits get into more complexity then we are ready to handle
for now. I think I might have talked about something similar that I
was referring to as a "virtio-bond" where you would have a PCI/PCIe
tree topology that makes this easier to sort out, and the "virtio-bond
would be used to handle coordination/configuration of a much more
complex interface.

>>
>>> FWIW two solutions that immediately come to mind is to export "backup"
>>> as phys_port_name of the backup virtio link and/or assign a name to the
>>> master like you are doing already.  I think team uses team%d and bond
>>> uses bond%d, soft naming of master devices seems quite natural in this
>>> case.
>>
>> I figured I had overlooked something like that.. Thanks for pointing
>> this out. Okay so I think the phys_port_name approach might resolve
>> the original issue. If I am reading things correctly what we end up
>> with is the master showing up as "ens1" for example and the backup
>> showing up as "ens1nbackup". Am I understanding that right?
>>
>> The problem with the team/bond%d approach is that it creates a new
>> netdevice and so it would require guest configuration changes.
>>
>>> IMHO phys_port_name == "backup" if BACKUP bit is set on slave virtio
>>> link is quite neat.
>>
>> I agree. For non-"backup" virio_net devices would it be okay for us to
>> just return -EOPNOTSUPP? I assume it would be and that way the legacy
>> behavior could be maintained although the function still exists.
>>
>>>> - When the 'active' netdev is unplugged OR not present on a destination
>>>>   system after live migration, the user will see 2 virtio_net netdevs.
>>>
>>> That's necessary and expected, all configuration applies to the master
>>> so master must exist.
>>
>> With the naming issue resolved this is the only item left outstanding.
>> This becomes a matter of form vs function.
>>
>> The main complaint about the "3 netdev" solution is a bit confusing to
>> have the 2 netdevs present if the VF isn't there. The idea is that
>> having the extra "master" netdev there if there isn't really a bond is
>> a bit ugly.
>
> Is it this uglier in terms of user experience rather than
> functionality? I don't want it dynamically changed between 2-netdev
> and 3-netdev depending on the presence of VF. That gets back to my
> original question and suggestion earlier: why not just hide the lower
> netdevs from udev renaming and such? Which important observability
> benefits users may get if exposing the lower netdevs?
>
> Thanks,
> -Siwei

The only real advantage to a 2 netdev solution is that it looks like
the netvsc solution, however it doesn't behave like it since there are
some features like XDP that may not function correctly if they are
left enabled in the virtio_net interface.

As far as functionality the advantage of not hiding the lower devices
is that they are free to be managed. The problem with pushing all of
the configuration into the upper device is that you are limited to the
intersection of the features of the lower devices. This can be
limiting for some setups as some VFs support things like more queues,
or better interrupt moderation options than others so trying to make
everything work with one config would be ugly.

- Alex

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Siwei Liu @ 2018-02-21 23:50 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Sridhar Samudrala, virtualization, Netdev,
	David Miller
In-Reply-To: <CAKgT0UfU9gV2Y7hWFQi0vJoD9kYeoq+7cXmwAjXeV5zxXHa3dQ@mail.gmail.com>

I haven't checked emails for days and did not realize the new revision
had already came out. And thank you for the effort, this revision
really looks to be a step forward towards our use case and is close to
what we wanted to do. A few questions in line.

On Sat, Feb 17, 2018 at 9:12 AM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> On Fri, Feb 16, 2018 at 6:38 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>> On Fri, 16 Feb 2018 10:11:19 -0800, Sridhar Samudrala wrote:
>>> Ppatch 2 is in response to the community request for a 3 netdev
>>> solution.  However, it creates some issues we'll get into in a moment.
>>> It extends virtio_net to use alternate datapath when available and
>>> registered. When BACKUP feature is enabled, virtio_net driver creates
>>> an additional 'bypass' netdev that acts as a master device and controls
>>> 2 slave devices.  The original virtio_net netdev is registered as
>>> 'backup' netdev and a passthru/vf device with the same MAC gets
>>> registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>>> associated with the same 'pci' device.  The user accesses the network
>>> interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>>> as default for transmits when it is available with link up and running.
>>
>> Thank you do doing this.
>>
>>> We noticed a couple of issues with this approach during testing.
>>> - As both 'bypass' and 'backup' netdevs are associated with the same
>>>   virtio pci device, udev tries to rename both of them with the same name
>>>   and the 2nd rename will fail. This would be OK as long as the first netdev
>>>   to be renamed is the 'bypass' netdev, but the order in which udev gets
>>>   to rename the 2 netdevs is not reliable.
>>
>> Out of curiosity - why do you link the master netdev to the virtio
>> struct device?
>
> The basic idea of all this is that we wanted this to work with an
> existing VM image that was using virtio. As such we were trying to
> make it so that the bypass interface takes the place of the original
> virtio and get udev to rename the bypass to what the original
> virtio_net was.

Could it made it also possible to take over the config from VF instead
of virtio on an existing VM image? And get udev rename the bypass
netdev to what the original VF was. I don't say tightly binding the
bypass master to only virtio or VF, but I think we should provide both
options to support different upgrade paths. Possibly we could tweak
the device tree layout to reuse the same PCI slot for the master
bypass netdev, such that udev would not get confused when renaming the
device. The VF needs to use a different function slot afterwards.
Perhaps we might need to a special multiseat like QEMU device for that
purpose?

Our case we'll upgrade the config from VF to virtio-bypass directly.

>
>> FWIW two solutions that immediately come to mind is to export "backup"
>> as phys_port_name of the backup virtio link and/or assign a name to the
>> master like you are doing already.  I think team uses team%d and bond
>> uses bond%d, soft naming of master devices seems quite natural in this
>> case.
>
> I figured I had overlooked something like that.. Thanks for pointing
> this out. Okay so I think the phys_port_name approach might resolve
> the original issue. If I am reading things correctly what we end up
> with is the master showing up as "ens1" for example and the backup
> showing up as "ens1nbackup". Am I understanding that right?
>
> The problem with the team/bond%d approach is that it creates a new
> netdevice and so it would require guest configuration changes.
>
>> IMHO phys_port_name == "backup" if BACKUP bit is set on slave virtio
>> link is quite neat.
>
> I agree. For non-"backup" virio_net devices would it be okay for us to
> just return -EOPNOTSUPP? I assume it would be and that way the legacy
> behavior could be maintained although the function still exists.
>
>>> - When the 'active' netdev is unplugged OR not present on a destination
>>>   system after live migration, the user will see 2 virtio_net netdevs.
>>
>> That's necessary and expected, all configuration applies to the master
>> so master must exist.
>
> With the naming issue resolved this is the only item left outstanding.
> This becomes a matter of form vs function.
>
> The main complaint about the "3 netdev" solution is a bit confusing to
> have the 2 netdevs present if the VF isn't there. The idea is that
> having the extra "master" netdev there if there isn't really a bond is
> a bit ugly.

Is it this uglier in terms of user experience rather than
functionality? I don't want it dynamically changed between 2-netdev
and 3-netdev depending on the presence of VF. That gets back to my
original question and suggestion earlier: why not just hide the lower
netdevs from udev renaming and such? Which important observability
benefits users may get if exposing the lower netdevs?

Thanks,
-Siwei

>
> The downside of the "2 netdev" solution is that you have to deal with
> an extra layer of locking/queueing to get to the VF and you lose some
> functionality since things like in-driver XDP have to be disabled in
> order to maintain the same functionality when the VF is present or
> not. However it looks more like classic virtio_net when the VF is not
> present.

^ permalink raw reply

* Re: [PATCH 1/4] iommu: Add virtio-iommu driver
From: kbuild test robot @ 2018-02-21 21:08 UTC (permalink / raw)
  To: Jean-Philippe Brucker
  Cc: tnowicki, mst, will.deacon, virtualization, jintack,
	eric.auger.pro, virtio-dev, jayachandran.nair, lorenzo.pieralisi,
	kvm, joro, kvmarm, marc.zyngier, eric.auger, iommu, kbuild-all,
	robin.murphy
In-Reply-To: <20180214145340.1223-2-jean-philippe.brucker@arm.com>

[-- Attachment #1: Type: text/plain, Size: 1715 bytes --]

Hi Jean-Philippe,

I love your patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.16-rc2 next-20180221]
[cannot apply to iommu/next]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Jean-Philippe-Brucker/Add-virtio-iommu-driver/20180217-075417
config: ia64-allmodconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=ia64 

All errors (new ones prefixed by >>):

   drivers/iommu/virtio-iommu.o: In function `viommu_send_reqs_sync':
>> virtio-iommu.c:(.text+0xa82): undefined reference to `virtqueue_add_sgs'
>> virtio-iommu.c:(.text+0xb52): undefined reference to `virtqueue_kick'
>> virtio-iommu.c:(.text+0xd82): undefined reference to `virtqueue_get_buf'
   drivers/iommu/virtio-iommu.o: In function `viommu_probe':
>> virtio-iommu.c:(.text+0x23f2): undefined reference to `virtio_check_driver_offered_feature'
   virtio-iommu.c:(.text+0x2572): undefined reference to `virtio_check_driver_offered_feature'
   virtio-iommu.c:(.text+0x26f2): undefined reference to `virtio_check_driver_offered_feature'
   drivers/iommu/virtio-iommu.o: In function `virtio_iommu_drv_init':
>> virtio-iommu.c:(.init.text+0x22): undefined reference to `register_virtio_driver'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 50250 bytes --]

[-- Attachment #3: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Alexander Duyck @ 2018-02-21 20:57 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Samudrala, Sridhar, virtualization, Siwei Liu,
	Netdev, David Miller
In-Reply-To: <20180221193832.GE1996@nanopsycho>

On Wed, Feb 21, 2018 at 11:38 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Wed, Feb 21, 2018 at 06:56:35PM CET, alexander.duyck@gmail.com wrote:
>>On Wed, Feb 21, 2018 at 8:58 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>> Wed, Feb 21, 2018 at 05:49:49PM CET, alexander.duyck@gmail.com wrote:
>>>>On Wed, Feb 21, 2018 at 8:11 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>>> Wed, Feb 21, 2018 at 04:56:48PM CET, alexander.duyck@gmail.com wrote:
>>>>>>On Wed, Feb 21, 2018 at 1:51 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>>>>> Tue, Feb 20, 2018 at 11:33:56PM CET, kubakici@wp.pl wrote:
>>>>>>>>On Tue, 20 Feb 2018 21:14:10 +0100, Jiri Pirko wrote:
>>>>>>>>> Yeah, I can see it now :( I guess that the ship has sailed and we are
>>>>>>>>> stuck with this ugly thing forever...
>>>>>>>>>
>>>>>>>>> Could you at least make some common code that is shared in between
>>>>>>>>> netvsc and virtio_net so this is handled in exacly the same way in both?
>>>>>>>>
>>>>>>>>IMHO netvsc is a vendor specific driver which made a mistake on what
>>>>>>>>behaviour it provides (or tried to align itself with Windows SR-IOV).
>>>>>>>>Let's not make a far, far more commonly deployed and important driver
>>>>>>>>(virtio) bug-compatible with netvsc.
>>>>>>>
>>>>>>> Yeah. netvsc solution is a dangerous precedent here and in my opinition
>>>>>>> it was a huge mistake to merge it. I personally would vote to unmerge it
>>>>>>> and make the solution based on team/bond.
>>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>To Jiri's initial comments, I feel the same way, in fact I've talked to
>>>>>>>>the NetworkManager guys to get auto-bonding based on MACs handled in
>>>>>>>>user space.  I think it may very well get done in next versions of NM,
>>>>>>>>but isn't done yet.  Stephen also raised the point that not everybody is
>>>>>>>>using NM.
>>>>>>>
>>>>>>> Can be done in NM, networkd or other network management tools.
>>>>>>> Even easier to do this in teamd and let them all benefit.
>>>>>>>
>>>>>>> Actually, I took a stab to implement this in teamd. Took me like an hour
>>>>>>> and half.
>>>>>>>
>>>>>>> You can just run teamd with config option "kidnap" like this:
>>>>>>> # teamd/teamd -c '{"kidnap": true }'
>>>>>>>
>>>>>>> Whenever teamd sees another netdev to appear with the same mac as his,
>>>>>>> or whenever teamd sees another netdev to change mac to his,
>>>>>>> it enslaves it.
>>>>>>>
>>>>>>> Here's the patch (quick and dirty):
>>>>>>>
>>>>>>> Subject: [patch teamd] teamd: introduce kidnap feature
>>>>>>>
>>>>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>>>>
>>>>>>So this doesn't really address the original problem we were trying to
>>>>>>solve. You asked earlier why the netdev name mattered and it mostly
>>>>>>has to do with configuration. Specifically what our patch is
>>>>>>attempting to resolve is the issue of how to allow a cloud provider to
>>>>>>upgrade their customer to SR-IOV support and live migration without
>>>>>>requiring them to reconfigure their guest. So the general idea with
>>>>>>our patch is to take a VM that is running with virtio_net only and
>>>>>>allow it to instead spawn a virtio_bypass master using the same netdev
>>>>>>name as the original virtio, and then have the virtio_net and VF come
>>>>>>up and be enslaved by the bypass interface. Doing it this way we can
>>>>>>allow for multi-vendor SR-IOV live migration support using a guest
>>>>>>that was originally configured for virtio only.
>>>>>>
>>>>>>The problem with your solution is we already have teaming and bonding
>>>>>>as you said. There is already a write-up from Red Hat on how to do it
>>>>>>(https://access.redhat.com/documentation/en-us/red_hat_virtualization/4.1/html/virtual_machine_management_guide/sect-migrating_virtual_machines_between_hosts).
>>>>>>That is all well and good as long as you are willing to keep around
>>>>>>two VM images, one for virtio, and one for SR-IOV with live migration.
>>>>>
>>>>> You don't need 2 images. You need only one. The one with the team setup.
>>>>> That's it. If another netdev with the same mac appears, teamd will
>>>>> enslave it and run traffic on it. If not, ok, you'll go only through
>>>>> virtio_net.
>>>>
>>>>Isn't that going to cause the routing table to get messed up when we
>>>>rearrange the netdevs? We don't want to have an significant disruption
>>>> in traffic when we are adding/removing the VF. It seems like we would
>>>>need to invalidate any entries that were configured for the virtio_net
>>>>and reestablish them on the new team interface. Part of the criteria
>>>>we have been working with is that we should be able to transition from
>>>>having a VF to not or vice versa without seeing any significant
>>>>disruption in the traffic.
>>>
>>> What? You have routes on the team netdev. virtio_net and VF are only
>>> slaves. What are you talking about? I don't get it :/
>>
>>So lets walk though this by example. The general idea of the base case
>>for all this is somebody starting with virtio_net, we will call the
>>interface "ens1" for now. It comes up and is assigned a dhcp address
>>and everything works as expected. Now in order to get better
>>performance we want to add a VF "ens2", but we don't want a new IP
>>address. Now if I understand correctly what will happen is that when
>>"ens2" appears on the system teamd will then create a new team
>>interface "team0". Before teamd can enslave ens1 it has to down the
>
> No, you don't understand that correctly.
>
> There is always ens1 and team0. ens1 is a slave of team0. team0 is the
> interface to use, to set ip on etc.
>
> When ens2 appears, it gets enslaved to team0 as well.
>
>
>>interface if I understand things correctly. This means that we have to
>>disrupt network traffic in order for this to work.
>>
>>To give you an idea of where we were before this became about trying
>>to do this in the team or bonding driver, we were debating a 2 netdev
>>model versus a 3 netdev model. I will call out the model and the
>>advantages/disadvantages of those below.
>>
>>2 Netdev model, "ens1", enslaves "ens2".
>>- Requires dropping in-driver XDP in order to work (won't capture VF
>>traffic otherwise)
>>- VF takes performance hit for extra qdisc/Tx queue lock of virtio_net interface
>>- If you ass-u-me (I haven't been a fan of this model if you can't
>>tell) that it is okay to rip out in-driver XDP from virtio_net, then
>>you could transition between base virtio, virtio w/ backup bit set.
>>- Works for netvsc because they limit their features (no in-driver
>>XDP) to guarantee this works.
>>
>>3 Netdev model, "ens1", enslaves "ens1nbackup" and "ens2"
>>- Exposes 2 netdevs "ens1" and "ens1nbackup" when only virtio is present
>>- No extra qdisc or locking
>>- All virtio_net original functionality still present
>>- Not able to transition from virtio to virtio w/ backup without
>>disruption (requires hot-plug)
>>
>>The way I see it the only way your team setup could work would be
>>something closer to the 3 netdev model. Basically we would be
>>requiring the user to always have the team0 present in order to make
>>certain that anything like XDP would be run on the team interface
>>instead of assuming that the virtio_net could run by itself. I will
>>add it as a third option here to compare to the other 2.
>
> Yes.
>
>
>>
>>3 Netdev "team" model, "team0", enslaves "ens1" and "ens2"
>>- Requires guest to configure teamd
>>- Exposes "team0" and "ens1" when only virtio is present
>>- No extra qdisc or locking
>>- Doesn't require "backup" bit in virtio
>>
>>>>
>>>>Also how does this handle any static configuration? I am assuming that
>>>>everything here assumes the team will be brought up as soon as it is
>>>>seen and assigned a DHCP address.
>>>
>>> Again. You configure whatever you need on the team netdev.
>>
>>Just so we are clear, are you then saying that the team0 interface
>>will always be present with this configuration? You had made it sound
>
> Of course.
>
>
>>like it would disappear if you didn't have at least 2 interfaces.
>
> Where did I make it sound like that? No.

I think it was a bit of misspeak/misread specifically I am thinking of:
  You don't need 2 images. You need only one. The one with the
  team setup. That's it. If another netdev with the same mac appears,
  teamd will enslave it and run traffic on it. If not, ok, you'll go only
  through virtio_net.

I read that as there being no team if the VF wasn't present since you
would still be going through team and then virtio_net otherwise.

>
>>
>>>>
>>>>The solution as you have proposed seems problematic at best. I don't
>>>>see how the team solution works without introducing some sort of
>>>>traffic disruption to either add/remove the VF and bring up/tear down
>>>>the team interface. At that point we might as well just give up on
>>>>this piece of live migration support entirely since the disruption was
>>>>what we were trying to avoid. We might as well just hotplug out the VF
>>>>and hotplug in a virtio at the same bus device and function number and
>>>>just let udev take care of renaming it for us. The idea was supposed
>>>>to be a seamless transition between the two interfaces.
>>>
>>> Alex. What you are trying to do in this patchset and what netvsc does it
>>> essentialy in-driver bonding. Same thing mechanism, rx_handler,
>>> everything. I don't really understand what are you talking about. With
>>> use of team you will get exactly the same behaviour.
>>
>>So the goal of the "in-driver bonding" is to make the bonding as
>>non-intrusive as possible and require as little user intervention as
>>possible. I agree that much of the handling is the same, however the
>>control structure and requirements are significantly different. That
>>has been what I have been trying to explain. You keep wanting to use
>>the existing structures, but they don't really apply cleanly because
>>they push control for the interface up into the guest, and that
>>doesn't make much sense in the case of virtualization. What is
>>happening here is that we are exposing a bond that the guest should
>>have no control over, or at least as little as possible. In addition
>>making the user have to add additional configuration in the guest
>>means that there is that much more that can go wrong if they screw it
>>up.
>>
>>The other problem here is that the transition needs to be as seamless
>>as possible between just a standard virtio_net setup and this new
>>setup. With either the team or bonding setup you end up essentially
>>forcing the guest to have the bond/team always there even if they are
>>running only a single interface. Only if they "upgrade" the VM by
>>adding a VF then it finally gets to do anything.
>
> Yeah. There is certainly a dilemma. We have to choose between
> 1) weird and hackish in-driver semi-bonding that would be simple
>    for user.
> 2) the standard way that would be perhaps slighly more complicated
>    for user.

The problem is for us option 2 is quite a bit uglier. Basically it
means essentially telling all the distros and such that their cloud
images have to use team by default on all virtio_net interfaces. It
pretty much means we have to throw away this as a possible solution
since you are requiring guest changes that most customers/OS vendors
would ever accept.

At least with our solution it was the driver making use of the
functionality if a given feature bit was set. The teaming solution as
proposed doesn't even give us that option.

>>
>>What this comes down to for us is the following requirements:
>>1. The name of the interface cannot change when going from virtio_net,
>>to virtio_net being bypassed using a VF. We cannot create an interface
>>on top of the interface, if anything we need to push the original
>>virtio_net out of the way so that the new team interface takes its
>>place in the configuration of the system. Otherwise a VM with VF w/
>>live migration will require a different configuration than one that
>>just runs virtio_net.
>
> Team driver netdev is still the same, no name changes.

Right. Basically we need to have the renaming occur so that any
existing config gets moved to the upper interface instead of having to
rely on configuration being adjusted for the team interface.

>>2. We need some way to signal if this VM should be running in an
>>"upgraded" mode or not. We have been using the backup bit in
>>virtio_net to do that. If it isn't "upgraded" then we don't need the
>>team/bond and we can just run with virtio_net.
>
> I don't see why the team cannot be there always.

It is more the logistical nightmare. Part of the goal here was to work
with the cloud base images that are out there such as
https://alt.fedoraproject.org/cloud/. With just the kernel changes the
overhead for this stays fairly small and would be pulled in as just a
standard part of the kernel update process. The virtio bypass only
pops up if the backup bit is present. With the team solution it
requires that the base image use the team driver on virtio_net when it
sees one. I doubt the OSVs would want to do that just because SR-IOV
isn't that popular of a case.

>>3. We cannot introduce any downtime on the interface when adding a VF
>>or removing it. The link must stay up the entire time and be able to
>>handle packets.
>
> Sure. That should be handled by the team. Whenever the VF netdev
> disappears, traffic would switch over to the virtio_net. The benefit of
> your in-driver bonding solution is that qemu can actually signal the
> guest driver that the disappearance would happen and do the switch a bit
> earlier. But that is something that might be implemented in a different
> channel where the kernel might get notification that certain pci is
> going to disappear so everyone could prepare. Just an idea.

The signaling isn't too much of an issue since we can just tweak the
link state of the VF or virtio manually to report the link up or down
prior to the hot-plug. Now that we are on the same page with the team0
interface always being there, I don't think 3 is much of a concern
with the solution as you proposed.

^ permalink raw reply

* Re: [PATCH 1/4] iommu: Add virtio-iommu driver
From: kbuild test robot @ 2018-02-21 20:12 UTC (permalink / raw)
  To: Jean-Philippe Brucker
  Cc: tnowicki, mst, will.deacon, virtualization, jintack,
	eric.auger.pro, virtio-dev, jayachandran.nair, lorenzo.pieralisi,
	kvm, joro, kvmarm, marc.zyngier, eric.auger, iommu, kbuild-all,
	robin.murphy
In-Reply-To: <20180214145340.1223-2-jean-philippe.brucker@arm.com>

[-- Attachment #1: Type: text/plain, Size: 2371 bytes --]

Hi Jean-Philippe,

I love your patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.16-rc2 next-20180221]
[cannot apply to iommu/next]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Jean-Philippe-Brucker/Add-virtio-iommu-driver/20180217-075417
config: arm64-allmodconfig (attached as .config)
compiler: aarch64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=arm64 

All errors (new ones prefixed by >>):

   aarch64-linux-gnu-ld: arch/arm64/kernel/head.o: relocation R_AARCH64_ABS32 against `_kernel_offset_le_lo32' can not be used when making a shared object
   arch/arm64/kernel/head.o: In function `kimage_vaddr':
   (.idmap.text+0x0): dangerous relocation: unsupported relocation
   arch/arm64/kernel/head.o: In function `__primary_switch':
   (.idmap.text+0x340): dangerous relocation: unsupported relocation
   (.idmap.text+0x348): dangerous relocation: unsupported relocation
   drivers/iommu/virtio-iommu.o: In function `viommu_probe':
   virtio-iommu.c:(.text+0xbdc): undefined reference to `virtio_check_driver_offered_feature'
   virtio-iommu.c:(.text+0xcfc): undefined reference to `virtio_check_driver_offered_feature'
   virtio-iommu.c:(.text+0xe10): undefined reference to `virtio_check_driver_offered_feature'
   drivers/iommu/virtio-iommu.o: In function `viommu_send_reqs_sync':
   virtio-iommu.c:(.text+0x130c): undefined reference to `virtqueue_add_sgs'
   virtio-iommu.c:(.text+0x1398): undefined reference to `virtqueue_kick'
   virtio-iommu.c:(.text+0x14d4): undefined reference to `virtqueue_get_buf'
   drivers/iommu/virtio-iommu.o: In function `virtio_iommu_drv_init':
   virtio-iommu.c:(.init.text+0x1c): undefined reference to `register_virtio_driver'
   drivers/iommu/virtio-iommu.o: In function `virtio_iommu_drv_exit':
>> virtio-iommu.c:(.exit.text+0x14): undefined reference to `unregister_virtio_driver'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 59126 bytes --]

[-- Attachment #3: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jiri Pirko @ 2018-02-21 19:38 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Samudrala, Sridhar, virtualization, Siwei Liu,
	Netdev, David Miller
In-Reply-To: <CAKgT0UeZS_SzNfiMX3ujy+a9WZWySVdyTpyQ=b=RJJrAG3JO=w@mail.gmail.com>

Wed, Feb 21, 2018 at 06:56:35PM CET, alexander.duyck@gmail.com wrote:
>On Wed, Feb 21, 2018 at 8:58 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> Wed, Feb 21, 2018 at 05:49:49PM CET, alexander.duyck@gmail.com wrote:
>>>On Wed, Feb 21, 2018 at 8:11 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>> Wed, Feb 21, 2018 at 04:56:48PM CET, alexander.duyck@gmail.com wrote:
>>>>>On Wed, Feb 21, 2018 at 1:51 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>>>> Tue, Feb 20, 2018 at 11:33:56PM CET, kubakici@wp.pl wrote:
>>>>>>>On Tue, 20 Feb 2018 21:14:10 +0100, Jiri Pirko wrote:
>>>>>>>> Yeah, I can see it now :( I guess that the ship has sailed and we are
>>>>>>>> stuck with this ugly thing forever...
>>>>>>>>
>>>>>>>> Could you at least make some common code that is shared in between
>>>>>>>> netvsc and virtio_net so this is handled in exacly the same way in both?
>>>>>>>
>>>>>>>IMHO netvsc is a vendor specific driver which made a mistake on what
>>>>>>>behaviour it provides (or tried to align itself with Windows SR-IOV).
>>>>>>>Let's not make a far, far more commonly deployed and important driver
>>>>>>>(virtio) bug-compatible with netvsc.
>>>>>>
>>>>>> Yeah. netvsc solution is a dangerous precedent here and in my opinition
>>>>>> it was a huge mistake to merge it. I personally would vote to unmerge it
>>>>>> and make the solution based on team/bond.
>>>>>>
>>>>>>
>>>>>>>
>>>>>>>To Jiri's initial comments, I feel the same way, in fact I've talked to
>>>>>>>the NetworkManager guys to get auto-bonding based on MACs handled in
>>>>>>>user space.  I think it may very well get done in next versions of NM,
>>>>>>>but isn't done yet.  Stephen also raised the point that not everybody is
>>>>>>>using NM.
>>>>>>
>>>>>> Can be done in NM, networkd or other network management tools.
>>>>>> Even easier to do this in teamd and let them all benefit.
>>>>>>
>>>>>> Actually, I took a stab to implement this in teamd. Took me like an hour
>>>>>> and half.
>>>>>>
>>>>>> You can just run teamd with config option "kidnap" like this:
>>>>>> # teamd/teamd -c '{"kidnap": true }'
>>>>>>
>>>>>> Whenever teamd sees another netdev to appear with the same mac as his,
>>>>>> or whenever teamd sees another netdev to change mac to his,
>>>>>> it enslaves it.
>>>>>>
>>>>>> Here's the patch (quick and dirty):
>>>>>>
>>>>>> Subject: [patch teamd] teamd: introduce kidnap feature
>>>>>>
>>>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>>>
>>>>>So this doesn't really address the original problem we were trying to
>>>>>solve. You asked earlier why the netdev name mattered and it mostly
>>>>>has to do with configuration. Specifically what our patch is
>>>>>attempting to resolve is the issue of how to allow a cloud provider to
>>>>>upgrade their customer to SR-IOV support and live migration without
>>>>>requiring them to reconfigure their guest. So the general idea with
>>>>>our patch is to take a VM that is running with virtio_net only and
>>>>>allow it to instead spawn a virtio_bypass master using the same netdev
>>>>>name as the original virtio, and then have the virtio_net and VF come
>>>>>up and be enslaved by the bypass interface. Doing it this way we can
>>>>>allow for multi-vendor SR-IOV live migration support using a guest
>>>>>that was originally configured for virtio only.
>>>>>
>>>>>The problem with your solution is we already have teaming and bonding
>>>>>as you said. There is already a write-up from Red Hat on how to do it
>>>>>(https://access.redhat.com/documentation/en-us/red_hat_virtualization/4.1/html/virtual_machine_management_guide/sect-migrating_virtual_machines_between_hosts).
>>>>>That is all well and good as long as you are willing to keep around
>>>>>two VM images, one for virtio, and one for SR-IOV with live migration.
>>>>
>>>> You don't need 2 images. You need only one. The one with the team setup.
>>>> That's it. If another netdev with the same mac appears, teamd will
>>>> enslave it and run traffic on it. If not, ok, you'll go only through
>>>> virtio_net.
>>>
>>>Isn't that going to cause the routing table to get messed up when we
>>>rearrange the netdevs? We don't want to have an significant disruption
>>> in traffic when we are adding/removing the VF. It seems like we would
>>>need to invalidate any entries that were configured for the virtio_net
>>>and reestablish them on the new team interface. Part of the criteria
>>>we have been working with is that we should be able to transition from
>>>having a VF to not or vice versa without seeing any significant
>>>disruption in the traffic.
>>
>> What? You have routes on the team netdev. virtio_net and VF are only
>> slaves. What are you talking about? I don't get it :/
>
>So lets walk though this by example. The general idea of the base case
>for all this is somebody starting with virtio_net, we will call the
>interface "ens1" for now. It comes up and is assigned a dhcp address
>and everything works as expected. Now in order to get better
>performance we want to add a VF "ens2", but we don't want a new IP
>address. Now if I understand correctly what will happen is that when
>"ens2" appears on the system teamd will then create a new team
>interface "team0". Before teamd can enslave ens1 it has to down the

No, you don't understand that correctly.

There is always ens1 and team0. ens1 is a slave of team0. team0 is the
interface to use, to set ip on etc.

When ens2 appears, it gets enslaved to team0 as well.


>interface if I understand things correctly. This means that we have to
>disrupt network traffic in order for this to work.
>
>To give you an idea of where we were before this became about trying
>to do this in the team or bonding driver, we were debating a 2 netdev
>model versus a 3 netdev model. I will call out the model and the
>advantages/disadvantages of those below.
>
>2 Netdev model, "ens1", enslaves "ens2".
>- Requires dropping in-driver XDP in order to work (won't capture VF
>traffic otherwise)
>- VF takes performance hit for extra qdisc/Tx queue lock of virtio_net interface
>- If you ass-u-me (I haven't been a fan of this model if you can't
>tell) that it is okay to rip out in-driver XDP from virtio_net, then
>you could transition between base virtio, virtio w/ backup bit set.
>- Works for netvsc because they limit their features (no in-driver
>XDP) to guarantee this works.
>
>3 Netdev model, "ens1", enslaves "ens1nbackup" and "ens2"
>- Exposes 2 netdevs "ens1" and "ens1nbackup" when only virtio is present
>- No extra qdisc or locking
>- All virtio_net original functionality still present
>- Not able to transition from virtio to virtio w/ backup without
>disruption (requires hot-plug)
>
>The way I see it the only way your team setup could work would be
>something closer to the 3 netdev model. Basically we would be
>requiring the user to always have the team0 present in order to make
>certain that anything like XDP would be run on the team interface
>instead of assuming that the virtio_net could run by itself. I will
>add it as a third option here to compare to the other 2.

Yes.


>
>3 Netdev "team" model, "team0", enslaves "ens1" and "ens2"
>- Requires guest to configure teamd
>- Exposes "team0" and "ens1" when only virtio is present
>- No extra qdisc or locking
>- Doesn't require "backup" bit in virtio
>
>>>
>>>Also how does this handle any static configuration? I am assuming that
>>>everything here assumes the team will be brought up as soon as it is
>>>seen and assigned a DHCP address.
>>
>> Again. You configure whatever you need on the team netdev.
>
>Just so we are clear, are you then saying that the team0 interface
>will always be present with this configuration? You had made it sound

Of course.


>like it would disappear if you didn't have at least 2 interfaces.

Where did I make it sound like that? No.


>
>>>
>>>The solution as you have proposed seems problematic at best. I don't
>>>see how the team solution works without introducing some sort of
>>>traffic disruption to either add/remove the VF and bring up/tear down
>>>the team interface. At that point we might as well just give up on
>>>this piece of live migration support entirely since the disruption was
>>>what we were trying to avoid. We might as well just hotplug out the VF
>>>and hotplug in a virtio at the same bus device and function number and
>>>just let udev take care of renaming it for us. The idea was supposed
>>>to be a seamless transition between the two interfaces.
>>
>> Alex. What you are trying to do in this patchset and what netvsc does it
>> essentialy in-driver bonding. Same thing mechanism, rx_handler,
>> everything. I don't really understand what are you talking about. With
>> use of team you will get exactly the same behaviour.
>
>So the goal of the "in-driver bonding" is to make the bonding as
>non-intrusive as possible and require as little user intervention as
>possible. I agree that much of the handling is the same, however the
>control structure and requirements are significantly different. That
>has been what I have been trying to explain. You keep wanting to use
>the existing structures, but they don't really apply cleanly because
>they push control for the interface up into the guest, and that
>doesn't make much sense in the case of virtualization. What is
>happening here is that we are exposing a bond that the guest should
>have no control over, or at least as little as possible. In addition
>making the user have to add additional configuration in the guest
>means that there is that much more that can go wrong if they screw it
>up.
>
>The other problem here is that the transition needs to be as seamless
>as possible between just a standard virtio_net setup and this new
>setup. With either the team or bonding setup you end up essentially
>forcing the guest to have the bond/team always there even if they are
>running only a single interface. Only if they "upgrade" the VM by
>adding a VF then it finally gets to do anything.

Yeah. There is certainly a dilemma. We have to choose between
1) weird and hackish in-driver semi-bonding that would be simple
   for user.
2) the standard way that would be perhaps slighly more complicated
   for user.

>
>What this comes down to for us is the following requirements:
>1. The name of the interface cannot change when going from virtio_net,
>to virtio_net being bypassed using a VF. We cannot create an interface
>on top of the interface, if anything we need to push the original
>virtio_net out of the way so that the new team interface takes its
>place in the configuration of the system. Otherwise a VM with VF w/
>live migration will require a different configuration than one that
>just runs virtio_net.

Team driver netdev is still the same, no name changes.


>2. We need some way to signal if this VM should be running in an
>"upgraded" mode or not. We have been using the backup bit in
>virtio_net to do that. If it isn't "upgraded" then we don't need the
>team/bond and we can just run with virtio_net.

I don't see why the team cannot be there always.


>3. We cannot introduce any downtime on the interface when adding a VF
>or removing it. The link must stay up the entire time and be able to
>handle packets.

Sure. That should be handled by the team. Whenever the VF netdev
disappears, traffic would switch over to the virtio_net. The benefit of
your in-driver bonding solution is that qemu can actually signal the
guest driver that the disappearance would happen and do the switch a bit
earlier. But that is something that might be implemented in a different
channel where the kernel might get notification that certain pci is
going to disappear so everyone could prepare. Just an idea.

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Alexander Duyck @ 2018-02-21 17:56 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Samudrala, Sridhar, virtualization, Siwei Liu,
	Netdev, David Miller
In-Reply-To: <20180221165848.GD1996@nanopsycho>

On Wed, Feb 21, 2018 at 8:58 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Wed, Feb 21, 2018 at 05:49:49PM CET, alexander.duyck@gmail.com wrote:
>>On Wed, Feb 21, 2018 at 8:11 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>> Wed, Feb 21, 2018 at 04:56:48PM CET, alexander.duyck@gmail.com wrote:
>>>>On Wed, Feb 21, 2018 at 1:51 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>>> Tue, Feb 20, 2018 at 11:33:56PM CET, kubakici@wp.pl wrote:
>>>>>>On Tue, 20 Feb 2018 21:14:10 +0100, Jiri Pirko wrote:
>>>>>>> Yeah, I can see it now :( I guess that the ship has sailed and we are
>>>>>>> stuck with this ugly thing forever...
>>>>>>>
>>>>>>> Could you at least make some common code that is shared in between
>>>>>>> netvsc and virtio_net so this is handled in exacly the same way in both?
>>>>>>
>>>>>>IMHO netvsc is a vendor specific driver which made a mistake on what
>>>>>>behaviour it provides (or tried to align itself with Windows SR-IOV).
>>>>>>Let's not make a far, far more commonly deployed and important driver
>>>>>>(virtio) bug-compatible with netvsc.
>>>>>
>>>>> Yeah. netvsc solution is a dangerous precedent here and in my opinition
>>>>> it was a huge mistake to merge it. I personally would vote to unmerge it
>>>>> and make the solution based on team/bond.
>>>>>
>>>>>
>>>>>>
>>>>>>To Jiri's initial comments, I feel the same way, in fact I've talked to
>>>>>>the NetworkManager guys to get auto-bonding based on MACs handled in
>>>>>>user space.  I think it may very well get done in next versions of NM,
>>>>>>but isn't done yet.  Stephen also raised the point that not everybody is
>>>>>>using NM.
>>>>>
>>>>> Can be done in NM, networkd or other network management tools.
>>>>> Even easier to do this in teamd and let them all benefit.
>>>>>
>>>>> Actually, I took a stab to implement this in teamd. Took me like an hour
>>>>> and half.
>>>>>
>>>>> You can just run teamd with config option "kidnap" like this:
>>>>> # teamd/teamd -c '{"kidnap": true }'
>>>>>
>>>>> Whenever teamd sees another netdev to appear with the same mac as his,
>>>>> or whenever teamd sees another netdev to change mac to his,
>>>>> it enslaves it.
>>>>>
>>>>> Here's the patch (quick and dirty):
>>>>>
>>>>> Subject: [patch teamd] teamd: introduce kidnap feature
>>>>>
>>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>>
>>>>So this doesn't really address the original problem we were trying to
>>>>solve. You asked earlier why the netdev name mattered and it mostly
>>>>has to do with configuration. Specifically what our patch is
>>>>attempting to resolve is the issue of how to allow a cloud provider to
>>>>upgrade their customer to SR-IOV support and live migration without
>>>>requiring them to reconfigure their guest. So the general idea with
>>>>our patch is to take a VM that is running with virtio_net only and
>>>>allow it to instead spawn a virtio_bypass master using the same netdev
>>>>name as the original virtio, and then have the virtio_net and VF come
>>>>up and be enslaved by the bypass interface. Doing it this way we can
>>>>allow for multi-vendor SR-IOV live migration support using a guest
>>>>that was originally configured for virtio only.
>>>>
>>>>The problem with your solution is we already have teaming and bonding
>>>>as you said. There is already a write-up from Red Hat on how to do it
>>>>(https://access.redhat.com/documentation/en-us/red_hat_virtualization/4.1/html/virtual_machine_management_guide/sect-migrating_virtual_machines_between_hosts).
>>>>That is all well and good as long as you are willing to keep around
>>>>two VM images, one for virtio, and one for SR-IOV with live migration.
>>>
>>> You don't need 2 images. You need only one. The one with the team setup.
>>> That's it. If another netdev with the same mac appears, teamd will
>>> enslave it and run traffic on it. If not, ok, you'll go only through
>>> virtio_net.
>>
>>Isn't that going to cause the routing table to get messed up when we
>>rearrange the netdevs? We don't want to have an significant disruption
>> in traffic when we are adding/removing the VF. It seems like we would
>>need to invalidate any entries that were configured for the virtio_net
>>and reestablish them on the new team interface. Part of the criteria
>>we have been working with is that we should be able to transition from
>>having a VF to not or vice versa without seeing any significant
>>disruption in the traffic.
>
> What? You have routes on the team netdev. virtio_net and VF are only
> slaves. What are you talking about? I don't get it :/

So lets walk though this by example. The general idea of the base case
for all this is somebody starting with virtio_net, we will call the
interface "ens1" for now. It comes up and is assigned a dhcp address
and everything works as expected. Now in order to get better
performance we want to add a VF "ens2", but we don't want a new IP
address. Now if I understand correctly what will happen is that when
"ens2" appears on the system teamd will then create a new team
interface "team0". Before teamd can enslave ens1 it has to down the
interface if I understand things correctly. This means that we have to
disrupt network traffic in order for this to work.

To give you an idea of where we were before this became about trying
to do this in the team or bonding driver, we were debating a 2 netdev
model versus a 3 netdev model. I will call out the model and the
advantages/disadvantages of those below.

2 Netdev model, "ens1", enslaves "ens2".
- Requires dropping in-driver XDP in order to work (won't capture VF
traffic otherwise)
- VF takes performance hit for extra qdisc/Tx queue lock of virtio_net interface
- If you ass-u-me (I haven't been a fan of this model if you can't
tell) that it is okay to rip out in-driver XDP from virtio_net, then
you could transition between base virtio, virtio w/ backup bit set.
- Works for netvsc because they limit their features (no in-driver
XDP) to guarantee this works.

3 Netdev model, "ens1", enslaves "ens1nbackup" and "ens2"
- Exposes 2 netdevs "ens1" and "ens1nbackup" when only virtio is present
- No extra qdisc or locking
- All virtio_net original functionality still present
- Not able to transition from virtio to virtio w/ backup without
disruption (requires hot-plug)

The way I see it the only way your team setup could work would be
something closer to the 3 netdev model. Basically we would be
requiring the user to always have the team0 present in order to make
certain that anything like XDP would be run on the team interface
instead of assuming that the virtio_net could run by itself. I will
add it as a third option here to compare to the other 2.

3 Netdev "team" model, "team0", enslaves "ens1" and "ens2"
- Requires guest to configure teamd
- Exposes "team0" and "ens1" when only virtio is present
- No extra qdisc or locking
- Doesn't require "backup" bit in virtio

>>
>>Also how does this handle any static configuration? I am assuming that
>>everything here assumes the team will be brought up as soon as it is
>>seen and assigned a DHCP address.
>
> Again. You configure whatever you need on the team netdev.

Just so we are clear, are you then saying that the team0 interface
will always be present with this configuration? You had made it sound
like it would disappear if you didn't have at least 2 interfaces.

>>
>>The solution as you have proposed seems problematic at best. I don't
>>see how the team solution works without introducing some sort of
>>traffic disruption to either add/remove the VF and bring up/tear down
>>the team interface. At that point we might as well just give up on
>>this piece of live migration support entirely since the disruption was
>>what we were trying to avoid. We might as well just hotplug out the VF
>>and hotplug in a virtio at the same bus device and function number and
>>just let udev take care of renaming it for us. The idea was supposed
>>to be a seamless transition between the two interfaces.
>
> Alex. What you are trying to do in this patchset and what netvsc does it
> essentialy in-driver bonding. Same thing mechanism, rx_handler,
> everything. I don't really understand what are you talking about. With
> use of team you will get exactly the same behaviour.

So the goal of the "in-driver bonding" is to make the bonding as
non-intrusive as possible and require as little user intervention as
possible. I agree that much of the handling is the same, however the
control structure and requirements are significantly different. That
has been what I have been trying to explain. You keep wanting to use
the existing structures, but they don't really apply cleanly because
they push control for the interface up into the guest, and that
doesn't make much sense in the case of virtualization. What is
happening here is that we are exposing a bond that the guest should
have no control over, or at least as little as possible. In addition
making the user have to add additional configuration in the guest
means that there is that much more that can go wrong if they screw it
up.

The other problem here is that the transition needs to be as seamless
as possible between just a standard virtio_net setup and this new
setup. With either the team or bonding setup you end up essentially
forcing the guest to have the bond/team always there even if they are
running only a single interface. Only if they "upgrade" the VM by
adding a VF then it finally gets to do anything.

What this comes down to for us is the following requirements:
1. The name of the interface cannot change when going from virtio_net,
to virtio_net being bypassed using a VF. We cannot create an interface
on top of the interface, if anything we need to push the original
virtio_net out of the way so that the new team interface takes its
place in the configuration of the system. Otherwise a VM with VF w/
live migration will require a different configuration than one that
just runs virtio_net.
2. We need some way to signal if this VM should be running in an
"upgraded" mode or not. We have been using the backup bit in
virtio_net to do that. If it isn't "upgraded" then we don't need the
team/bond and we can just run with virtio_net.
3. We cannot introduce any downtime on the interface when adding a VF
or removing it. The link must stay up the entire time and be able to
handle packets.

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jiri Pirko @ 2018-02-21 16:58 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Samudrala, Sridhar, virtualization, Siwei Liu,
	Netdev, David Miller
In-Reply-To: <CAKgT0Ud+6eiY7ycyju1SToeatH1R7KBDcUOqjn9Pu2k5uOr5Wg@mail.gmail.com>

Wed, Feb 21, 2018 at 05:49:49PM CET, alexander.duyck@gmail.com wrote:
>On Wed, Feb 21, 2018 at 8:11 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> Wed, Feb 21, 2018 at 04:56:48PM CET, alexander.duyck@gmail.com wrote:
>>>On Wed, Feb 21, 2018 at 1:51 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>> Tue, Feb 20, 2018 at 11:33:56PM CET, kubakici@wp.pl wrote:
>>>>>On Tue, 20 Feb 2018 21:14:10 +0100, Jiri Pirko wrote:
>>>>>> Yeah, I can see it now :( I guess that the ship has sailed and we are
>>>>>> stuck with this ugly thing forever...
>>>>>>
>>>>>> Could you at least make some common code that is shared in between
>>>>>> netvsc and virtio_net so this is handled in exacly the same way in both?
>>>>>
>>>>>IMHO netvsc is a vendor specific driver which made a mistake on what
>>>>>behaviour it provides (or tried to align itself with Windows SR-IOV).
>>>>>Let's not make a far, far more commonly deployed and important driver
>>>>>(virtio) bug-compatible with netvsc.
>>>>
>>>> Yeah. netvsc solution is a dangerous precedent here and in my opinition
>>>> it was a huge mistake to merge it. I personally would vote to unmerge it
>>>> and make the solution based on team/bond.
>>>>
>>>>
>>>>>
>>>>>To Jiri's initial comments, I feel the same way, in fact I've talked to
>>>>>the NetworkManager guys to get auto-bonding based on MACs handled in
>>>>>user space.  I think it may very well get done in next versions of NM,
>>>>>but isn't done yet.  Stephen also raised the point that not everybody is
>>>>>using NM.
>>>>
>>>> Can be done in NM, networkd or other network management tools.
>>>> Even easier to do this in teamd and let them all benefit.
>>>>
>>>> Actually, I took a stab to implement this in teamd. Took me like an hour
>>>> and half.
>>>>
>>>> You can just run teamd with config option "kidnap" like this:
>>>> # teamd/teamd -c '{"kidnap": true }'
>>>>
>>>> Whenever teamd sees another netdev to appear with the same mac as his,
>>>> or whenever teamd sees another netdev to change mac to his,
>>>> it enslaves it.
>>>>
>>>> Here's the patch (quick and dirty):
>>>>
>>>> Subject: [patch teamd] teamd: introduce kidnap feature
>>>>
>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>
>>>So this doesn't really address the original problem we were trying to
>>>solve. You asked earlier why the netdev name mattered and it mostly
>>>has to do with configuration. Specifically what our patch is
>>>attempting to resolve is the issue of how to allow a cloud provider to
>>>upgrade their customer to SR-IOV support and live migration without
>>>requiring them to reconfigure their guest. So the general idea with
>>>our patch is to take a VM that is running with virtio_net only and
>>>allow it to instead spawn a virtio_bypass master using the same netdev
>>>name as the original virtio, and then have the virtio_net and VF come
>>>up and be enslaved by the bypass interface. Doing it this way we can
>>>allow for multi-vendor SR-IOV live migration support using a guest
>>>that was originally configured for virtio only.
>>>
>>>The problem with your solution is we already have teaming and bonding
>>>as you said. There is already a write-up from Red Hat on how to do it
>>>(https://access.redhat.com/documentation/en-us/red_hat_virtualization/4.1/html/virtual_machine_management_guide/sect-migrating_virtual_machines_between_hosts).
>>>That is all well and good as long as you are willing to keep around
>>>two VM images, one for virtio, and one for SR-IOV with live migration.
>>
>> You don't need 2 images. You need only one. The one with the team setup.
>> That's it. If another netdev with the same mac appears, teamd will
>> enslave it and run traffic on it. If not, ok, you'll go only through
>> virtio_net.
>
>Isn't that going to cause the routing table to get messed up when we
>rearrange the netdevs? We don't want to have an significant disruption
> in traffic when we are adding/removing the VF. It seems like we would
>need to invalidate any entries that were configured for the virtio_net
>and reestablish them on the new team interface. Part of the criteria
>we have been working with is that we should be able to transition from
>having a VF to not or vice versa without seeing any significant
>disruption in the traffic.

What? You have routes on the team netdev. virtio_net and VF are only
slaves. What are you talking about? I don't get it :/


>
>Also how does this handle any static configuration? I am assuming that
>everything here assumes the team will be brought up as soon as it is
>seen and assigned a DHCP address.

Again. You configure whatever you need on the team netdev.


>
>The solution as you have proposed seems problematic at best. I don't
>see how the team solution works without introducing some sort of
>traffic disruption to either add/remove the VF and bring up/tear down
>the team interface. At that point we might as well just give up on
>this piece of live migration support entirely since the disruption was
>what we were trying to avoid. We might as well just hotplug out the VF
>and hotplug in a virtio at the same bus device and function number and
>just let udev take care of renaming it for us. The idea was supposed
>to be a seamless transition between the two interfaces.

Alex. What you are trying to do in this patchset and what netvsc does it
essentialy in-driver bonding. Same thing mechanism, rx_handler,
everything. I don't really understand what are you talking about. With
use of team you will get exactly the same behaviour.

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Alexander Duyck @ 2018-02-21 16:49 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Samudrala, Sridhar, virtualization, Siwei Liu,
	Netdev, David Miller
In-Reply-To: <20180221161105.GC1996@nanopsycho>

On Wed, Feb 21, 2018 at 8:11 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Wed, Feb 21, 2018 at 04:56:48PM CET, alexander.duyck@gmail.com wrote:
>>On Wed, Feb 21, 2018 at 1:51 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>> Tue, Feb 20, 2018 at 11:33:56PM CET, kubakici@wp.pl wrote:
>>>>On Tue, 20 Feb 2018 21:14:10 +0100, Jiri Pirko wrote:
>>>>> Yeah, I can see it now :( I guess that the ship has sailed and we are
>>>>> stuck with this ugly thing forever...
>>>>>
>>>>> Could you at least make some common code that is shared in between
>>>>> netvsc and virtio_net so this is handled in exacly the same way in both?
>>>>
>>>>IMHO netvsc is a vendor specific driver which made a mistake on what
>>>>behaviour it provides (or tried to align itself with Windows SR-IOV).
>>>>Let's not make a far, far more commonly deployed and important driver
>>>>(virtio) bug-compatible with netvsc.
>>>
>>> Yeah. netvsc solution is a dangerous precedent here and in my opinition
>>> it was a huge mistake to merge it. I personally would vote to unmerge it
>>> and make the solution based on team/bond.
>>>
>>>
>>>>
>>>>To Jiri's initial comments, I feel the same way, in fact I've talked to
>>>>the NetworkManager guys to get auto-bonding based on MACs handled in
>>>>user space.  I think it may very well get done in next versions of NM,
>>>>but isn't done yet.  Stephen also raised the point that not everybody is
>>>>using NM.
>>>
>>> Can be done in NM, networkd or other network management tools.
>>> Even easier to do this in teamd and let them all benefit.
>>>
>>> Actually, I took a stab to implement this in teamd. Took me like an hour
>>> and half.
>>>
>>> You can just run teamd with config option "kidnap" like this:
>>> # teamd/teamd -c '{"kidnap": true }'
>>>
>>> Whenever teamd sees another netdev to appear with the same mac as his,
>>> or whenever teamd sees another netdev to change mac to his,
>>> it enslaves it.
>>>
>>> Here's the patch (quick and dirty):
>>>
>>> Subject: [patch teamd] teamd: introduce kidnap feature
>>>
>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>
>>So this doesn't really address the original problem we were trying to
>>solve. You asked earlier why the netdev name mattered and it mostly
>>has to do with configuration. Specifically what our patch is
>>attempting to resolve is the issue of how to allow a cloud provider to
>>upgrade their customer to SR-IOV support and live migration without
>>requiring them to reconfigure their guest. So the general idea with
>>our patch is to take a VM that is running with virtio_net only and
>>allow it to instead spawn a virtio_bypass master using the same netdev
>>name as the original virtio, and then have the virtio_net and VF come
>>up and be enslaved by the bypass interface. Doing it this way we can
>>allow for multi-vendor SR-IOV live migration support using a guest
>>that was originally configured for virtio only.
>>
>>The problem with your solution is we already have teaming and bonding
>>as you said. There is already a write-up from Red Hat on how to do it
>>(https://access.redhat.com/documentation/en-us/red_hat_virtualization/4.1/html/virtual_machine_management_guide/sect-migrating_virtual_machines_between_hosts).
>>That is all well and good as long as you are willing to keep around
>>two VM images, one for virtio, and one for SR-IOV with live migration.
>
> You don't need 2 images. You need only one. The one with the team setup.
> That's it. If another netdev with the same mac appears, teamd will
> enslave it and run traffic on it. If not, ok, you'll go only through
> virtio_net.

Isn't that going to cause the routing table to get messed up when we
rearrange the netdevs? We don't want to have an significant disruption
 in traffic when we are adding/removing the VF. It seems like we would
need to invalidate any entries that were configured for the virtio_net
and reestablish them on the new team interface. Part of the criteria
we have been working with is that we should be able to transition from
having a VF to not or vice versa without seeing any significant
disruption in the traffic.

Also how does this handle any static configuration? I am assuming that
everything here assumes the team will be brought up as soon as it is
seen and assigned a DHCP address.

The solution as you have proposed seems problematic at best. I don't
see how the team solution works without introducing some sort of
traffic disruption to either add/remove the VF and bring up/tear down
the team interface. At that point we might as well just give up on
this piece of live migration support entirely since the disruption was
what we were trying to avoid. We might as well just hotplug out the VF
and hotplug in a virtio at the same bus device and function number and
just let udev take care of renaming it for us. The idea was supposed
to be a seamless transition between the two interfaces.

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jiri Pirko @ 2018-02-21 16:11 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Samudrala, Sridhar, virtualization, Siwei Liu,
	Netdev, David Miller
In-Reply-To: <CAKgT0UdyhrVr=pYZb=AJq9sWWUVb_BadbJTcqY1AwHHTw8cmQw@mail.gmail.com>

Wed, Feb 21, 2018 at 04:56:48PM CET, alexander.duyck@gmail.com wrote:
>On Wed, Feb 21, 2018 at 1:51 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> Tue, Feb 20, 2018 at 11:33:56PM CET, kubakici@wp.pl wrote:
>>>On Tue, 20 Feb 2018 21:14:10 +0100, Jiri Pirko wrote:
>>>> Yeah, I can see it now :( I guess that the ship has sailed and we are
>>>> stuck with this ugly thing forever...
>>>>
>>>> Could you at least make some common code that is shared in between
>>>> netvsc and virtio_net so this is handled in exacly the same way in both?
>>>
>>>IMHO netvsc is a vendor specific driver which made a mistake on what
>>>behaviour it provides (or tried to align itself with Windows SR-IOV).
>>>Let's not make a far, far more commonly deployed and important driver
>>>(virtio) bug-compatible with netvsc.
>>
>> Yeah. netvsc solution is a dangerous precedent here and in my opinition
>> it was a huge mistake to merge it. I personally would vote to unmerge it
>> and make the solution based on team/bond.
>>
>>
>>>
>>>To Jiri's initial comments, I feel the same way, in fact I've talked to
>>>the NetworkManager guys to get auto-bonding based on MACs handled in
>>>user space.  I think it may very well get done in next versions of NM,
>>>but isn't done yet.  Stephen also raised the point that not everybody is
>>>using NM.
>>
>> Can be done in NM, networkd or other network management tools.
>> Even easier to do this in teamd and let them all benefit.
>>
>> Actually, I took a stab to implement this in teamd. Took me like an hour
>> and half.
>>
>> You can just run teamd with config option "kidnap" like this:
>> # teamd/teamd -c '{"kidnap": true }'
>>
>> Whenever teamd sees another netdev to appear with the same mac as his,
>> or whenever teamd sees another netdev to change mac to his,
>> it enslaves it.
>>
>> Here's the patch (quick and dirty):
>>
>> Subject: [patch teamd] teamd: introduce kidnap feature
>>
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>
>So this doesn't really address the original problem we were trying to
>solve. You asked earlier why the netdev name mattered and it mostly
>has to do with configuration. Specifically what our patch is
>attempting to resolve is the issue of how to allow a cloud provider to
>upgrade their customer to SR-IOV support and live migration without
>requiring them to reconfigure their guest. So the general idea with
>our patch is to take a VM that is running with virtio_net only and
>allow it to instead spawn a virtio_bypass master using the same netdev
>name as the original virtio, and then have the virtio_net and VF come
>up and be enslaved by the bypass interface. Doing it this way we can
>allow for multi-vendor SR-IOV live migration support using a guest
>that was originally configured for virtio only.
>
>The problem with your solution is we already have teaming and bonding
>as you said. There is already a write-up from Red Hat on how to do it
>(https://access.redhat.com/documentation/en-us/red_hat_virtualization/4.1/html/virtual_machine_management_guide/sect-migrating_virtual_machines_between_hosts).
>That is all well and good as long as you are willing to keep around
>two VM images, one for virtio, and one for SR-IOV with live migration.

You don't need 2 images. You need only one. The one with the team setup.
That's it. If another netdev with the same mac appears, teamd will
enslave it and run traffic on it. If not, ok, you'll go only through
virtio_net.


>The problem is nobody wants to do that. What they want is to maintain
>one guest image and if they decide to upgrade to SR-IOV they still
>want their live migration and they don't want to have to reconfigure
>the guest.
>
>That said it does seem to make the existing Red Hat solution easier to
>manage since you wouldn't be guessing at ifname so I have provided
>some feedback below.
>
>> ---
>>  include/team.h             |  7 +++++++
>>  libteam/ifinfo.c           | 20 ++++++++++++++++++++
>>  teamd/teamd.c              | 17 +++++++++++++++++
>>  teamd/teamd.h              |  5 +++++
>>  teamd/teamd_events.c       | 17 +++++++++++++++++
>>  teamd/teamd_ifinfo_watch.c |  9 +++++++++
>>  teamd/teamd_per_port.c     |  7 ++++++-
>>  7 files changed, 81 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/team.h b/include/team.h
>> index 9ae517d..b0c19c8 100644
>> --- a/include/team.h
>> +++ b/include/team.h
>> @@ -137,6 +137,13 @@ struct team_ifinfo *team_get_next_ifinfo(struct team_handle *th,
>>  #define team_for_each_ifinfo(ifinfo, th)                       \
>>         for (ifinfo = team_get_next_ifinfo(th, NULL); ifinfo;   \
>>              ifinfo = team_get_next_ifinfo(th, ifinfo))
>> +
>> +struct team_ifinfo *team_get_next_unlinked_ifinfo(struct team_handle *th,
>> +                                                 struct team_ifinfo *ifinfo);
>> +#define team_for_each_unlinked_ifinfo(ifinfo, th)                      \
>> +       for (ifinfo = team_get_next_unlinked_ifinfo(th, NULL); ifinfo;  \
>> +            ifinfo = team_get_next_unlinked_ifinfo(th, ifinfo))
>> +
>>  /* ifinfo getters */
>>  bool team_is_ifinfo_removed(struct team_ifinfo *ifinfo);
>>  uint32_t team_get_ifinfo_ifindex(struct team_ifinfo *ifinfo);
>> diff --git a/libteam/ifinfo.c b/libteam/ifinfo.c
>> index 5c32a9c..8f9548e 100644
>> --- a/libteam/ifinfo.c
>> +++ b/libteam/ifinfo.c
>> @@ -494,6 +494,26 @@ struct team_ifinfo *team_get_next_ifinfo(struct team_handle *th,
>>         return NULL;
>>  }
>>
>> +/**
>> + * @param th           libteam library context
>> + * @param ifinfo       ifinfo structure
>> + *
>> + * @details Get next unlinked ifinfo in list.
>> + *
>> + * @return Ifinfo next to ifinfo passed.
>> + **/
>> +TEAM_EXPORT
>> +struct team_ifinfo *team_get_next_unlinked_ifinfo(struct team_handle *th,
>> +                                                 struct team_ifinfo *ifinfo)
>> +{
>> +       do {
>> +               ifinfo = list_get_next_node_entry(&th->ifinfo_list, ifinfo, list);
>> +               if (ifinfo && !ifinfo->linked)
>> +                       return ifinfo;
>> +       } while (ifinfo);
>> +       return NULL;
>> +}
>> +
>>  /**
>>   * @param ifinfo       ifinfo structure
>>   *
>> diff --git a/teamd/teamd.c b/teamd/teamd.c
>> index aac2511..069c7f0 100644
>> --- a/teamd/teamd.c
>> +++ b/teamd/teamd.c
>> @@ -926,8 +926,25 @@ static int teamd_event_watch_port_added(struct teamd_context *ctx,
>>         return 0;
>>  }
>>
>> +static int teamd_event_watch_unlinked_hwaddr_changed(struct teamd_context *ctx,
>> +                                                    struct team_ifinfo *ifinfo,
>> +                                                    void *priv)
>> +{
>> +       int err;
>> +       bool kidnap;
>> +
>> +       err = teamd_config_bool_get(ctx, &kidnap, "$.kidnap");
>> +       if (err || !kidnap ||
>> +           ctx->hwaddr_len != team_get_ifinfo_hwaddr_len(ifinfo) ||
>> +           memcmp(team_get_ifinfo_hwaddr(ifinfo),
>> +                  ctx->hwaddr, ctx->hwaddr_len))
>> +               return 0;
>> +       return teamd_port_add(ctx, team_get_ifinfo_ifindex(ifinfo));
>> +}
>> +
>
>So I am not sure about the name of this function. It seems to imply
>that we want to capture a device if it changed its MAC address to
>match the one we are using. I suppose that works if we are making this
>a genreric thing that can run on any netdev, but our focus is virtio
>and VFs. In the grand scheme of things they shouldn't be able to
>change their MAC address in most environments that we will care about.
>That was one of the reasons why we didn't bother supporting a MAC
>change in our code since the hypervisor should have this locked and
>attempting to use a different MAC address would likely trigger the VM
>as being flagged as malicious.

This cb is called whenever mac changes, but also when netdev appears
(hwaddr changes from 00:00:00:00:00:00 and hwaddr_len from 0)


>
>>  static const struct teamd_event_watch_ops teamd_port_watch_ops = {
>>         .port_added = teamd_event_watch_port_added,
>> +       .unlinked_hwaddr_changed = teamd_event_watch_unlinked_hwaddr_changed,
>>  };
>>
>>  static int teamd_port_watch_init(struct teamd_context *ctx)
>> diff --git a/teamd/teamd.h b/teamd/teamd.h
>> index 5dbfb9b..171a8d1 100644
>> --- a/teamd/teamd.h
>> +++ b/teamd/teamd.h
>> @@ -189,6 +189,8 @@ struct teamd_event_watch_ops {
>>                                    struct teamd_port *tdport, void *priv);
>>         int (*port_ifname_changed)(struct teamd_context *ctx,
>>                                    struct teamd_port *tdport, void *priv);
>> +       int (*unlinked_hwaddr_changed)(struct teamd_context *ctx,
>> +                                      struct team_ifinfo *ifinfo, void *priv);
>>         int (*option_changed)(struct teamd_context *ctx,
>>                               struct team_option *option, void *priv);
>>         char *option_changed_match_name;
>> @@ -210,6 +212,8 @@ int teamd_event_ifinfo_ifname_changed(struct teamd_context *ctx,
>>                                       struct team_ifinfo *ifinfo);
>>  int teamd_event_ifinfo_admin_state_changed(struct teamd_context *ctx,
>>                                            struct team_ifinfo *ifinfo);
>> +int teamd_event_unlinked_ifinfo_hwaddr_changed(struct teamd_context *ctx,
>> +                                              struct team_ifinfo *ifinfo);
>>  int teamd_events_init(struct teamd_context *ctx);
>>  void teamd_events_fini(struct teamd_context *ctx);
>>  int teamd_event_watch_register(struct teamd_context *ctx,
>> @@ -313,6 +317,7 @@ static inline unsigned int teamd_port_count(struct teamd_context *ctx)
>>         return ctx->port_obj_list_count;
>>  }
>>
>> +int teamd_port_add(struct teamd_context *ctx, uint32_t ifindex);
>>  int teamd_port_add_ifname(struct teamd_context *ctx, const char *port_name);
>>  int teamd_port_remove_ifname(struct teamd_context *ctx, const char *port_name);
>>  int teamd_port_remove_all(struct teamd_context *ctx);
>> diff --git a/teamd/teamd_events.c b/teamd/teamd_events.c
>> index 1a95974..a377090 100644
>> --- a/teamd/teamd_events.c
>> +++ b/teamd/teamd_events.c
>> @@ -184,6 +184,23 @@ int teamd_event_ifinfo_admin_state_changed(struct teamd_context *ctx,
>>         return 0;
>>  }
>>
>> +int teamd_event_unlinked_ifinfo_hwaddr_changed(struct teamd_context *ctx,
>> +                                              struct team_ifinfo *ifinfo)
>> +{
>> +       struct event_watch_item *watch;
>> +       int err;
>> +
>> +       list_for_each_node_entry(watch, &ctx->event_watch_list, list) {
>> +               if (watch->ops->unlinked_hwaddr_changed) {
>
>I would probably flip the order of this. There is no point in doing
>the loop if unlinked_hwaddr_changed is NULL. So you could probably
>check for the function pointer first and then run the loop if it is
>set.

Sure. As I said, quick and dirty patch :)


>
>> +                       err = watch->ops->unlinked_hwaddr_changed(ctx, ifinfo,
>> +                                                                 watch->priv);
>> +                       if (err)
>> +                               return err;
>> +               }
>> +       }
>> +       return 0;
>> +}
>> +
>>  int teamd_events_init(struct teamd_context *ctx)
>>  {
>>         list_init(&ctx->event_watch_list);
>> diff --git a/teamd/teamd_ifinfo_watch.c b/teamd/teamd_ifinfo_watch.c
>> index f334ff6..8d01a76 100644
>> --- a/teamd/teamd_ifinfo_watch.c
>> +++ b/teamd/teamd_ifinfo_watch.c
>> @@ -60,6 +60,15 @@ static int ifinfo_change_handler_func(struct team_handle *th, void *priv,
>>                                 return err;
>>                 }
>>         }
>> +
>> +       team_for_each_unlinked_ifinfo(ifinfo, th) {
>> +               if (team_is_ifinfo_hwaddr_changed(ifinfo) ||
>> +                   team_is_ifinfo_hwaddr_len_changed(ifinfo)) {
>> +                       err = teamd_event_unlinked_ifinfo_hwaddr_changed(ctx, ifinfo);
>> +                       if (err)
>> +                               return err;
>> +               }
>> +       }
>
>I guess this is needed for the generic case, but as I said we wouldn't
>probably need to worry about this in the VF/virtio case as the VM is
>probably locked to a specific MAC address.
>
>Also I am not sure about this bit. It seems like this is only checking
>for the HW addr being changed. Is that bit set if a new interface is
>registered? I haven't worked on teamd so I am not familiar with how it
>handles new interfaces. Also how does this handle existing interfaces
>that were registered before you started this?

See my reply above.


>
>>         return 0;
>>  }
>>
>> diff --git a/teamd/teamd_per_port.c b/teamd/teamd_per_port.c
>> index 09d1dc7..21e1bda 100644
>> --- a/teamd/teamd_per_port.c
>> +++ b/teamd/teamd_per_port.c
>> @@ -331,6 +331,11 @@ next_one:
>>         return tdport;
>>  }
>>
>> +int teamd_port_add(struct teamd_context *ctx, uint32_t ifindex)
>> +{
>> +       return team_port_add(ctx->th, ifindex);
>> +}
>> +
>>  int teamd_port_add_ifname(struct teamd_context *ctx, const char *port_name)
>>  {
>>         uint32_t ifindex;
>> @@ -338,7 +343,7 @@ int teamd_port_add_ifname(struct teamd_context *ctx, const char *port_name)
>>         ifindex = team_ifname2ifindex(ctx->th, port_name);
>>         teamd_log_dbg("%s: Adding port (found ifindex \"%d\").",
>>                       port_name, ifindex);
>> -       return team_port_add(ctx->th, ifindex);
>> +       return teamd_port_add(ctx, ifindex);
>>  }
>>
>>  static int teamd_port_remove(struct teamd_context *ctx,

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Alexander Duyck @ 2018-02-21 15:56 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Jakub Kicinski, Samudrala, Sridhar, virtualization, Siwei Liu,
	Netdev, David Miller
In-Reply-To: <20180221095159.GA1996@nanopsycho>

On Wed, Feb 21, 2018 at 1:51 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Tue, Feb 20, 2018 at 11:33:56PM CET, kubakici@wp.pl wrote:
>>On Tue, 20 Feb 2018 21:14:10 +0100, Jiri Pirko wrote:
>>> Yeah, I can see it now :( I guess that the ship has sailed and we are
>>> stuck with this ugly thing forever...
>>>
>>> Could you at least make some common code that is shared in between
>>> netvsc and virtio_net so this is handled in exacly the same way in both?
>>
>>IMHO netvsc is a vendor specific driver which made a mistake on what
>>behaviour it provides (or tried to align itself with Windows SR-IOV).
>>Let's not make a far, far more commonly deployed and important driver
>>(virtio) bug-compatible with netvsc.
>
> Yeah. netvsc solution is a dangerous precedent here and in my opinition
> it was a huge mistake to merge it. I personally would vote to unmerge it
> and make the solution based on team/bond.
>
>
>>
>>To Jiri's initial comments, I feel the same way, in fact I've talked to
>>the NetworkManager guys to get auto-bonding based on MACs handled in
>>user space.  I think it may very well get done in next versions of NM,
>>but isn't done yet.  Stephen also raised the point that not everybody is
>>using NM.
>
> Can be done in NM, networkd or other network management tools.
> Even easier to do this in teamd and let them all benefit.
>
> Actually, I took a stab to implement this in teamd. Took me like an hour
> and half.
>
> You can just run teamd with config option "kidnap" like this:
> # teamd/teamd -c '{"kidnap": true }'
>
> Whenever teamd sees another netdev to appear with the same mac as his,
> or whenever teamd sees another netdev to change mac to his,
> it enslaves it.
>
> Here's the patch (quick and dirty):
>
> Subject: [patch teamd] teamd: introduce kidnap feature
>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>

So this doesn't really address the original problem we were trying to
solve. You asked earlier why the netdev name mattered and it mostly
has to do with configuration. Specifically what our patch is
attempting to resolve is the issue of how to allow a cloud provider to
upgrade their customer to SR-IOV support and live migration without
requiring them to reconfigure their guest. So the general idea with
our patch is to take a VM that is running with virtio_net only and
allow it to instead spawn a virtio_bypass master using the same netdev
name as the original virtio, and then have the virtio_net and VF come
up and be enslaved by the bypass interface. Doing it this way we can
allow for multi-vendor SR-IOV live migration support using a guest
that was originally configured for virtio only.

The problem with your solution is we already have teaming and bonding
as you said. There is already a write-up from Red Hat on how to do it
(https://access.redhat.com/documentation/en-us/red_hat_virtualization/4.1/html/virtual_machine_management_guide/sect-migrating_virtual_machines_between_hosts).
That is all well and good as long as you are willing to keep around
two VM images, one for virtio, and one for SR-IOV with live migration.
The problem is nobody wants to do that. What they want is to maintain
one guest image and if they decide to upgrade to SR-IOV they still
want their live migration and they don't want to have to reconfigure
the guest.

That said it does seem to make the existing Red Hat solution easier to
manage since you wouldn't be guessing at ifname so I have provided
some feedback below.

> ---
>  include/team.h             |  7 +++++++
>  libteam/ifinfo.c           | 20 ++++++++++++++++++++
>  teamd/teamd.c              | 17 +++++++++++++++++
>  teamd/teamd.h              |  5 +++++
>  teamd/teamd_events.c       | 17 +++++++++++++++++
>  teamd/teamd_ifinfo_watch.c |  9 +++++++++
>  teamd/teamd_per_port.c     |  7 ++++++-
>  7 files changed, 81 insertions(+), 1 deletion(-)
>
> diff --git a/include/team.h b/include/team.h
> index 9ae517d..b0c19c8 100644
> --- a/include/team.h
> +++ b/include/team.h
> @@ -137,6 +137,13 @@ struct team_ifinfo *team_get_next_ifinfo(struct team_handle *th,
>  #define team_for_each_ifinfo(ifinfo, th)                       \
>         for (ifinfo = team_get_next_ifinfo(th, NULL); ifinfo;   \
>              ifinfo = team_get_next_ifinfo(th, ifinfo))
> +
> +struct team_ifinfo *team_get_next_unlinked_ifinfo(struct team_handle *th,
> +                                                 struct team_ifinfo *ifinfo);
> +#define team_for_each_unlinked_ifinfo(ifinfo, th)                      \
> +       for (ifinfo = team_get_next_unlinked_ifinfo(th, NULL); ifinfo;  \
> +            ifinfo = team_get_next_unlinked_ifinfo(th, ifinfo))
> +
>  /* ifinfo getters */
>  bool team_is_ifinfo_removed(struct team_ifinfo *ifinfo);
>  uint32_t team_get_ifinfo_ifindex(struct team_ifinfo *ifinfo);
> diff --git a/libteam/ifinfo.c b/libteam/ifinfo.c
> index 5c32a9c..8f9548e 100644
> --- a/libteam/ifinfo.c
> +++ b/libteam/ifinfo.c
> @@ -494,6 +494,26 @@ struct team_ifinfo *team_get_next_ifinfo(struct team_handle *th,
>         return NULL;
>  }
>
> +/**
> + * @param th           libteam library context
> + * @param ifinfo       ifinfo structure
> + *
> + * @details Get next unlinked ifinfo in list.
> + *
> + * @return Ifinfo next to ifinfo passed.
> + **/
> +TEAM_EXPORT
> +struct team_ifinfo *team_get_next_unlinked_ifinfo(struct team_handle *th,
> +                                                 struct team_ifinfo *ifinfo)
> +{
> +       do {
> +               ifinfo = list_get_next_node_entry(&th->ifinfo_list, ifinfo, list);
> +               if (ifinfo && !ifinfo->linked)
> +                       return ifinfo;
> +       } while (ifinfo);
> +       return NULL;
> +}
> +
>  /**
>   * @param ifinfo       ifinfo structure
>   *
> diff --git a/teamd/teamd.c b/teamd/teamd.c
> index aac2511..069c7f0 100644
> --- a/teamd/teamd.c
> +++ b/teamd/teamd.c
> @@ -926,8 +926,25 @@ static int teamd_event_watch_port_added(struct teamd_context *ctx,
>         return 0;
>  }
>
> +static int teamd_event_watch_unlinked_hwaddr_changed(struct teamd_context *ctx,
> +                                                    struct team_ifinfo *ifinfo,
> +                                                    void *priv)
> +{
> +       int err;
> +       bool kidnap;
> +
> +       err = teamd_config_bool_get(ctx, &kidnap, "$.kidnap");
> +       if (err || !kidnap ||
> +           ctx->hwaddr_len != team_get_ifinfo_hwaddr_len(ifinfo) ||
> +           memcmp(team_get_ifinfo_hwaddr(ifinfo),
> +                  ctx->hwaddr, ctx->hwaddr_len))
> +               return 0;
> +       return teamd_port_add(ctx, team_get_ifinfo_ifindex(ifinfo));
> +}
> +

So I am not sure about the name of this function. It seems to imply
that we want to capture a device if it changed its MAC address to
match the one we are using. I suppose that works if we are making this
a genreric thing that can run on any netdev, but our focus is virtio
and VFs. In the grand scheme of things they shouldn't be able to
change their MAC address in most environments that we will care about.
That was one of the reasons why we didn't bother supporting a MAC
change in our code since the hypervisor should have this locked and
attempting to use a different MAC address would likely trigger the VM
as being flagged as malicious.

>  static const struct teamd_event_watch_ops teamd_port_watch_ops = {
>         .port_added = teamd_event_watch_port_added,
> +       .unlinked_hwaddr_changed = teamd_event_watch_unlinked_hwaddr_changed,
>  };
>
>  static int teamd_port_watch_init(struct teamd_context *ctx)
> diff --git a/teamd/teamd.h b/teamd/teamd.h
> index 5dbfb9b..171a8d1 100644
> --- a/teamd/teamd.h
> +++ b/teamd/teamd.h
> @@ -189,6 +189,8 @@ struct teamd_event_watch_ops {
>                                    struct teamd_port *tdport, void *priv);
>         int (*port_ifname_changed)(struct teamd_context *ctx,
>                                    struct teamd_port *tdport, void *priv);
> +       int (*unlinked_hwaddr_changed)(struct teamd_context *ctx,
> +                                      struct team_ifinfo *ifinfo, void *priv);
>         int (*option_changed)(struct teamd_context *ctx,
>                               struct team_option *option, void *priv);
>         char *option_changed_match_name;
> @@ -210,6 +212,8 @@ int teamd_event_ifinfo_ifname_changed(struct teamd_context *ctx,
>                                       struct team_ifinfo *ifinfo);
>  int teamd_event_ifinfo_admin_state_changed(struct teamd_context *ctx,
>                                            struct team_ifinfo *ifinfo);
> +int teamd_event_unlinked_ifinfo_hwaddr_changed(struct teamd_context *ctx,
> +                                              struct team_ifinfo *ifinfo);
>  int teamd_events_init(struct teamd_context *ctx);
>  void teamd_events_fini(struct teamd_context *ctx);
>  int teamd_event_watch_register(struct teamd_context *ctx,
> @@ -313,6 +317,7 @@ static inline unsigned int teamd_port_count(struct teamd_context *ctx)
>         return ctx->port_obj_list_count;
>  }
>
> +int teamd_port_add(struct teamd_context *ctx, uint32_t ifindex);
>  int teamd_port_add_ifname(struct teamd_context *ctx, const char *port_name);
>  int teamd_port_remove_ifname(struct teamd_context *ctx, const char *port_name);
>  int teamd_port_remove_all(struct teamd_context *ctx);
> diff --git a/teamd/teamd_events.c b/teamd/teamd_events.c
> index 1a95974..a377090 100644
> --- a/teamd/teamd_events.c
> +++ b/teamd/teamd_events.c
> @@ -184,6 +184,23 @@ int teamd_event_ifinfo_admin_state_changed(struct teamd_context *ctx,
>         return 0;
>  }
>
> +int teamd_event_unlinked_ifinfo_hwaddr_changed(struct teamd_context *ctx,
> +                                              struct team_ifinfo *ifinfo)
> +{
> +       struct event_watch_item *watch;
> +       int err;
> +
> +       list_for_each_node_entry(watch, &ctx->event_watch_list, list) {
> +               if (watch->ops->unlinked_hwaddr_changed) {

I would probably flip the order of this. There is no point in doing
the loop if unlinked_hwaddr_changed is NULL. So you could probably
check for the function pointer first and then run the loop if it is
set.

> +                       err = watch->ops->unlinked_hwaddr_changed(ctx, ifinfo,
> +                                                                 watch->priv);
> +                       if (err)
> +                               return err;
> +               }
> +       }
> +       return 0;
> +}
> +
>  int teamd_events_init(struct teamd_context *ctx)
>  {
>         list_init(&ctx->event_watch_list);
> diff --git a/teamd/teamd_ifinfo_watch.c b/teamd/teamd_ifinfo_watch.c
> index f334ff6..8d01a76 100644
> --- a/teamd/teamd_ifinfo_watch.c
> +++ b/teamd/teamd_ifinfo_watch.c
> @@ -60,6 +60,15 @@ static int ifinfo_change_handler_func(struct team_handle *th, void *priv,
>                                 return err;
>                 }
>         }
> +
> +       team_for_each_unlinked_ifinfo(ifinfo, th) {
> +               if (team_is_ifinfo_hwaddr_changed(ifinfo) ||
> +                   team_is_ifinfo_hwaddr_len_changed(ifinfo)) {
> +                       err = teamd_event_unlinked_ifinfo_hwaddr_changed(ctx, ifinfo);
> +                       if (err)
> +                               return err;
> +               }
> +       }

I guess this is needed for the generic case, but as I said we wouldn't
probably need to worry about this in the VF/virtio case as the VM is
probably locked to a specific MAC address.

Also I am not sure about this bit. It seems like this is only checking
for the HW addr being changed. Is that bit set if a new interface is
registered? I haven't worked on teamd so I am not familiar with how it
handles new interfaces. Also how does this handle existing interfaces
that were registered before you started this?

>         return 0;
>  }
>
> diff --git a/teamd/teamd_per_port.c b/teamd/teamd_per_port.c
> index 09d1dc7..21e1bda 100644
> --- a/teamd/teamd_per_port.c
> +++ b/teamd/teamd_per_port.c
> @@ -331,6 +331,11 @@ next_one:
>         return tdport;
>  }
>
> +int teamd_port_add(struct teamd_context *ctx, uint32_t ifindex)
> +{
> +       return team_port_add(ctx->th, ifindex);
> +}
> +
>  int teamd_port_add_ifname(struct teamd_context *ctx, const char *port_name)
>  {
>         uint32_t ifindex;
> @@ -338,7 +343,7 @@ int teamd_port_add_ifname(struct teamd_context *ctx, const char *port_name)
>         ifindex = team_ifname2ifindex(ctx->th, port_name);
>         teamd_log_dbg("%s: Adding port (found ifindex \"%d\").",
>                       port_name, ifindex);
> -       return team_port_add(ctx->th, ifindex);
> +       return teamd_port_add(ctx, ifindex);
>  }
>
>  static int teamd_port_remove(struct teamd_context *ctx,

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jiri Pirko @ 2018-02-21  9:51 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Samudrala, Sridhar, Alexander Duyck, virtualization, Siwei Liu,
	Netdev, David Miller
In-Reply-To: <20180220143356.3467084d@cakuba.netronome.com>

Tue, Feb 20, 2018 at 11:33:56PM CET, kubakici@wp.pl wrote:
>On Tue, 20 Feb 2018 21:14:10 +0100, Jiri Pirko wrote:
>> Yeah, I can see it now :( I guess that the ship has sailed and we are
>> stuck with this ugly thing forever...
>> 
>> Could you at least make some common code that is shared in between
>> netvsc and virtio_net so this is handled in exacly the same way in both?
>
>IMHO netvsc is a vendor specific driver which made a mistake on what
>behaviour it provides (or tried to align itself with Windows SR-IOV).
>Let's not make a far, far more commonly deployed and important driver
>(virtio) bug-compatible with netvsc.

Yeah. netvsc solution is a dangerous precedent here and in my opinition
it was a huge mistake to merge it. I personally would vote to unmerge it
and make the solution based on team/bond.


>
>To Jiri's initial comments, I feel the same way, in fact I've talked to
>the NetworkManager guys to get auto-bonding based on MACs handled in
>user space.  I think it may very well get done in next versions of NM,
>but isn't done yet.  Stephen also raised the point that not everybody is
>using NM.

Can be done in NM, networkd or other network management tools.
Even easier to do this in teamd and let them all benefit.

Actually, I took a stab to implement this in teamd. Took me like an hour
and half.

You can just run teamd with config option "kidnap" like this:
# teamd/teamd -c '{"kidnap": true }'

Whenever teamd sees another netdev to appear with the same mac as his,
or whenever teamd sees another netdev to change mac to his,
it enslaves it.

Here's the patch (quick and dirty):

Subject: [patch teamd] teamd: introduce kidnap feature

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/team.h             |  7 +++++++
 libteam/ifinfo.c           | 20 ++++++++++++++++++++
 teamd/teamd.c              | 17 +++++++++++++++++
 teamd/teamd.h              |  5 +++++
 teamd/teamd_events.c       | 17 +++++++++++++++++
 teamd/teamd_ifinfo_watch.c |  9 +++++++++
 teamd/teamd_per_port.c     |  7 ++++++-
 7 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/include/team.h b/include/team.h
index 9ae517d..b0c19c8 100644
--- a/include/team.h
+++ b/include/team.h
@@ -137,6 +137,13 @@ struct team_ifinfo *team_get_next_ifinfo(struct team_handle *th,
 #define team_for_each_ifinfo(ifinfo, th)			\
 	for (ifinfo = team_get_next_ifinfo(th, NULL); ifinfo;	\
 	     ifinfo = team_get_next_ifinfo(th, ifinfo))
+
+struct team_ifinfo *team_get_next_unlinked_ifinfo(struct team_handle *th,
+						  struct team_ifinfo *ifinfo);
+#define team_for_each_unlinked_ifinfo(ifinfo, th)			\
+	for (ifinfo = team_get_next_unlinked_ifinfo(th, NULL); ifinfo;	\
+	     ifinfo = team_get_next_unlinked_ifinfo(th, ifinfo))
+
 /* ifinfo getters */
 bool team_is_ifinfo_removed(struct team_ifinfo *ifinfo);
 uint32_t team_get_ifinfo_ifindex(struct team_ifinfo *ifinfo);
diff --git a/libteam/ifinfo.c b/libteam/ifinfo.c
index 5c32a9c..8f9548e 100644
--- a/libteam/ifinfo.c
+++ b/libteam/ifinfo.c
@@ -494,6 +494,26 @@ struct team_ifinfo *team_get_next_ifinfo(struct team_handle *th,
 	return NULL;
 }
 
+/**
+ * @param th		libteam library context
+ * @param ifinfo	ifinfo structure
+ *
+ * @details Get next unlinked ifinfo in list.
+ *
+ * @return Ifinfo next to ifinfo passed.
+ **/
+TEAM_EXPORT
+struct team_ifinfo *team_get_next_unlinked_ifinfo(struct team_handle *th,
+						  struct team_ifinfo *ifinfo)
+{
+	do {
+		ifinfo = list_get_next_node_entry(&th->ifinfo_list, ifinfo, list);
+		if (ifinfo && !ifinfo->linked)
+			return ifinfo;
+	} while (ifinfo);
+	return NULL;
+}
+
 /**
  * @param ifinfo	ifinfo structure
  *
diff --git a/teamd/teamd.c b/teamd/teamd.c
index aac2511..069c7f0 100644
--- a/teamd/teamd.c
+++ b/teamd/teamd.c
@@ -926,8 +926,25 @@ static int teamd_event_watch_port_added(struct teamd_context *ctx,
 	return 0;
 }
 
+static int teamd_event_watch_unlinked_hwaddr_changed(struct teamd_context *ctx,
+						     struct team_ifinfo *ifinfo,
+						     void *priv)
+{
+	int err;
+	bool kidnap;
+
+	err = teamd_config_bool_get(ctx, &kidnap, "$.kidnap");
+	if (err || !kidnap ||
+	    ctx->hwaddr_len != team_get_ifinfo_hwaddr_len(ifinfo) ||
+	    memcmp(team_get_ifinfo_hwaddr(ifinfo),
+		   ctx->hwaddr, ctx->hwaddr_len))
+		return 0;
+	return teamd_port_add(ctx, team_get_ifinfo_ifindex(ifinfo));
+}
+
 static const struct teamd_event_watch_ops teamd_port_watch_ops = {
 	.port_added = teamd_event_watch_port_added,
+	.unlinked_hwaddr_changed = teamd_event_watch_unlinked_hwaddr_changed,
 };
 
 static int teamd_port_watch_init(struct teamd_context *ctx)
diff --git a/teamd/teamd.h b/teamd/teamd.h
index 5dbfb9b..171a8d1 100644
--- a/teamd/teamd.h
+++ b/teamd/teamd.h
@@ -189,6 +189,8 @@ struct teamd_event_watch_ops {
 				   struct teamd_port *tdport, void *priv);
 	int (*port_ifname_changed)(struct teamd_context *ctx,
 				   struct teamd_port *tdport, void *priv);
+	int (*unlinked_hwaddr_changed)(struct teamd_context *ctx,
+				       struct team_ifinfo *ifinfo, void *priv);
 	int (*option_changed)(struct teamd_context *ctx,
 			      struct team_option *option, void *priv);
 	char *option_changed_match_name;
@@ -210,6 +212,8 @@ int teamd_event_ifinfo_ifname_changed(struct teamd_context *ctx,
 				      struct team_ifinfo *ifinfo);
 int teamd_event_ifinfo_admin_state_changed(struct teamd_context *ctx,
 					   struct team_ifinfo *ifinfo);
+int teamd_event_unlinked_ifinfo_hwaddr_changed(struct teamd_context *ctx,
+					       struct team_ifinfo *ifinfo);
 int teamd_events_init(struct teamd_context *ctx);
 void teamd_events_fini(struct teamd_context *ctx);
 int teamd_event_watch_register(struct teamd_context *ctx,
@@ -313,6 +317,7 @@ static inline unsigned int teamd_port_count(struct teamd_context *ctx)
 	return ctx->port_obj_list_count;
 }
 
+int teamd_port_add(struct teamd_context *ctx, uint32_t ifindex);
 int teamd_port_add_ifname(struct teamd_context *ctx, const char *port_name);
 int teamd_port_remove_ifname(struct teamd_context *ctx, const char *port_name);
 int teamd_port_remove_all(struct teamd_context *ctx);
diff --git a/teamd/teamd_events.c b/teamd/teamd_events.c
index 1a95974..a377090 100644
--- a/teamd/teamd_events.c
+++ b/teamd/teamd_events.c
@@ -184,6 +184,23 @@ int teamd_event_ifinfo_admin_state_changed(struct teamd_context *ctx,
 	return 0;
 }
 
+int teamd_event_unlinked_ifinfo_hwaddr_changed(struct teamd_context *ctx,
+					       struct team_ifinfo *ifinfo)
+{
+	struct event_watch_item *watch;
+	int err;
+
+	list_for_each_node_entry(watch, &ctx->event_watch_list, list) {
+		if (watch->ops->unlinked_hwaddr_changed) {
+			err = watch->ops->unlinked_hwaddr_changed(ctx, ifinfo,
+								  watch->priv);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
 int teamd_events_init(struct teamd_context *ctx)
 {
 	list_init(&ctx->event_watch_list);
diff --git a/teamd/teamd_ifinfo_watch.c b/teamd/teamd_ifinfo_watch.c
index f334ff6..8d01a76 100644
--- a/teamd/teamd_ifinfo_watch.c
+++ b/teamd/teamd_ifinfo_watch.c
@@ -60,6 +60,15 @@ static int ifinfo_change_handler_func(struct team_handle *th, void *priv,
 				return err;
 		}
 	}
+
+	team_for_each_unlinked_ifinfo(ifinfo, th) {
+		if (team_is_ifinfo_hwaddr_changed(ifinfo) ||
+		    team_is_ifinfo_hwaddr_len_changed(ifinfo)) {
+			err = teamd_event_unlinked_ifinfo_hwaddr_changed(ctx, ifinfo);
+			if (err)
+				return err;
+		}
+	}
 	return 0;
 }
 
diff --git a/teamd/teamd_per_port.c b/teamd/teamd_per_port.c
index 09d1dc7..21e1bda 100644
--- a/teamd/teamd_per_port.c
+++ b/teamd/teamd_per_port.c
@@ -331,6 +331,11 @@ next_one:
 	return tdport;
 }
 
+int teamd_port_add(struct teamd_context *ctx, uint32_t ifindex)
+{
+	return team_port_add(ctx->th, ifindex);
+}
+
 int teamd_port_add_ifname(struct teamd_context *ctx, const char *port_name)
 {
 	uint32_t ifindex;
@@ -338,7 +343,7 @@ int teamd_port_add_ifname(struct teamd_context *ctx, const char *port_name)
 	ifindex = team_ifname2ifindex(ctx->th, port_name);
 	teamd_log_dbg("%s: Adding port (found ifindex \"%d\").",
 		      port_name, ifindex);
-	return team_port_add(ctx->th, ifindex);
+	return teamd_port_add(ctx, ifindex);
 }
 
 static int teamd_port_remove(struct teamd_context *ctx,
-- 
2.14.3

^ permalink raw reply related

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jakub Kicinski @ 2018-02-20 22:33 UTC (permalink / raw)
  To: Jiri Pirko, Samudrala, Sridhar, Alexander Duyck
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin, Netdev,
	virtualization, Siwei Liu, David Miller
In-Reply-To: <20180220201410.GF2031@nanopsycho>

On Tue, 20 Feb 2018 21:14:10 +0100, Jiri Pirko wrote:
> Yeah, I can see it now :( I guess that the ship has sailed and we are
> stuck with this ugly thing forever...
> 
> Could you at least make some common code that is shared in between
> netvsc and virtio_net so this is handled in exacly the same way in both?

IMHO netvsc is a vendor specific driver which made a mistake on what
behaviour it provides (or tried to align itself with Windows SR-IOV).
Let's not make a far, far more commonly deployed and important driver
(virtio) bug-compatible with netvsc.

To Jiri's initial comments, I feel the same way, in fact I've talked to
the NetworkManager guys to get auto-bonding based on MACs handled in
user space.  I think it may very well get done in next versions of NM,
but isn't done yet.  Stephen also raised the point that not everybody is
using NM.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox