Linux virtualization list
 help / color / mirror / Atom feed
* [PATCH v2 4/6] crypto: virtio: convert to new crypto engine API
From: Corentin Labbe @ 2018-01-26 19:15 UTC (permalink / raw)
  To: alexandre.torgue, arei.gonglei, corbet, davem, herbert, jasowang,
	mcoquelin.stm32, mst, fabien.dessenne
  Cc: linux-doc, linux-kernel, virtualization, linux-sunxi,
	Corentin Labbe, linux-crypto, linux-arm-kernel
In-Reply-To: <20180126191534.17569-1-clabbe.montjoie@gmail.com>

This patch convert the driver to the new crypto engine API.

Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
---
 drivers/crypto/virtio/virtio_crypto_algs.c   | 16 ++++++++++------
 drivers/crypto/virtio/virtio_crypto_common.h |  3 +--
 drivers/crypto/virtio/virtio_crypto_core.c   |  3 ---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/crypto/virtio/virtio_crypto_algs.c b/drivers/crypto/virtio/virtio_crypto_algs.c
index abe8c15450df..ba190cfa7aa1 100644
--- a/drivers/crypto/virtio/virtio_crypto_algs.c
+++ b/drivers/crypto/virtio/virtio_crypto_algs.c
@@ -29,6 +29,7 @@
 
 
 struct virtio_crypto_ablkcipher_ctx {
+	struct crypto_engine_ctx enginectx;
 	struct virtio_crypto *vcrypto;
 	struct crypto_tfm *tfm;
 
@@ -491,7 +492,7 @@ static int virtio_crypto_ablkcipher_encrypt(struct ablkcipher_request *req)
 	vc_sym_req->ablkcipher_req = req;
 	vc_sym_req->encrypt = true;
 
-	return crypto_transfer_cipher_request_to_engine(data_vq->engine, req);
+	return crypto_transfer_ablkcipher_request_to_engine(data_vq->engine, req);
 }
 
 static int virtio_crypto_ablkcipher_decrypt(struct ablkcipher_request *req)
@@ -511,7 +512,7 @@ static int virtio_crypto_ablkcipher_decrypt(struct ablkcipher_request *req)
 	vc_sym_req->ablkcipher_req = req;
 	vc_sym_req->encrypt = false;
 
-	return crypto_transfer_cipher_request_to_engine(data_vq->engine, req);
+	return crypto_transfer_ablkcipher_request_to_engine(data_vq->engine, req);
 }
 
 static int virtio_crypto_ablkcipher_init(struct crypto_tfm *tfm)
@@ -521,6 +522,9 @@ static int virtio_crypto_ablkcipher_init(struct crypto_tfm *tfm)
 	tfm->crt_ablkcipher.reqsize = sizeof(struct virtio_crypto_sym_request);
 	ctx->tfm = tfm;
 
+	ctx->enginectx.op.do_one_request = virtio_crypto_ablkcipher_crypt_req;
+	ctx->enginectx.op.prepare_request = NULL;
+	ctx->enginectx.op.unprepare_request = NULL;
 	return 0;
 }
 
@@ -538,9 +542,9 @@ static void virtio_crypto_ablkcipher_exit(struct crypto_tfm *tfm)
 }
 
 int virtio_crypto_ablkcipher_crypt_req(
-	struct crypto_engine *engine,
-	struct ablkcipher_request *req)
+	struct crypto_engine *engine, void *vreq)
 {
+	struct ablkcipher_request *req = container_of(vreq, struct ablkcipher_request, base);
 	struct virtio_crypto_sym_request *vc_sym_req =
 				ablkcipher_request_ctx(req);
 	struct virtio_crypto_request *vc_req = &vc_sym_req->base;
@@ -561,8 +565,8 @@ static void virtio_crypto_ablkcipher_finalize_req(
 	struct ablkcipher_request *req,
 	int err)
 {
-	crypto_finalize_cipher_request(vc_sym_req->base.dataq->engine,
-					req, err);
+	crypto_finalize_ablkcipher_request(vc_sym_req->base.dataq->engine,
+					   req, err);
 	kzfree(vc_sym_req->iv);
 	virtcrypto_clear_request(&vc_sym_req->base);
 }
diff --git a/drivers/crypto/virtio/virtio_crypto_common.h b/drivers/crypto/virtio/virtio_crypto_common.h
index e976539a05d9..72621bd67211 100644
--- a/drivers/crypto/virtio/virtio_crypto_common.h
+++ b/drivers/crypto/virtio/virtio_crypto_common.h
@@ -107,8 +107,7 @@ struct virtio_crypto *virtcrypto_get_dev_node(int node);
 int virtcrypto_dev_start(struct virtio_crypto *vcrypto);
 void virtcrypto_dev_stop(struct virtio_crypto *vcrypto);
 int virtio_crypto_ablkcipher_crypt_req(
-	struct crypto_engine *engine,
-	struct ablkcipher_request *req);
+	struct crypto_engine *engine, void *vreq);
 
 void
 virtcrypto_clear_request(struct virtio_crypto_request *vc_req);
diff --git a/drivers/crypto/virtio/virtio_crypto_core.c b/drivers/crypto/virtio/virtio_crypto_core.c
index ff1410a32c2b..83326986c113 100644
--- a/drivers/crypto/virtio/virtio_crypto_core.c
+++ b/drivers/crypto/virtio/virtio_crypto_core.c
@@ -111,9 +111,6 @@ static int virtcrypto_find_vqs(struct virtio_crypto *vi)
 			ret = -ENOMEM;
 			goto err_engine;
 		}
-
-		vi->data_vq[i].engine->cipher_one_request =
-			virtio_crypto_ablkcipher_crypt_req;
 	}
 
 	kfree(names);
-- 
2.13.6

^ permalink raw reply related

* [PATCH v2 5/6] crypto: stm32-hash: convert to the new crypto engine API
From: Corentin Labbe @ 2018-01-26 19:15 UTC (permalink / raw)
  To: alexandre.torgue, arei.gonglei, corbet, davem, herbert, jasowang,
	mcoquelin.stm32, mst, fabien.dessenne
  Cc: linux-doc, linux-kernel, virtualization, linux-sunxi,
	Corentin Labbe, linux-crypto, linux-arm-kernel
In-Reply-To: <20180126191534.17569-1-clabbe.montjoie@gmail.com>

This patch convert the stm32-hash driver to the new crypto engine API.

Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Tested-by: Fabien Dessenne <fabien.dessenne@st.com>
---
 drivers/crypto/stm32/stm32-hash.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/crypto/stm32/stm32-hash.c b/drivers/crypto/stm32/stm32-hash.c
index 4ca4a264a833..89b0c2490d80 100644
--- a/drivers/crypto/stm32/stm32-hash.c
+++ b/drivers/crypto/stm32/stm32-hash.c
@@ -122,6 +122,7 @@ enum stm32_hash_data_format {
 #define HASH_DMA_THRESHOLD		50
 
 struct stm32_hash_ctx {
+	struct crypto_engine_ctx enginectx;
 	struct stm32_hash_dev	*hdev;
 	unsigned long		flags;
 
@@ -828,15 +829,19 @@ static int stm32_hash_hw_init(struct stm32_hash_dev *hdev,
 	return 0;
 }
 
+static int stm32_hash_one_request(struct crypto_engine *engine, void *areq);
+static int stm32_hash_prepare_req(struct crypto_engine *engine, void *areq);
+
 static int stm32_hash_handle_queue(struct stm32_hash_dev *hdev,
 				   struct ahash_request *req)
 {
 	return crypto_transfer_hash_request_to_engine(hdev->engine, req);
 }
 
-static int stm32_hash_prepare_req(struct crypto_engine *engine,
-				  struct ahash_request *req)
+static int stm32_hash_prepare_req(struct crypto_engine *engine, void *areq)
 {
+	struct ahash_request *req = container_of(areq, struct ahash_request,
+						 base);
 	struct stm32_hash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
 	struct stm32_hash_dev *hdev = stm32_hash_find_dev(ctx);
 	struct stm32_hash_request_ctx *rctx;
@@ -854,9 +859,10 @@ static int stm32_hash_prepare_req(struct crypto_engine *engine,
 	return stm32_hash_hw_init(hdev, rctx);
 }
 
-static int stm32_hash_one_request(struct crypto_engine *engine,
-				  struct ahash_request *req)
+static int stm32_hash_one_request(struct crypto_engine *engine, void *areq)
 {
+	struct ahash_request *req = container_of(areq, struct ahash_request,
+						 base);
 	struct stm32_hash_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
 	struct stm32_hash_dev *hdev = stm32_hash_find_dev(ctx);
 	struct stm32_hash_request_ctx *rctx;
@@ -1033,6 +1039,9 @@ static int stm32_hash_cra_init_algs(struct crypto_tfm *tfm,
 	if (algs_hmac_name)
 		ctx->flags |= HASH_FLAGS_HMAC;
 
+	ctx->enginectx.op.do_one_request = stm32_hash_one_request;
+	ctx->enginectx.op.prepare_request = stm32_hash_prepare_req;
+	ctx->enginectx.op.unprepare_request = NULL;
 	return 0;
 }
 
@@ -1493,9 +1502,6 @@ static int stm32_hash_probe(struct platform_device *pdev)
 		goto err_engine;
 	}
 
-	hdev->engine->prepare_hash_request = stm32_hash_prepare_req;
-	hdev->engine->hash_one_request = stm32_hash_one_request;
-
 	ret = crypto_engine_start(hdev->engine);
 	if (ret)
 		goto err_engine_start;
-- 
2.13.6

^ permalink raw reply related

* [PATCH v2 6/6] crypto: stm32-cryp: convert to the new crypto engine API
From: Corentin Labbe @ 2018-01-26 19:15 UTC (permalink / raw)
  To: alexandre.torgue, arei.gonglei, corbet, davem, herbert, jasowang,
	mcoquelin.stm32, mst, fabien.dessenne
  Cc: linux-doc, linux-kernel, virtualization, linux-sunxi,
	Corentin Labbe, linux-crypto, linux-arm-kernel
In-Reply-To: <20180126191534.17569-1-clabbe.montjoie@gmail.com>

This patch convert the stm32-cryp driver to the new crypto engine API.
Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Tested-by: Fabien Dessenne <fabien.dessenne@st.com>
---
 drivers/crypto/stm32/stm32-cryp.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/drivers/crypto/stm32/stm32-cryp.c b/drivers/crypto/stm32/stm32-cryp.c
index cf1dddbeaa2c..a816b2ffcaad 100644
--- a/drivers/crypto/stm32/stm32-cryp.c
+++ b/drivers/crypto/stm32/stm32-cryp.c
@@ -91,6 +91,7 @@
 #define _walked_out             (cryp->out_walk.offset - cryp->out_sg->offset)
 
 struct stm32_cryp_ctx {
+	struct crypto_engine_ctx enginectx;
 	struct stm32_cryp       *cryp;
 	int                     keylen;
 	u32                     key[AES_KEYSIZE_256 / sizeof(u32)];
@@ -478,7 +479,7 @@ static void stm32_cryp_finish_req(struct stm32_cryp *cryp)
 		free_pages((unsigned long)buf_out, pages);
 	}
 
-	crypto_finalize_cipher_request(cryp->engine, cryp->req, err);
+	crypto_finalize_ablkcipher_request(cryp->engine, cryp->req, err);
 	cryp->req = NULL;
 
 	memset(cryp->ctx->key, 0, cryp->ctx->keylen);
@@ -494,10 +495,19 @@ static int stm32_cryp_cpu_start(struct stm32_cryp *cryp)
 	return 0;
 }
 
+static int stm32_cryp_cipher_one_req(struct crypto_engine *engine, void *areq);
+static int stm32_cryp_prepare_cipher_req(struct crypto_engine *engine,
+					 void *areq);
+
 static int stm32_cryp_cra_init(struct crypto_tfm *tfm)
 {
+	struct stm32_cryp_ctx *ctx = crypto_tfm_ctx(tfm);
+
 	tfm->crt_ablkcipher.reqsize = sizeof(struct stm32_cryp_reqctx);
 
+	ctx->enginectx.op.do_one_request = stm32_cryp_cipher_one_req;
+	ctx->enginectx.op.prepare_request = stm32_cryp_prepare_cipher_req;
+	ctx->enginectx.op.unprepare_request = NULL;
 	return 0;
 }
 
@@ -513,7 +523,7 @@ static int stm32_cryp_crypt(struct ablkcipher_request *req, unsigned long mode)
 
 	rctx->mode = mode;
 
-	return crypto_transfer_cipher_request_to_engine(cryp->engine, req);
+	return crypto_transfer_ablkcipher_request_to_engine(cryp->engine, req);
 }
 
 static int stm32_cryp_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
@@ -695,14 +705,20 @@ static int stm32_cryp_prepare_req(struct crypto_engine *engine,
 }
 
 static int stm32_cryp_prepare_cipher_req(struct crypto_engine *engine,
-					 struct ablkcipher_request *req)
+					 void *areq)
 {
+	struct ablkcipher_request *req = container_of(areq,
+						      struct ablkcipher_request,
+						      base);
+
 	return stm32_cryp_prepare_req(engine, req);
 }
 
-static int stm32_cryp_cipher_one_req(struct crypto_engine *engine,
-				     struct ablkcipher_request *req)
+static int stm32_cryp_cipher_one_req(struct crypto_engine *engine, void *areq)
 {
+	struct ablkcipher_request *req = container_of(areq,
+						      struct ablkcipher_request,
+						      base);
 	struct stm32_cryp_ctx *ctx = crypto_ablkcipher_ctx(
 			crypto_ablkcipher_reqtfm(req));
 	struct stm32_cryp *cryp = ctx->cryp;
@@ -1104,9 +1120,6 @@ static int stm32_cryp_probe(struct platform_device *pdev)
 		goto err_engine1;
 	}
 
-	cryp->engine->prepare_cipher_request = stm32_cryp_prepare_cipher_req;
-	cryp->engine->cipher_one_request = stm32_cryp_cipher_one_req;
-
 	ret = crypto_engine_start(cryp->engine);
 	if (ret) {
 		dev_err(dev, "Could not start crypto engine\n");
-- 
2.13.6

^ permalink raw reply related

* Re: [PATCH v24 1/2] mm: support reporting free page blocks
From: Michael S. Tsirkin @ 2018-01-26 21:43 UTC (permalink / raw)
  To: Wei Wang
  Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
	liliang.opensource, linux-kernel, mhocko, linux-mm, pbonzini,
	akpm, virtualization
In-Reply-To: <20180126155224-mutt-send-email-mst@kernel.org>

On Fri, Jan 26, 2018 at 05:00:09PM +0200, Michael S. Tsirkin wrote:
> On Fri, Jan 26, 2018 at 11:29:15AM +0800, Wei Wang wrote:
> > On 01/25/2018 09:41 PM, Michael S. Tsirkin wrote:
> > > On Wed, Jan 24, 2018 at 06:42:41PM +0800, Wei Wang wrote:
> > > > This patch adds support to walk through the free page blocks in the
> > > > system and report them via a callback function. Some page blocks may
> > > > leave the free list after zone->lock is released, so it is the caller's
> > > > responsibility to either detect or prevent the use of such pages.
> > > > 
> > > > One use example of this patch is to accelerate live migration by skipping
> > > > the transfer of free pages reported from the guest. A popular method used
> > > > by the hypervisor to track which part of memory is written during live
> > > > migration is to write-protect all the guest memory. So, those pages that
> > > > are reported as free pages but are written after the report function
> > > > returns will be captured by the hypervisor, and they will be added to the
> > > > next round of memory transfer.
> > > > 
> > > > Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> > > > Signed-off-by: Liang Li <liang.z.li@intel.com>
> > > > Cc: Michal Hocko <mhocko@kernel.org>
> > > > Cc: Michael S. Tsirkin <mst@redhat.com>
> > > > Acked-by: Michal Hocko <mhocko@kernel.org>
> > > > ---
> > > >   include/linux/mm.h |  6 ++++
> > > >   mm/page_alloc.c    | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > >   2 files changed, 97 insertions(+)
> > > > 
> > > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > > index ea818ff..b3077dd 100644
> > > > --- a/include/linux/mm.h
> > > > +++ b/include/linux/mm.h
> > > > @@ -1938,6 +1938,12 @@ extern void free_area_init_node(int nid, unsigned long * zones_size,
> > > >   		unsigned long zone_start_pfn, unsigned long *zholes_size);
> > > >   extern void free_initmem(void);
> > > > +extern void walk_free_mem_block(void *opaque,
> > > > +				int min_order,
> > > > +				bool (*report_pfn_range)(void *opaque,
> > > > +							 unsigned long pfn,
> > > > +							 unsigned long num));
> > > > +
> > > >   /*
> > > >    * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
> > > >    * into the buddy system. The freed pages will be poisoned with pattern
> > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > index 76c9688..705de22 100644
> > > > --- a/mm/page_alloc.c
> > > > +++ b/mm/page_alloc.c
> > > > @@ -4899,6 +4899,97 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
> > > >   	show_swap_cache_info();
> > > >   }
> > > > +/*
> > > > + * Walk through a free page list and report the found pfn range via the
> > > > + * callback.
> > > > + *
> > > > + * Return false if the callback requests to stop reporting. Otherwise,
> > > > + * return true.
> > > > + */
> > > > +static bool walk_free_page_list(void *opaque,
> > > > +				struct zone *zone,
> > > > +				int order,
> > > > +				enum migratetype mt,
> > > > +				bool (*report_pfn_range)(void *,
> > > > +							 unsigned long,
> > > > +							 unsigned long))
> > > > +{
> > > > +	struct page *page;
> > > > +	struct list_head *list;
> > > > +	unsigned long pfn, flags;
> > > > +	bool ret;
> > > > +
> > > > +	spin_lock_irqsave(&zone->lock, flags);
> > > > +	list = &zone->free_area[order].free_list[mt];
> > > > +	list_for_each_entry(page, list, lru) {
> > > > +		pfn = page_to_pfn(page);
> > > > +		ret = report_pfn_range(opaque, pfn, 1 << order);
> > > > +		if (!ret)
> > > > +			break;
> > > > +	}
> > > > +	spin_unlock_irqrestore(&zone->lock, flags);
> > > > +
> > > > +	return ret;
> > > > +}
> > > There are two issues with this API. One is that it is not
> > > restarteable: if you return false, you start from the
> > > beginning. So no way to drop lock, do something slow
> > > and then proceed.
> > > 
> > > Another is that you are using it to report free page hints. Presumably
> > > the point is to drop these pages - keeping them near head of the list
> > > and reusing the reported ones will just make everything slower
> > > invalidating the hint.
> > > 
> > > How about rotating these pages towards the end of the list?
> > > Probably not on each call, callect reported pages and then
> > > move them to tail when we exit.
> > 
> > 
> > I'm not sure how this would help. For example, we have a list of 2M free
> > page blocks:
> > A-->B-->C-->D-->E-->F-->G--H
> > 
> > After reporting A and B, and put them to the end and exit, when the caller
> > comes back,
> > 1) if the list remains unchanged, then it will be
> > C-->D-->E-->F-->G-->H-->A-->B
> 
> Right. So here we can just scan until we see A, right?  It's a harder
> question what to do if A and only A has been consumed.  We don't want B
> to be sent twice ideally. OTOH maybe that isn't a big deal if it's only
> twice. Host might know page is already gone - how about host gives us a
> hint after using the buffer?
> 
> > 2) If worse, all the blocks have been split into smaller blocks and used
> > after the caller comes back.
> > 
> > where could we continue?
> 
> I'm not sure. But an alternative appears to be to hold a lock
> and just block whoever wanted to use any pages.  Yes we are sending
> hints faster but apparently something wanted these pages, and holding
> the lock is interfering with this something.

I've been thinking about it. How about the following scheme:
1. register balloon to get a (new) callback when free list runs empty
2. take pages off the free list, add them to the balloon specific list
3. report to host
4. readd to free list at tail
5. if callback triggers, interrupt balloon reporting to host,
   and readd to free list at tail


This needs some thought wrt what happens when there are
multiple users of this API, but looks like it will work.

-- 
MST

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Siwei Liu @ 2018-01-26 21:46 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: Alexander Duyck, virtio-dev, Michael S. Tsirkin, Jakub Kicinski,
	Netdev, virtualization, David Miller
In-Reply-To: <731db0a0-85e7-a88e-6e0e-c540086347b5@intel.com>

On Fri, Jan 26, 2018 at 8:51 AM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
>
>
> On 1/26/2018 12:14 AM, Siwei Liu wrote:
>>
>> On Tue, Jan 23, 2018 at 2:58 PM, Michael S. Tsirkin <mst@redhat.com>
>> wrote:
>>>
>>> On Tue, Jan 23, 2018 at 12:24:47PM -0800, Siwei Liu wrote:
>>>>
>>>> On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin <mst@redhat.com>
>>>> wrote:
>>>>>
>>>>> On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote:
>>>>>>
>>>>>> First off, as mentioned in another thread, the model of stacking up
>>>>>> virt-bond functionality over virtio seems a wrong direction to me.
>>>>>> Essentially the migration process would need to carry over all guest
>>>>>> side configurations previously done on the VF/PT and get them moved to
>>>>>> the new device being it virtio or VF/PT.
>>>>>
>>>>> I might be wrong but I don't see why we should worry about this
>>>>> usecase.
>>>>> Whoever has a bond configured already has working config for migration.
>>>>> We are trying to help people who don't, not convert existig users.
>>>>
>>>> That has been placed in the view of cloud providers that the imported
>>>> images from the store must be able to run unmodified thus no
>>>> additional setup script is allowed (just as Stephen mentioned in
>>>> another mail). Cloud users don't care about live migration themselves
>>>> but the providers are required to implement such automation mechanism
>>>> to make this process transparent if at all possible. The user does not
>>>> care about the device underneath being VF or not, but they do care
>>>> about consistency all across and the resulting performance
>>>> acceleration in making VF the prefered datapath. It is not quite
>>>> peculiar user cases but IMHO *any* approach proposed for live
>>>> migration should be able to persist the state including network config
>>>> e.g. as simple as MTU. Actually this requirement has nothing to do
>>>> with virtio but our target users are live migration agnostic, being it
>>>> tracking DMA through dirty pages, using virtio as the helper, or
>>>> whatsoever, the goal of persisting configs across remains same.
>>>
>>> So the patching being discussed here will mostly do exactly that if your
>>> original config was simply a single virtio net device.
>>>
>> True, but I don't see the patch being discussed starts with good
>> foundation of supporting the same for VF/PT device. That is the core
>> of the issue.
>
>
>>> What kind of configs do your users have right now?
>>
>> Any configs be it generic or driver specific that the VF/PT device
>> supports and have been enabled/configured. General network configs
>> (MAC, IP address, VLAN, MTU, iptables rules), ethtool settings
>> (hardware offload, # of queues and ring entris, RSC options, rss
>> rxfh-indir table, rx-flow-hash, et al) , bpf/XDP program being run, tc
>> flower offload, just to name a few. As cloud providers we don't limit
>> users from applying driver specific tuning to the NIC/VF, and
>> sometimes this is essential to achieving best performance for their
>> workload. We've seen cases like tuning coalescing parameters for
>> getting low latency, changing rx-flow-hash function for better VXLAN
>> throughput, or even adopting quite advanced NIC features such as flow
>> director or cloud filter. We don't expect users to compromise even a
>> little bit on these. That is once we turn on live migration for the VF
>> or pass through devices in the VM, it all takes place under the hood,
>> users (guest admins, applications) don't have to react upon it or even
>> notice the change. I should note that the majority of live migrations
>> take place between machines with completely identical hardware, it's
>> more critical than necessary to keep the config as-is across the move,
>> stealth while quiet.
>
>
> This usecase is much more complicated and different than what this patch is
> trying
> to address.

Yep, it is technically difficult, but as cloud providers we would like
to take actions to address use case for our own if no one else is
willing to do so. However we're not seeking complicated design or
messing up the others such as your use case. As this is the first time
a real patch of the PV failover approach, although having be discussed
for years, posted to the mailing list. All voices suddenly came over,
various parties wish their specific needs added to the todo list, it's
indeed hard to accommodate all at once in the first place. I went
through same tough period of time while I was doing similar work so I
completely understand that. The task is not easy for sure. :)

The attempts I made was trying to consolidate all potential use cases
into one single solution rather than diverge from the very beginning.
It's in the phase of RFC and I don't want to wait expressing our
interest until very late.

>  Also your usecase seems to be assuming that source and
> destination
> hosts are identical and have the same HW.

Not exactly, this will be positioned as an optimization, but it is
crucial to our use case. While for the generic case with non-equal HW,
we can find out the least common denominator and apply those configs
that the new VF or the para virt driver can support without comprising
too much.

>
> It makes it pretty hard to transparently migrate all these settings with
> live
> migration when we are looking at a solution that unplugs the VF interface
> from
> the host and the VF driver is unloaded.
>
>
>> As you see generic bond or bridge cannot suffice the need. That's why
>> we need a new customized virt bond driver, and tailor it for VM live
>> migration specifically. Leveraging para-virtual e.g. virtio net device
>> as the backup path is one thing, tracking through driver config
>> changes in order to re-config as necessary is another. I would think
>> without considering the latter, the proposal being discussed is rather
>> incomplete, and remote to be useful in production.
>>
>>>
>>>>>> Without the help of a new
>>>>>> upper layer bond driver that enslaves virtio and VF/PT devices
>>>>>> underneath, virtio will be overloaded with too much specifics being a
>>>>>> VF/PT backup in the future.
>>>>>
>>>>> So this paragraph already includes at least two conflicting
>>>>> proposals. On the one hand you want a separate device for
>>>>> the virtual bond, on the other you are saying a separate
>>>>> driver.
>>>>
>>>> Just to be crystal clear: separate virtual bond device (netdev ops,
>>>> not necessarily bus device) for VM migration specifically with a
>>>> separate driver.
>>>
>>> Okay, but note that any config someone had on a virtio device won't
>>> propagate to that bond.
>>>
>>>>> Further, the reason to have a separate *driver* was that
>>>>> some people wanted to share code with netvsc - and that
>>>>> one does not create a separate device, which you can't
>>>>> change without breaking existing configs.
>>>>
>>>> I'm not sure I understand this statement. netvsc is already another
>>>> netdev being created than the enslaved VF netdev, why it bothers?
>>>
>>> Because it shipped, so userspace ABI is frozen.  You can't really add a
>>> netdevice and enslave an existing one without a risk of breaking some
>>> userspace configs.
>>>
>> I still don't understand this concern. Like said, before this patch
>> becomes reality, users interact with raw VF interface all the time.
>> Now this patch introduces a virtio net devive and enslave the VF.
>> Users have to interact with two interfaces - IP address and friends
>> configured on the VF will get lost and users have to reconfigure
>> virtio all over again. But some other configs e.g. ethtool needs to
>> remain on the VF. How does it guarantee existing configs won't broken?
>> Appears to me this is nothing different than having both virtio and VF
>> netdevs enslaved and users operates on the virt-bond interface
>> directly.
>
>
> Yes. This patch doesn't transparently provide live migration support to
> existing
> network configurations that only use a VF.  In order to get live migration
> support,
> for a VF only image, the network configuration has to change.
>
> It provides hypervisor controlled VF acceleration to existing virtio_net
> based network
> configurations in a transparent manner.

OK. But will you plan to address the former: transparently provide
support for existing network configuration with separate patches in
future? Say if go with your 2 netdevs approach.

>
>
>>
>> One thing I'd like to point out is the configs are mostly done in the
>> control plane. It's entirely possible to separate the data and control
>> paths in the new virt-bond driver: in the data plane, it may bypass
>> the virt-bond layer and quickly fall through to the data path of
>> virtio or VF slave; while in the control plane, the virt-bond may
>> disguise itself as the active slave, delegate the config changes to
>> the real driver, relay and expose driver config/state to the user. By
>> doing that the users and userspace applications just interact with one
>> single interface, the same way they interacted with the VF interface
>> as before. Users don't have to deal with the other two enslaved
>> interfaces directly - those automatically enslaved devices should be
>> made invisible from userspace applications and admins, and/or be
>> masked out from regular access by existing kernel APIs.
>>
>> I don't find it a good reason to reject the idea if we can sort out
>> ways not to break existing ABI or APIs.
>>
>>
>>>> In
>>>> the Azure case, the stock image to be imported does not bind to a
>>>> specific driver but only MAC address.
>>>
>>> I'll let netvsc developers decide this, on the surface I don't think
>>> it's reasonable to assume everyone only binds to a MAC.
>>
>> Sure. The point I wanted to make was that cloud providers are super
>> elastic in provisioning images - those driver or device specifics
>> should have been dehydrated from the original images thus make it
>> flexible enough to deploy to machines with vast varieties of hardware.
>> Although it's not necessarily the case everyone binds to a MAC, it's
>> worth taking a look at what the target users are doing and what the
>> pain points really are and understand what could be done to solve core
>> problems. Hyper-V netvsc can also benefit once moved to it, I'd
>> believe.
>>
>>>
>>>> And people just deal with the
>>>> new virt-bond netdev rather than the underlying virtio and VF. And
>>>> both these two underlying netdevs should be made invisible to prevent
>>>> userspace script from getting them misconfigured IMHO.
>>>>
>>>> A separate driver was for code sharing for sure, only just netvsc but
>>>> could be other para-virtual devices floating around: any PV can serve
>>>> as the side channel and the backup path for VF/PT. Once we get the new
>>>> driver working atop virtio we may define ops and/or protocol needed to
>>>> talk to various other PV frontend that may implement the side channel
>>>> of its own for datapath switching (e.g. virtio is one of them, Xen PV
>>>> frontend can be another). I just don't like to limit the function to
>>>> virtio only and we have to duplicate code then it starts to scatter
>>>> around all over the places.
>>>>
>>>> I understand right now we start it as simple so it may just be fine
>>>> that the initial development activities center around virtio. However,
>>>> from cloud provider/vendor perspective I don't see the proposed scheme
>>>> limits to virtio only. Any other PV driver which has the plan to
>>>> support the same scheme can benefit. The point is that we shouldn't be
>>>> limiting the scheme to virtio specifics so early which is hard to have
>>>> it promoted to a common driver once we get there.
>>>
>>> The whole idea has been floating around for years. It would always
>>> get being drowned in this kind of "lets try to cover all use-cases"
>>> discussions, and never make progress.
>>> So let's see some working code merged. If it works fine for virtio
>>> and turns out to be a good fit for netvsc, we can share code.
>>
>> I think we at least should start with a separate netdev other than
>> virtio. That is what we may agree to have to do without comprise I'd
>> hope.
>
> I think the usecase that you are targeting does require a new para virt bond
> driver
> and a new type of netdevice.
>
> For the usecase where the host is doing all the guest network
> tuning/optimizations

I'm not particularly sure about this use case for how the host is
capable of doing guest tuning/optimization, since you may be just
passing a pass-through device to the guest. Or are you talking about
VF specifically?

> and the VM is not expected to do any tuning/optimizations on the VF driver
> directly,
> i think the current patch that follows the netvsc model of 2 netdevs(virtio
> and vf) should
> work fine.

OK. For your use case that's fine. But that's too specific scenario
with lots of restrictions IMHO, perhaps very few users will benefit
from it, I'm not sure. If you're unwilling to move towards it, we'd
take this one and come back with a generic solution that is able to
address general use cases for VF/PT live migration .


Regards,
-Siwei

>>>
>>>
>>>>> So some people want a fully userspace-configurable switchdev, and that
>>>>> already exists at some level, and maybe it makes sense to add more
>>>>> features for performance.
>>>>>
>>>>> But the point was that some host configurations are very simple,
>>>>> and it probably makes sense to pass this information to the guest
>>>>> and have guest act on it directly. Let's not conflate the two.
>>>>
>>>> It may be fine to push some of the configurations from host but that
>>>> perhaps doesn't cover all the cases: how is it possible for the host
>>>> to save all network states and configs done by the guest before
>>>> migration. Some of the configs might come from future guest which is
>>>> unknown to host. Anyhow the bottom line is that the guest must be able
>>>> to act on those configuration request changes automatically without
>>>> involving users intervention.
>>>>
>>>> Regards,
>>>> -Siwei
>>>
>>> All use-cases are *already* covered by existing kernel APIs.  Just use a
>>> bond, or a bridge, or whatever. It's just that they are so generic and
>>> hard to use, that userspace to do it never surfaced.
>>
>> As mentioned earlier, for which I cannot stress enough, the existing
>> generic bond or bridge doesn't work. We need a new net device that
>> works for live migration specifically and fits it well.
>>
>>> So I am interested in some code that handles some simple use-cases
>>> in the kernel, with a simple kernel API.
>>
>> It should be fine, I like simple stuffs too and wouldn't like to make
>> complications. The concept of hiding auto-managed interfaces is not
>> new and has even been implemented by other operating systems already.
>> Not sure if that is your compatibility concern. We start with simple
>> for sure, but simple != in-expandable then make potential users
>> impossible to use at all.
>>
>>
>> Thanks,
>> -Siwei
>>
>>>>> --
>>>>> MST
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: virtio-dev-unsubscribe@lists.oasis-open.org
>> For additional commands, e-mail: virtio-dev-help@lists.oasis-open.org
>>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: virtio-dev-unsubscribe@lists.oasis-open.org
> For additional commands, e-mail: virtio-dev-help@lists.oasis-open.org
>

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Michael S. Tsirkin @ 2018-01-26 22:14 UTC (permalink / raw)
  To: Siwei Liu
  Cc: Alexander Duyck, virtio-dev, Jakub Kicinski, Samudrala, Sridhar,
	virtualization, Netdev, David Miller
In-Reply-To: <CADGSJ205yeyGV2nJ9Cr=FwPypxAHb12jx3WitS6UmWSJ9VP42Q@mail.gmail.com>

On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:
> > and the VM is not expected to do any tuning/optimizations on the VF driver
> > directly,
> > i think the current patch that follows the netvsc model of 2 netdevs(virtio
> > and vf) should
> > work fine.
> 
> OK. For your use case that's fine. But that's too specific scenario
> with lots of restrictions IMHO, perhaps very few users will benefit
> from it, I'm not sure. If you're unwilling to move towards it, we'd
> take this one and come back with a generic solution that is able to
> address general use cases for VF/PT live migration .

I think that's a fine approach. Scratch your own itch!  I imagine a very
generic virtio-switchdev providing host routing info to guests could
address lots of usecases. A driver could bind to that one and enslave
arbitrary other devices.  Sounds reasonable.

But given the fundamental idea of a failover was floated at least as
early as 2013, and made 0 progress since precisely because it kept
trying to address more and more features, and given netvsc is already
using the basic solution with some success, I'm not inclined to block
this specific effort waiting for the generic one.

-- 
MST

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Jakub Kicinski @ 2018-01-26 22:47 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Alexander Duyck, virtio-dev, Netdev, virtualization, Siwei Liu,
	Samudrala, Sridhar, David Miller
In-Reply-To: <20180127000831-mutt-send-email-mst@kernel.org>

On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:
> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:
> > > and the VM is not expected to do any tuning/optimizations on the VF driver
> > > directly,
> > > i think the current patch that follows the netvsc model of 2 netdevs(virtio
> > > and vf) should
> > > work fine.  
> > 
> > OK. For your use case that's fine. But that's too specific scenario
> > with lots of restrictions IMHO, perhaps very few users will benefit
> > from it, I'm not sure. If you're unwilling to move towards it, we'd
> > take this one and come back with a generic solution that is able to
> > address general use cases for VF/PT live migration .  
> 
> I think that's a fine approach. Scratch your own itch!  I imagine a very
> generic virtio-switchdev providing host routing info to guests could
> address lots of usecases. A driver could bind to that one and enslave
> arbitrary other devices.  Sounds reasonable.
> 
> But given the fundamental idea of a failover was floated at least as
> early as 2013, and made 0 progress since precisely because it kept
> trying to address more and more features, and given netvsc is already
> using the basic solution with some success, I'm not inclined to block
> this specific effort waiting for the generic one.

I think there is an agreement that the extra netdev will be useful for
more advanced use cases, and is generally preferable.  What is the
argument for not doing that from the start?  If it was made I must have
missed it.  Is it just unwillingness to write the extra 300 lines of
code?  Sounds like a pretty weak argument when adding kernel ABI is at
stake...

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Samudrala, Sridhar @ 2018-01-26 23:30 UTC (permalink / raw)
  To: Jakub Kicinski, Michael S. Tsirkin
  Cc: Alexander Duyck, virtio-dev, Netdev, virtualization, Siwei Liu,
	David Miller
In-Reply-To: <20180126144704.6e1a9628@cakuba.netronome.com>


On 1/26/2018 2:47 PM, Jakub Kicinski wrote:
> On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:
>> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:
>>>> and the VM is not expected to do any tuning/optimizations on the VF driver
>>>> directly,
>>>> i think the current patch that follows the netvsc model of 2 netdevs(virtio
>>>> and vf) should
>>>> work fine.
>>> OK. For your use case that's fine. But that's too specific scenario
>>> with lots of restrictions IMHO, perhaps very few users will benefit
>>> from it, I'm not sure. If you're unwilling to move towards it, we'd
>>> take this one and come back with a generic solution that is able to
>>> address general use cases for VF/PT live migration .
>> I think that's a fine approach. Scratch your own itch!  I imagine a very
>> generic virtio-switchdev providing host routing info to guests could
>> address lots of usecases. A driver could bind to that one and enslave
>> arbitrary other devices.  Sounds reasonable.
>>
>> But given the fundamental idea of a failover was floated at least as
>> early as 2013, and made 0 progress since precisely because it kept
>> trying to address more and more features, and given netvsc is already
>> using the basic solution with some success, I'm not inclined to block
>> this specific effort waiting for the generic one.
> I think there is an agreement that the extra netdev will be useful for
> more advanced use cases, and is generally preferable.  What is the
> argument for not doing that from the start?  If it was made I must have
> missed it.  Is it just unwillingness to write the extra 300 lines of
> code?  Sounds like a pretty weak argument when adding kernel ABI is at
> stake...

I am still not clear on the need for the extra netdev created by 
virtio_net. The only advantage
i can see is that the stats can be broken between VF and virtio 
datapaths compared
to the aggregrated stats on virtio netdev as seen with the 2 netdev 
approach.

With 2 netdev model, any VM image that has a working network 
configuration will transparently get
VF based acceleration without any changes. 3 netdev model breaks this 
configuration starting with the
creation and naming of the 2 devices to udev needing to be aware of 
master and slave virtio-net devices.
Also, from a user experience point of view, loading a virtio-net with 
BACKUP feature
enabled will  now show 2 virtio-net netdevs.

For live migration with advanced usecases that Siwei is suggesting, i 
think we need a new driver
with a new device type that can track the VF specific feature settings 
even when the VF driver is unloaded.

Thanks
Sridhar


_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Jakub Kicinski @ 2018-01-27  2:30 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: Alexander Duyck, virtio-dev, Michael S. Tsirkin, Netdev,
	virtualization, Siwei Liu, David Miller
In-Reply-To: <5fa8c6c6-4a94-91fa-fdbb-ee7b624d703f@intel.com>

On Fri, 26 Jan 2018 15:30:35 -0800, Samudrala, Sridhar wrote:
> On 1/26/2018 2:47 PM, Jakub Kicinski wrote:
> > On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:  
> >> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:  
> >>>> and the VM is not expected to do any tuning/optimizations on the VF driver
> >>>> directly,
> >>>> i think the current patch that follows the netvsc model of 2 netdevs(virtio
> >>>> and vf) should
> >>>> work fine.  
> >>> OK. For your use case that's fine. But that's too specific scenario
> >>> with lots of restrictions IMHO, perhaps very few users will benefit
> >>> from it, I'm not sure. If you're unwilling to move towards it, we'd
> >>> take this one and come back with a generic solution that is able to
> >>> address general use cases for VF/PT live migration .  
> >> I think that's a fine approach. Scratch your own itch!  I imagine a very
> >> generic virtio-switchdev providing host routing info to guests could
> >> address lots of usecases. A driver could bind to that one and enslave
> >> arbitrary other devices.  Sounds reasonable.
> >>
> >> But given the fundamental idea of a failover was floated at least as
> >> early as 2013, and made 0 progress since precisely because it kept
> >> trying to address more and more features, and given netvsc is already
> >> using the basic solution with some success, I'm not inclined to block
> >> this specific effort waiting for the generic one.  
> > I think there is an agreement that the extra netdev will be useful for
> > more advanced use cases, and is generally preferable.  What is the
> > argument for not doing that from the start?  If it was made I must have
> > missed it.  Is it just unwillingness to write the extra 300 lines of
> > code?  Sounds like a pretty weak argument when adding kernel ABI is at
> > stake...  
> 
> I am still not clear on the need for the extra netdev created by 
> virtio_net. The only advantage i can see is that the stats can be
> broken between VF and virtio datapaths compared to the aggregrated
> stats on virtio netdev as seen with the 2 netdev approach.

Maybe you're not convinced but multiple arguments were made.

> With 2 netdev model, any VM image that has a working network 
> configuration will transparently get VF based acceleration without
> any changes. 

Nothing happens transparently.  Things may happen automatically.  The
VF netdev doesn't disappear with netvsc.  The PV netdev transforms into
something it did not use to be.  And configures and reports some
information from the PV (e.g. speed) but PV doesn't pass traffic any
longer.

> 3 netdev model breaks this configuration starting with the creation
> and naming of the 2 devices to udev needing to be aware of master and
> slave virtio-net devices.

I don't understand this comment.  There is one virtio-net device and
one "virtio-bond" netdev.  And user space has to be aware of the special
automatic arrangement anyway, because it can't touch the VF.  It
doesn't make any difference whether it ignores the VF or PV and VF.
It simply can't touch the slaves, no matter how many there are.

> Also, from a user experience point of view, loading a virtio-net with
> BACKUP feature enabled will now show 2 virtio-net netdevs.

One virtio-net and one virtio-bond, which represents what's happening.

> For live migration with advanced usecases that Siwei is suggesting, i 
> think we need a new driver with a new device type that can track the
> VF specific feature settings even when the VF driver is unloaded.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Samudrala, Sridhar @ 2018-01-27  5:33 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Alexander Duyck, virtio-dev, Michael S. Tsirkin, Netdev,
	virtualization, Siwei Liu, David Miller
In-Reply-To: <20180126183003.591cd5c5@cakuba.netronome.com>

On 1/26/2018 6:30 PM, Jakub Kicinski wrote:
> On Fri, 26 Jan 2018 15:30:35 -0800, Samudrala, Sridhar wrote:
>> On 1/26/2018 2:47 PM, Jakub Kicinski wrote:
>>> On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:
>>>> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:
>>>>>> and the VM is not expected to do any tuning/optimizations on the VF driver
>>>>>> directly,
>>>>>> i think the current patch that follows the netvsc model of 2 netdevs(virtio
>>>>>> and vf) should
>>>>>> work fine.
>>>>> OK. For your use case that's fine. But that's too specific scenario
>>>>> with lots of restrictions IMHO, perhaps very few users will benefit
>>>>> from it, I'm not sure. If you're unwilling to move towards it, we'd
>>>>> take this one and come back with a generic solution that is able to
>>>>> address general use cases for VF/PT live migration .
>>>> I think that's a fine approach. Scratch your own itch!  I imagine a very
>>>> generic virtio-switchdev providing host routing info to guests could
>>>> address lots of usecases. A driver could bind to that one and enslave
>>>> arbitrary other devices.  Sounds reasonable.
>>>>
>>>> But given the fundamental idea of a failover was floated at least as
>>>> early as 2013, and made 0 progress since precisely because it kept
>>>> trying to address more and more features, and given netvsc is already
>>>> using the basic solution with some success, I'm not inclined to block
>>>> this specific effort waiting for the generic one.
>>> I think there is an agreement that the extra netdev will be useful for
>>> more advanced use cases, and is generally preferable.  What is the
>>> argument for not doing that from the start?  If it was made I must have
>>> missed it.  Is it just unwillingness to write the extra 300 lines of
>>> code?  Sounds like a pretty weak argument when adding kernel ABI is at
>>> stake...
>> I am still not clear on the need for the extra netdev created by
>> virtio_net. The only advantage i can see is that the stats can be
>> broken between VF and virtio datapaths compared to the aggregrated
>> stats on virtio netdev as seen with the 2 netdev approach.
> Maybe you're not convinced but multiple arguments were made.
All the arguments seem to either saying that semantically this doesn't 
look like
a bond OR suggesting usecases that this patch is not trying to solve.
This approach should help cloud environments where the guest networking 
is fully
controlled from the hypervisor via the PF driver or via port representor 
when switchdev
mode is enabled. The guest admin is not expected or allowed to make any 
networking
changes from the VM.

>
>> With 2 netdev model, any VM image that has a working network
>> configuration will transparently get VF based acceleration without
>> any changes.
> Nothing happens transparently.  Things may happen automatically.  The
> VF netdev doesn't disappear with netvsc.  The PV netdev transforms into
> something it did not use to be.  And configures and reports some
> information from the PV (e.g. speed) but PV doesn't pass traffic any
> longer.

>> 3 netdev model breaks this configuration starting with the creation
>> and naming of the 2 devices to udev needing to be aware of master and
>> slave virtio-net devices.
> I don't understand this comment.  There is one virtio-net device and
> one "virtio-bond" netdev.  And user space has to be aware of the special
> automatic arrangement anyway, because it can't touch the VF.  It
> doesn't make any difference whether it ignores the VF or PV and VF.
> It simply can't touch the slaves, no matter how many there are.

If the userspace is not expected to touch the slaves, then why do we need to
take extra effort to expose a netdev that is just not really useful.

>
>> Also, from a user experience point of view, loading a virtio-net with
>> BACKUP feature enabled will now show 2 virtio-net netdevs.
> One virtio-net and one virtio-bond, which represents what's happening.
This again assumes that we want to represent a bond setup. Can't we 
treat this
as virtio-net providing an alternate low-latency datapath by taking over 
VF datapath?
>
>> For live migration with advanced usecases that Siwei is suggesting, i
>> think we need a new driver with a new device type that can track the
>> VF specific feature settings even when the VF driver is unloaded.

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Jakub Kicinski @ 2018-01-27  5:58 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: Alexander Duyck, virtio-dev, Michael S. Tsirkin, Netdev,
	virtualization, Siwei Liu, David Miller
In-Reply-To: <e02e21a1-ab5d-132a-fb33-5e0ba42fc9ac@intel.com>

On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
> >> 3 netdev model breaks this configuration starting with the creation
> >> and naming of the 2 devices to udev needing to be aware of master and
> >> slave virtio-net devices.  
> > I don't understand this comment.  There is one virtio-net device and
> > one "virtio-bond" netdev.  And user space has to be aware of the special
> > automatic arrangement anyway, because it can't touch the VF.  It
> > doesn't make any difference whether it ignores the VF or PV and VF.
> > It simply can't touch the slaves, no matter how many there are.  
> 
> If the userspace is not expected to touch the slaves, then why do we need to
> take extra effort to expose a netdev that is just not really useful.

You said:
"[user space] needs to be aware of master and slave virtio-net devices."

I'm saying:
It has to be aware of the special arrangement whether there is an
explicit bond netdev or not.

> >> Also, from a user experience point of view, loading a virtio-net with
> >> BACKUP feature enabled will now show 2 virtio-net netdevs.  
> > One virtio-net and one virtio-bond, which represents what's happening.  
> This again assumes that we want to represent a bond setup. Can't we 
> treat this
> as virtio-net providing an alternate low-latency datapath by taking over 
> VF datapath?

Bond is just a familiar name, we can call it something else if you
prefer.  The point is there are two data paths which can have
independent low-level settings and a higher level entity with
global settings which represents any path to the outside world.

Hiding low-level netdevs from a lay user requires a generic solution.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* RE: [PATCH v24 1/2] mm: support reporting free page blocks
From: Wang, Wei W @ 2018-01-27 13:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: yang.zhang.wz@gmail.com, virtio-dev@lists.oasis-open.org,
	riel@redhat.com, quan.xu0@gmail.com, kvm@vger.kernel.org,
	nilal@redhat.com, liliang.opensource@gmail.com,
	linux-kernel@vger.kernel.org, mhocko@kernel.org,
	linux-mm@kvack.org, pbonzini@redhat.com,
	akpm@linux-foundation.org,
	virtualization@lists.linux-foundation.org
In-Reply-To: <20180126233950-mutt-send-email-mst@kernel.org>

On Saturday, January 27, 2018 5:44 AM, Michael S. Tsirkin wrote:
> On Fri, Jan 26, 2018 at 05:00:09PM +0200, Michael S. Tsirkin wrote:
> > On Fri, Jan 26, 2018 at 11:29:15AM +0800, Wei Wang wrote:
> > > On 01/25/2018 09:41 PM, Michael S. Tsirkin wrote:
> > > > On Wed, Jan 24, 2018 at 06:42:41PM +0800, Wei Wang wrote:
> > > 2) If worse, all the blocks have been split into smaller blocks and
> > > used after the caller comes back.
> > >
> > > where could we continue?
> >
> > I'm not sure. But an alternative appears to be to hold a lock and just
> > block whoever wanted to use any pages.  Yes we are sending hints
> > faster but apparently something wanted these pages, and holding the
> > lock is interfering with this something.
> 
> I've been thinking about it. How about the following scheme:
> 1. register balloon to get a (new) callback when free list runs empty 2. take
> pages off the free list, add them to the balloon specific list 3. report to host 4.
> readd to free list at tail 5. if callback triggers, interrupt balloon reporting to
> host,
>    and readd to free list at tail

So in step 2, when we walk through the free page list, take each block, and add them to the balloon specific list, is this performed under the mm lock? If it is still under the lock, then what would be the difference compared to walking through the free list, and add each block to virtqueue? 

Best,
Wei

^ permalink raw reply

* RE: [PATCH v24 1/2] mm: support reporting free page blocks
From: Wang, Wei W @ 2018-01-27 14:00 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: yang.zhang.wz@gmail.com, virtio-dev@lists.oasis-open.org,
	riel@redhat.com, quan.xu0@gmail.com, kvm@vger.kernel.org,
	nilal@redhat.com, liliang.opensource@gmail.com,
	linux-kernel@vger.kernel.org, mhocko@kernel.org,
	linux-mm@kvack.org, pbonzini@redhat.com,
	akpm@linux-foundation.org,
	virtualization@lists.linux-foundation.org
In-Reply-To: <20180126155224-mutt-send-email-mst@kernel.org>

On Friday, January 26, 2018 11:00 PM, Michael S. Tsirkin wrote:
> On Fri, Jan 26, 2018 at 11:29:15AM +0800, Wei Wang wrote:
> > On 01/25/2018 09:41 PM, Michael S. Tsirkin wrote:
> > > On Wed, Jan 24, 2018 at 06:42:41PM +0800, Wei Wang wrote:
> > > > This patch adds support to walk through the free page blocks in
> > > > the system and report them via a callback function. Some page
> > > > blocks may leave the free list after zone->lock is released, so it
> > > > is the caller's responsibility to either detect or prevent the use of such
> pages.
> > > >
> > > > One use example of this patch is to accelerate live migration by
> > > > skipping the transfer of free pages reported from the guest. A
> > > > popular method used by the hypervisor to track which part of
> > > > memory is written during live migration is to write-protect all
> > > > the guest memory. So, those pages that are reported as free pages
> > > > but are written after the report function returns will be captured
> > > > by the hypervisor, and they will be added to the next round of memory
> transfer.
> > > >
> > > > Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> > > > Signed-off-by: Liang Li <liang.z.li@intel.com>
> > > > Cc: Michal Hocko <mhocko@kernel.org>
> > > > Cc: Michael S. Tsirkin <mst@redhat.com>
> > > > Acked-by: Michal Hocko <mhocko@kernel.org>
> > > > ---
> > > >   include/linux/mm.h |  6 ++++
> > > >   mm/page_alloc.c    | 91
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > >   2 files changed, 97 insertions(+)
> > > >
> > > > diff --git a/include/linux/mm.h b/include/linux/mm.h index
> > > > ea818ff..b3077dd 100644
> > > > --- a/include/linux/mm.h
> > > > +++ b/include/linux/mm.h
> > > > @@ -1938,6 +1938,12 @@ extern void free_area_init_node(int nid,
> unsigned long * zones_size,
> > > >   		unsigned long zone_start_pfn, unsigned long *zholes_size);
> > > >   extern void free_initmem(void);
> > > > +extern void walk_free_mem_block(void *opaque,
> > > > +				int min_order,
> > > > +				bool (*report_pfn_range)(void *opaque,
> > > > +							 unsigned long pfn,
> > > > +							 unsigned long num));
> > > > +
> > > >   /*
> > > >    * Free reserved pages within range [PAGE_ALIGN(start), end &
> PAGE_MASK)
> > > >    * into the buddy system. The freed pages will be poisoned with
> > > > pattern diff --git a/mm/page_alloc.c b/mm/page_alloc.c index
> > > > 76c9688..705de22 100644
> > > > --- a/mm/page_alloc.c
> > > > +++ b/mm/page_alloc.c
> > > > @@ -4899,6 +4899,97 @@ void show_free_areas(unsigned int filter,
> nodemask_t *nodemask)
> > > >   	show_swap_cache_info();
> > > >   }
> > > > +/*
> > > > + * Walk through a free page list and report the found pfn range
> > > > +via the
> > > > + * callback.
> > > > + *
> > > > + * Return false if the callback requests to stop reporting.
> > > > +Otherwise,
> > > > + * return true.
> > > > + */
> > > > +static bool walk_free_page_list(void *opaque,
> > > > +				struct zone *zone,
> > > > +				int order,
> > > > +				enum migratetype mt,
> > > > +				bool (*report_pfn_range)(void *,
> > > > +							 unsigned long,
> > > > +							 unsigned long))
> > > > +{
> > > > +	struct page *page;
> > > > +	struct list_head *list;
> > > > +	unsigned long pfn, flags;
> > > > +	bool ret;
> > > > +
> > > > +	spin_lock_irqsave(&zone->lock, flags);
> > > > +	list = &zone->free_area[order].free_list[mt];
> > > > +	list_for_each_entry(page, list, lru) {
> > > > +		pfn = page_to_pfn(page);
> > > > +		ret = report_pfn_range(opaque, pfn, 1 << order);
> > > > +		if (!ret)
> > > > +			break;
> > > > +	}
> > > > +	spin_unlock_irqrestore(&zone->lock, flags);
> > > > +
> > > > +	return ret;
> > > > +}
> > > There are two issues with this API. One is that it is not
> > > restarteable: if you return false, you start from the beginning. So
> > > no way to drop lock, do something slow and then proceed.
> > >
> > > Another is that you are using it to report free page hints.
> > > Presumably the point is to drop these pages - keeping them near head
> > > of the list and reusing the reported ones will just make everything
> > > slower invalidating the hint.
> > >
> > > How about rotating these pages towards the end of the list?
> > > Probably not on each call, callect reported pages and then move them
> > > to tail when we exit.
> >
> >
> > I'm not sure how this would help. For example, we have a list of 2M
> > free page blocks:
> > A-->B-->C-->D-->E-->F-->G--H
> >
> > After reporting A and B, and put them to the end and exit, when the
> > caller comes back,
> > 1) if the list remains unchanged, then it will be
> > C-->D-->E-->F-->G-->H-->A-->B
> 
> Right. So here we can just scan until we see A, right?  It's a harder question
> what to do if A and only A has been consumed.  We don't want B to be sent
> twice ideally. OTOH maybe that isn't a big deal if it's only twice. Host might
> know page is already gone - how about host gives us a hint after using the
> buffer?
> 
> > 2) If worse, all the blocks have been split into smaller blocks and
> > used after the caller comes back.
> >
> > where could we continue?
> 
> I'm not sure. But an alternative appears to be to hold a lock and just block
> whoever wanted to use any pages.  Yes we are sending hints faster but
> apparently something wanted these pages, and holding the lock is interfering
> with this something.
> 
> >
> > The reason to think about "restart" is the worry about the virtqueue
> > is full, right? But we've agreed that losing some hints to report
> > isn't important, and in practice, the virtqueue won't be full as the
> > host side is faster.
> 
> It would be more convincing if we sent e.g. higher order pages first. As it is - it
> won't take long to stuff ring full of 4K pages and it seems highly unlikely that
> host won't ever be scheduled out.

Yes, actually we've already sent higher order pages first, please check this patch, we have:

for (order = MAX_ORDER - 1; order >= min_order; order--)
--> start from high order blocks. 

> 
> Can we maybe agree on what kind of benchmark makes sense for this work?
> I'm concerned that we are laser focused on just how long does it take to
> migrate ignoring e.g. slowdowns after migration.

Testing the time of linux compilation during migration? or what benchmark do you have in mind? 
We can compare how long it takes during legacy live migration and live migration with this feature?

If you really worry about this, we could also provide an configure option, e.g. under /sys/, for the guest to decide whether to enable or disable reporting free page hints to the host at any time. If disabled, in the balloon driver we skip the calling of walk_free_mem_block().


> > I'm concerned that actions on the free list may cause more controversy
> > though it might be safe to do from some aspect, and would be hard to
> > end debating. If possible, we could go with the most prudent approach
> > for now, and have more discussions in future improvement patches. What
> > would you think?
> 
> Well I'm not 100% about restartability. But keeping pages freed by host near
> head of the list looks kind of wrong.
> Try to float a patch on top for the rotation and see what happens?

I didn't get it, "pages freed by host", what does that mean?

Best,
Wei

^ permalink raw reply

* Re: Ping Re: [PATCH] virtio: make VIRTIO a menuconfig to ease disabling it all
From: Michael Ellerman @ 2018-01-28 10:14 UTC (permalink / raw)
  To: Michael S. Tsirkin, Jason Wang, Vincent Legoll, sfr
  Cc: Randy Dunlap, linux-kernel, virtualization
In-Reply-To: <CAEwRq=oJ0apphUFDheoX5d-BB5yjG3rZD8m4yhK75eZcm6eqVg@mail.gmail.com>

Vincent Legoll <vincent.legoll@gmail.com> writes:
> On 1/23/18, Michael Ellerman <mpe@ellerman.id.au> wrote:
>> This has been broken in linux-next for ~6 weeks now, can we please merge
>> this and get it fixed.
>
> Added Stephen Rothwell to cc

It should be fixed in the virtio tree, which the other Michael and Jason
maintain AFAIK.

cheers

^ permalink raw reply

* Re: [PATCH 4/6] x86: Consolidate PCI_MMCONFIG configs
From: Andy Shevchenko @ 2018-01-28 17:26 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: jailhouse-dev, linux-pci, x86, Linux Kernel Mailing List,
	virtualization, Ingo Molnar, H . Peter Anvin, Bjorn Helgaas,
	Thomas Gleixner
In-Reply-To: <5dcaec50bb59140c1cb63c35ce566ccc02188bcb.1516601570.git.jan.kiszka@siemens.com>

On Mon, Jan 22, 2018 at 8:12 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
>
> Not sure if those two worked by design or just by chance so far. In any
> case, it's at least cleaner and clearer to express this in a single
> config statement.

Congrats! You found by the way a bug in

commit e279b6c1d329e50b766bce96aacc197eae8a053b
Author: Sam Ravnborg <sam@ravnborg.org>
Date:   Tue Nov 6 20:41:05 2007 +0100

   x86: start unification of arch/x86/Kconfig.*

...and proper fix seems to split PCI stuff to common + X86_32 only + X86_64 only

-- 
With Best Regards,
Andy Shevchenko

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Alexander Duyck @ 2018-01-28 17:35 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Alexander Duyck, virtio-dev, Michael S. Tsirkin,
	Samudrala, Sridhar, virtualization, Siwei Liu, Netdev,
	David Miller
In-Reply-To: <20180126215829.7b3c6bac@cakuba.netronome.com>

On Fri, Jan 26, 2018 at 9:58 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
> On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
>> >> 3 netdev model breaks this configuration starting with the creation
>> >> and naming of the 2 devices to udev needing to be aware of master and
>> >> slave virtio-net devices.
>> > I don't understand this comment.  There is one virtio-net device and
>> > one "virtio-bond" netdev.  And user space has to be aware of the special
>> > automatic arrangement anyway, because it can't touch the VF.  It
>> > doesn't make any difference whether it ignores the VF or PV and VF.
>> > It simply can't touch the slaves, no matter how many there are.
>>
>> If the userspace is not expected to touch the slaves, then why do we need to
>> take extra effort to expose a netdev that is just not really useful.
>
> You said:
> "[user space] needs to be aware of master and slave virtio-net devices."
>
> I'm saying:
> It has to be aware of the special arrangement whether there is an
> explicit bond netdev or not.

To clarify here the kernel should be aware that there is a device that
is an aggregate of 2 other devices. It isn't as if we need to insert
the new device "above" the virtio.

I have been doing a bit of messing around with a few ideas and it
seems like it would be better if we could replace the virtio interface
with the virtio-bond, renaming my virt-bond concept to this since it
is now supposedly going to live in the virtio driver, interface, and
then push the original virtio down one layer and call it a
virtio-backup. If I am not mistaken we could assign the same dev
pointer used by the virtio netdev to the virtio-bond, and if we
register it first with the "eth%d" name then udev will assume that the
virtio-bond device is the original virtio and all existing scripts
should just work with that. We then would want to change the name of
the virtio interface with the backup feature bit set, maybe call it
something like bkup-00:00:00 where the 00:00:00 would be the last 3
octets of the MAC address. It should solve the issue of inserting an
interface "above" the virtio by making the virtio-bond become the
virtio. The only limitation is that we will probably need to remove
the back-up if the virtio device is removed, however that was already
a limitation of this solution and others like the netvsc solution
anyway.

>> >> Also, from a user experience point of view, loading a virtio-net with
>> >> BACKUP feature enabled will now show 2 virtio-net netdevs.
>> > One virtio-net and one virtio-bond, which represents what's happening.
>> This again assumes that we want to represent a bond setup. Can't we
>> treat this
>> as virtio-net providing an alternate low-latency datapath by taking over
>> VF datapath?
>
> Bond is just a familiar name, we can call it something else if you
> prefer.  The point is there are two data paths which can have
> independent low-level settings and a higher level entity with
> global settings which represents any path to the outside world.
>
> Hiding low-level netdevs from a lay user requires a generic solution.

The last thing I think we should be doing is hiding the low level
netdevs. It doesn't solve anythying. We are already trusting the owner
of the VM enough to let them have root access of the VM. That means
they can load/unload any driver they want. They don't have to use the
kernel provided virtio driver, they could load their own. They could
even do something like run DPDK on top of it, or they could run DPDK
on top of the VF. In either case there is no way the bond would ever
be created and all hiding devices does is make it easier to fix
problems when something gets broken. Unless I am mistaken, and
"security through obscurity" has somehow become a valid security
model.

As I mentioned to Sridhar on an off-list thread I think the goal
should be to make it so that the user wants to use the virtio-bond,
not make it so that they have no choice but to use it. The idea is we
should be making things easier for the administrator of the VM, not
harder.

- Alex

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Samudrala, Sridhar @ 2018-01-28 19:18 UTC (permalink / raw)
  To: Alexander Duyck, Jakub Kicinski
  Cc: Alexander Duyck, virtio-dev, Michael S. Tsirkin, Netdev,
	virtualization, Siwei Liu, David Miller
In-Reply-To: <CAKgT0Ud4e9O-FoY242Kn_yBX7JWtL_i=yW+Y-MXQ9BGEVBEkgA@mail.gmail.com>

On 1/28/2018 9:35 AM, Alexander Duyck wrote:
> On Fri, Jan 26, 2018 at 9:58 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>> On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
>>>>> 3 netdev model breaks this configuration starting with the creation
>>>>> and naming of the 2 devices to udev needing to be aware of master and
>>>>> slave virtio-net devices.
>>>> I don't understand this comment.  There is one virtio-net device and
>>>> one "virtio-bond" netdev.  And user space has to be aware of the special
>>>> automatic arrangement anyway, because it can't touch the VF.  It
>>>> doesn't make any difference whether it ignores the VF or PV and VF.
>>>> It simply can't touch the slaves, no matter how many there are.
>>> If the userspace is not expected to touch the slaves, then why do we need to
>>> take extra effort to expose a netdev that is just not really useful.
>> You said:
>> "[user space] needs to be aware of master and slave virtio-net devices."
>>
>> I'm saying:
>> It has to be aware of the special arrangement whether there is an
>> explicit bond netdev or not.
> To clarify here the kernel should be aware that there is a device that
> is an aggregate of 2 other devices. It isn't as if we need to insert
> the new device "above" the virtio.
>
> I have been doing a bit of messing around with a few ideas and it
> seems like it would be better if we could replace the virtio interface
> with the virtio-bond, renaming my virt-bond concept to this since it
> is now supposedly going to live in the virtio driver, interface, and
> then push the original virtio down one layer and call it a
> virtio-backup. If I am not mistaken we could assign the same dev
> pointer used by the virtio netdev to the virtio-bond, and if we
> register it first with the "eth%d" name then udev will assume that the
> virtio-bond device is the original virtio and all existing scripts
> should just work with that. We then would want to change the name of
> the virtio interface with the backup feature bit set, maybe call it
> something like bkup-00:00:00 where the 00:00:00 would be the last 3
> octets of the MAC address. It should solve the issue of inserting an
> interface "above" the virtio by making the virtio-bond become the
> virtio. The only limitation is that we will probably need to remove
> the back-up if the virtio device is removed, however that was already
> a limitation of this solution and others like the netvsc solution
> anyway.

With 3 netdev model, if we make the the master virtio-net associated 
with the
real virtio pci device,  i think  the userspace scripts would not break.
If we go this route, i am still not clear on the purpose of exposing the 
bkup netdev.
Can we start with the 2 netdev model and move to 3 netdev model later if we
find out that there are limitiations with the 2 netdev model? I don't 
think this will
break any user API as the userspace is not expected to use the bkup netdev.

>
>>>>> Also, from a user experience point of view, loading a virtio-net with
>>>>> BACKUP feature enabled will now show 2 virtio-net netdevs.
>>>> One virtio-net and one virtio-bond, which represents what's happening.
>>> This again assumes that we want to represent a bond setup. Can't we
>>> treat this
>>> as virtio-net providing an alternate low-latency datapath by taking over
>>> VF datapath?
>> Bond is just a familiar name, we can call it something else if you
>> prefer.  The point is there are two data paths which can have
>> independent low-level settings and a higher level entity with
>> global settings which represents any path to the outside world.
>>
>> Hiding low-level netdevs from a lay user requires a generic solution.
> The last thing I think we should be doing is hiding the low level
> netdevs. It doesn't solve anythying. We are already trusting the owner
> of the VM enough to let them have root access of the VM. That means
> they can load/unload any driver they want. They don't have to use the
> kernel provided virtio driver, they could load their own. They could
> even do something like run DPDK on top of it, or they could run DPDK
> on top of the VF. In either case there is no way the bond would ever
> be created and all hiding devices does is make it easier to fix
> problems when something gets broken. Unless I am mistaken, and
> "security through obscurity" has somehow become a valid security
> model.
>
> As I mentioned to Sridhar on an off-list thread I think the goal
> should be to make it so that the user wants to use the virtio-bond,
> not make it so that they have no choice but to use it. The idea is we
> should be making things easier for the administrator of the VM, not
> harder.
>
>
When the hypervisor has enabled BACKUP bit along with a VF with the same
MAC address, the VM can use either VF only OR extended virtio with 
attached  VF.
Although there are 2 datapaths to the device, the hypervisor will enable 
only
one at any time.  The virtio via PF datapath is only enabled during live 
migration
when the VF is unplugged. If not, VF broadcasts/multicasts will get 
looped back
to the VM via the PF datapath. In the RFC path i was assuming that the 
VM can
use both datapaths and i had broadcasts/multicasts going over virtio 
datapath,
but i don' think it is a good idea.  It requires the device to disable 
broadcast
replication on the VFs.

Thanks
Sridhar
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* CISTI'2018 - Doctoral Symposium
From: Maria Lemos @ 2018-01-28 19:29 UTC (permalink / raw)
  To: virtualization


[-- Attachment #1.1: Type: text/plain, Size: 4313 bytes --]

* Proceedings published in IEEE Xplore and indexed by ISI, Scopus, etc.


---------------------------------------------------------------------------------------------------------------------------
Doctoral Symposium of CISTI'2018 - 13th Iberian Conference on Information Systems and Technologies
                                                   Caceres, Spain, 16 - 13 June 2018
                                                               http://www.cisti.eu/ <http://www.cisti.eu/>
------------------------------------------------------------------------------------------------------------------------------------



The purpose of CISTI'2018’s Doctoral Symposium is to provide graduate students a setting where they can, informally, expose and discuss their work, collecting valuable expert opinions and sharing new ideas, methods and applications. The Doctoral Symposium is an excellent opportunity for PhD students to present and discuss their work in a Workshop format. Each presentation will be evaluated by a panel composed by at least three Information Systems and Technologies experts.



Contributions Submission

The Doctoral Symposium is opened to PhD students whose research area includes the themes proposed for this Conference. Submissions must include an extended abstract (maximum 4 pages), following the Conference style guide <http://cisti.eu/2017/images/templates.zip>. All selected contributions will be handed out along with the Conference Proceedings, in CD with an ISBN. These contributions will be available in the IEEE Xplore <http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=7511893> Digital Library and will be sent for indexing in ISI, Scopus, EI-Compendex, INSPEC and Google Scholar.

Submissions must include the field, the PhD institution and the number of months devoted to the development of the work. Additionally, they should include in a clear and succinct manner:

    •    The problem approached and its significance or relevance
    •    The research objectives and related investigation topics
    •    A brief display of what is already known
    •    A proposed solution methodology for the problem
    •    Expected results




Important Dates

Paper submission: February 11, 2018

Notification of acceptance: March 18, 2018

Submission of accepted papers: March 30, 2018

Payment of registration, to ensure the inclusion of an accepted paper in the conference proceedings: April 1, 2018




Organizing Committee

Álvaro Rocha, Universidade de Coimbra

Manuel Pérez Cota, Universidad de Vigo



Scientific Committee

Manuel Pérez Cota, Universidad de Vigo (Chair)

Adolfo Lozano Tello, Universidad de Extremadura

Álvaro Rocha, Universidade de Coimbra

Ana Amélia Carvalho, Universidade de Coimbra

Ana Maria Ramalho Correia, Nova Information Management School

Antonio Garcia Loureiro, Universidad de Santiago de Compostela

António Lucas Soares, Universidade do Porto, FEUP

Arnaldo Martins, Universidade de Aveiro

Bráulio Alturas, Instituto Universitário de Lisboa (ISCTE-IUL)

Carlos Ferrás Sexto, Universidad de Santiago de Compostela

David Fonseca, La Salle, Universitat Ramon Llull

Ernest Redondo, Universidad Politécnica de Catalunya

Francisco Restivo, Universidade Católica Portuguesa

Gonçalo Paiva Dias, Universidade de Aveiro

Gonzalo Cuevas Agustin, Universidad Politécnica de Madrid

João Paulo Costa, Universidade de Coimbra

José Borbinha, INESC-ID, IST, Universidade de Lisboa

José Valença, Universidade do Minho

Jose Antonio Calvo-Manzano Villalón, Universidad Politécnica de Madrid

Juan Hernádez, Universidad de Extremadura

Luis Camarinha-Matos, Universidade Nova de Lisboa

Luís Paulo Reis, Universidade do Minho

Marco Painho, Nova Information Management School

Mário Piattini, Universidad de Castilla-La Mancha

Nelson Pacheco Rocha, Universidade de Aveiro

Ramiro Gonçalves, Universidade de Trás-os-Montes e Alto Douro





Doctoral Symposium webpage: http://cisti.eu/index.php?option=com_content&view=article&id=35&Itemid=119&lang=en


Kind regards,

CISTI'2018 Team
http://www.cisti.eu/


---
This email has been checked for viruses by AVG.
http://www.avg.com

[-- Attachment #1.2: Type: text/html, Size: 7685 bytes --]

[-- Attachment #2: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Alexander Duyck @ 2018-01-28 20:18 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: Alexander Duyck, virtio-dev, Michael S. Tsirkin, Jakub Kicinski,
	Netdev, virtualization, Siwei Liu, David Miller
In-Reply-To: <f6737111-0015-007c-336b-c8c2793ee590@intel.com>

On Sun, Jan 28, 2018 at 11:18 AM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
> On 1/28/2018 9:35 AM, Alexander Duyck wrote:
>>
>> On Fri, Jan 26, 2018 at 9:58 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>>>
>>> On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
>>>>>>
>>>>>> 3 netdev model breaks this configuration starting with the creation
>>>>>> and naming of the 2 devices to udev needing to be aware of master and
>>>>>> slave virtio-net devices.
>>>>>
>>>>> I don't understand this comment.  There is one virtio-net device and
>>>>> one "virtio-bond" netdev.  And user space has to be aware of the
>>>>> special
>>>>> automatic arrangement anyway, because it can't touch the VF.  It
>>>>> doesn't make any difference whether it ignores the VF or PV and VF.
>>>>> It simply can't touch the slaves, no matter how many there are.
>>>>
>>>> If the userspace is not expected to touch the slaves, then why do we
>>>> need to
>>>> take extra effort to expose a netdev that is just not really useful.
>>>
>>> You said:
>>> "[user space] needs to be aware of master and slave virtio-net devices."
>>>
>>> I'm saying:
>>> It has to be aware of the special arrangement whether there is an
>>> explicit bond netdev or not.
>>
>> To clarify here the kernel should be aware that there is a device that
>> is an aggregate of 2 other devices. It isn't as if we need to insert
>> the new device "above" the virtio.
>>
>> I have been doing a bit of messing around with a few ideas and it
>> seems like it would be better if we could replace the virtio interface
>> with the virtio-bond, renaming my virt-bond concept to this since it
>> is now supposedly going to live in the virtio driver, interface, and
>> then push the original virtio down one layer and call it a
>> virtio-backup. If I am not mistaken we could assign the same dev
>> pointer used by the virtio netdev to the virtio-bond, and if we
>> register it first with the "eth%d" name then udev will assume that the
>> virtio-bond device is the original virtio and all existing scripts
>> should just work with that. We then would want to change the name of
>> the virtio interface with the backup feature bit set, maybe call it
>> something like bkup-00:00:00 where the 00:00:00 would be the last 3
>> octets of the MAC address. It should solve the issue of inserting an
>> interface "above" the virtio by making the virtio-bond become the
>> virtio. The only limitation is that we will probably need to remove
>> the back-up if the virtio device is removed, however that was already
>> a limitation of this solution and others like the netvsc solution
>> anyway.
>
>
> With 3 netdev model, if we make the the master virtio-net associated with
> the
> real virtio pci device,  i think  the userspace scripts would not break.
> If we go this route, i am still not clear on the purpose of exposing the
> bkup netdev.
> Can we start with the 2 netdev model and move to 3 netdev model later if we
> find out that there are limitiations with the 2 netdev model? I don't think
> this will
> break any user API as the userspace is not expected to use the bkup netdev.

The 2 netdev model breaks a large number of expectations of user
space. Among them is XDP since we cannot guarantee a symmetric setup
between any entity and the virtio. How does it make sense that
enabling XDP on virtio shows zero Rx packets, and in the meantime you
are getting all of the packets coming in off of the VF?

In addition we would need to rewrite the VLAN and MAC address
filtering ndo operations since we likely cannot add any VLANs since in
most cases VFs are VLAN locked due to things like port VLAN and we
cannot change the MAC address since the whole bonding concept is built
around it.

The last bit is the netpoll packet routing which the current code
assumes is using the virtio only, but I don't know if that is a valid
assumption since the VF is expected to be the default route for
everything else when it is available.

The point is by the time you are done you will have rewritten pretty
much all the network device ops. With that being the case why add all
the code to virtio itself instead of just coming up with a brand new
set of ndo_ops that belong to this new device, and you could leave the
existing virtio code in place and save yourself a bunch of time by
just accessing it as an existing call as a separate netdev.

>>>>>> Also, from a user experience point of view, loading a virtio-net with
>>>>>> BACKUP feature enabled will now show 2 virtio-net netdevs.
>>>>>
>>>>> One virtio-net and one virtio-bond, which represents what's happening.
>>>>
>>>> This again assumes that we want to represent a bond setup. Can't we
>>>> treat this
>>>> as virtio-net providing an alternate low-latency datapath by taking over
>>>> VF datapath?
>>>
>>> Bond is just a familiar name, we can call it something else if you
>>> prefer.  The point is there are two data paths which can have
>>> independent low-level settings and a higher level entity with
>>> global settings which represents any path to the outside world.
>>>
>>> Hiding low-level netdevs from a lay user requires a generic solution.
>>
>> The last thing I think we should be doing is hiding the low level
>> netdevs. It doesn't solve anythying. We are already trusting the owner
>> of the VM enough to let them have root access of the VM. That means
>> they can load/unload any driver they want. They don't have to use the
>> kernel provided virtio driver, they could load their own. They could
>> even do something like run DPDK on top of it, or they could run DPDK
>> on top of the VF. In either case there is no way the bond would ever
>> be created and all hiding devices does is make it easier to fix
>> problems when something gets broken. Unless I am mistaken, and
>> "security through obscurity" has somehow become a valid security
>> model.
>>
>> As I mentioned to Sridhar on an off-list thread I think the goal
>> should be to make it so that the user wants to use the virtio-bond,
>> not make it so that they have no choice but to use it. The idea is we
>> should be making things easier for the administrator of the VM, not
>> harder.
>>
>>
> When the hypervisor has enabled BACKUP bit along with a VF with the same
> MAC address, the VM can use either VF only OR extended virtio with attached
> VF.
> Although there are 2 datapaths to the device, the hypervisor will enable
> only
> one at any time.  The virtio via PF datapath is only enabled during live
> migration
> when the VF is unplugged. If not, VF broadcasts/multicasts will get looped
> back
> to the VM via the PF datapath. In the RFC path i was assuming that the VM
> can
> use both datapaths and i had broadcasts/multicasts going over virtio
> datapath,
> but i don' think it is a good idea.  It requires the device to disable
> broadcast
> replication on the VFs.
>
> Thanks
> Sridhar

This would work well for the SwitchDev model, but not so well for the
legacy model supported by the Intel drivers. With SwitchDev the
broadcasts/multicasts are coming in on the upllink port representor
anyway is my understanding of how some of the implementations out
there are configured.

For now though we can't assume the logic in the path. We can do that
later when we start looking at v2 which would likely have us spawn a
new device that acts as some sort of smart PCI bridge/PCIe switch that
can then have that device assigned to the virtio-bond/bridge and can
hopefully start taking care of things like providing topology/switch
information and maybe incorporate some DMA tracking on the VF which
would probably be v3.

In the meantime for now I would say we keep this simple. We create a
virtio-bond device that does a bit of identity theft on the existing
virtio by mapping the same device, pushes the existing virtio to a
different name to hide it from udev to avoid naming confusion. We make
the device as dumb as can be and don't let it do any L2 configuration
such as assigning a MAC address or VLAN filter, we could probably flag
it as VLAN challenged. The XDP problem solves itself by us not
exposing any bpf or XDP operations. Then all that is left is dealing
with netpoll which if needed we can probably borrow the existing
approach from the bonding driver to solve.

Anyway that is the direction I see this going in.

Thanks.

- Alex

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Samudrala, Sridhar @ 2018-01-28 21:01 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Alexander Duyck, virtio-dev, Michael S. Tsirkin, Jakub Kicinski,
	Netdev, virtualization, Siwei Liu, David Miller
In-Reply-To: <CAKgT0UdaKtPe6982TuuGbxhhVgeehwS1aAp=s4sok2vKD6wMJg@mail.gmail.com>



On 1/28/2018 12:18 PM, Alexander Duyck wrote:
> On Sun, Jan 28, 2018 at 11:18 AM, Samudrala, Sridhar
> <sridhar.samudrala@intel.com> wrote:
>> On 1/28/2018 9:35 AM, Alexander Duyck wrote:
>>> On Fri, Jan 26, 2018 at 9:58 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>>>> On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
>>>>>>> 3 netdev model breaks this configuration starting with the creation
>>>>>>> and naming of the 2 devices to udev needing to be aware of master and
>>>>>>> slave virtio-net devices.
>>>>>> I don't understand this comment.  There is one virtio-net device and
>>>>>> one "virtio-bond" netdev.  And user space has to be aware of the
>>>>>> special
>>>>>> automatic arrangement anyway, because it can't touch the VF.  It
>>>>>> doesn't make any difference whether it ignores the VF or PV and VF.
>>>>>> It simply can't touch the slaves, no matter how many there are.
>>>>> If the userspace is not expected to touch the slaves, then why do we
>>>>> need to
>>>>> take extra effort to expose a netdev that is just not really useful.
>>>> You said:
>>>> "[user space] needs to be aware of master and slave virtio-net devices."
>>>>
>>>> I'm saying:
>>>> It has to be aware of the special arrangement whether there is an
>>>> explicit bond netdev or not.
>>> To clarify here the kernel should be aware that there is a device that
>>> is an aggregate of 2 other devices. It isn't as if we need to insert
>>> the new device "above" the virtio.
>>>
>>> I have been doing a bit of messing around with a few ideas and it
>>> seems like it would be better if we could replace the virtio interface
>>> with the virtio-bond, renaming my virt-bond concept to this since it
>>> is now supposedly going to live in the virtio driver, interface, and
>>> then push the original virtio down one layer and call it a
>>> virtio-backup. If I am not mistaken we could assign the same dev
>>> pointer used by the virtio netdev to the virtio-bond, and if we
>>> register it first with the "eth%d" name then udev will assume that the
>>> virtio-bond device is the original virtio and all existing scripts
>>> should just work with that. We then would want to change the name of
>>> the virtio interface with the backup feature bit set, maybe call it
>>> something like bkup-00:00:00 where the 00:00:00 would be the last 3
>>> octets of the MAC address. It should solve the issue of inserting an
>>> interface "above" the virtio by making the virtio-bond become the
>>> virtio. The only limitation is that we will probably need to remove
>>> the back-up if the virtio device is removed, however that was already
>>> a limitation of this solution and others like the netvsc solution
>>> anyway.
>>
>> With 3 netdev model, if we make the the master virtio-net associated with
>> the
>> real virtio pci device,  i think  the userspace scripts would not break.
>> If we go this route, i am still not clear on the purpose of exposing the
>> bkup netdev.
>> Can we start with the 2 netdev model and move to 3 netdev model later if we
>> find out that there are limitiations with the 2 netdev model? I don't think
>> this will
>> break any user API as the userspace is not expected to use the bkup netdev.
> The 2 netdev model breaks a large number of expectations of user
> space. Among them is XDP since we cannot guarantee a symmetric setup
> between any entity and the virtio. How does it make sense that
> enabling XDP on virtio shows zero Rx packets, and in the meantime you
> are getting all of the packets coming in off of the VF?
Sure we cannot support XDP in this model and it needs to be disabled.
>
> In addition we would need to rewrite the VLAN and MAC address
> filtering ndo operations since we likely cannot add any VLANs since in
> most cases VFs are VLAN locked due to things like port VLAN and we
> cannot change the MAC address since the whole bonding concept is built
> around it.
>
> The last bit is the netpoll packet routing which the current code
> assumes is using the virtio only, but I don't know if that is a valid
> assumption since the VF is expected to be the default route for
> everything else when it is available.
>
> The point is by the time you are done you will have rewritten pretty
> much all the network device ops. With that being the case why add all
> the code to virtio itself instead of just coming up with a brand new
> set of ndo_ops that belong to this new device, and you could leave the
> existing virtio code in place and save yourself a bunch of time by
> just accessing it as an existing call as a separate netdev.

When the BACKUP feature is enabled, we can simply disable most of these 
ndo ops
that cannot be supported. Not sure we need an additional netdev and ndo_ops.

When we can support all these usecases along with live migration we can move
to the 3 netdev model and i think we will need a new feature bit so that the
hypervisor can allow VM to use both datapaths and configure PF accordingly.

Thanks
Sridhar

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Stephen Hemminger @ 2018-01-28 23:02 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Alexander Duyck, virtio-dev, Michael S. Tsirkin,
	Samudrala, Sridhar, virtualization, Siwei Liu, Netdev,
	David Miller
In-Reply-To: <20180126183003.591cd5c5@cakuba.netronome.com>

On Fri, 26 Jan 2018 18:30:03 -0800
Jakub Kicinski <kubakici@wp.pl> wrote:

> On Fri, 26 Jan 2018 15:30:35 -0800, Samudrala, Sridhar wrote:
> > On 1/26/2018 2:47 PM, Jakub Kicinski wrote:  
> > > On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:    
> > >> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:    
> > >>>> and the VM is not expected to do any tuning/optimizations on the VF driver
> > >>>> directly,
> > >>>> i think the current patch that follows the netvsc model of 2 netdevs(virtio
> > >>>> and vf) should
> > >>>> work fine.    
> > >>> OK. For your use case that's fine. But that's too specific scenario
> > >>> with lots of restrictions IMHO, perhaps very few users will benefit
> > >>> from it, I'm not sure. If you're unwilling to move towards it, we'd
> > >>> take this one and come back with a generic solution that is able to
> > >>> address general use cases for VF/PT live migration .    
> > >> I think that's a fine approach. Scratch your own itch!  I imagine a very
> > >> generic virtio-switchdev providing host routing info to guests could
> > >> address lots of usecases. A driver could bind to that one and enslave
> > >> arbitrary other devices.  Sounds reasonable.
> > >>
> > >> But given the fundamental idea of a failover was floated at least as
> > >> early as 2013, and made 0 progress since precisely because it kept
> > >> trying to address more and more features, and given netvsc is already
> > >> using the basic solution with some success, I'm not inclined to block
> > >> this specific effort waiting for the generic one.    
> > > I think there is an agreement that the extra netdev will be useful for
> > > more advanced use cases, and is generally preferable.  What is the
> > > argument for not doing that from the start?  If it was made I must have
> > > missed it.  Is it just unwillingness to write the extra 300 lines of
> > > code?  Sounds like a pretty weak argument when adding kernel ABI is at
> > > stake...    
> > 
> > I am still not clear on the need for the extra netdev created by 
> > virtio_net. The only advantage i can see is that the stats can be
> > broken between VF and virtio datapaths compared to the aggregrated
> > stats on virtio netdev as seen with the 2 netdev approach.  
> 
> Maybe you're not convinced but multiple arguments were made.
> 
> > With 2 netdev model, any VM image that has a working network 
> > configuration will transparently get VF based acceleration without
> > any changes.   
> 
> Nothing happens transparently.  Things may happen automatically.  The
> VF netdev doesn't disappear with netvsc.  The PV netdev transforms into
> something it did not use to be.  And configures and reports some
> information from the PV (e.g. speed) but PV doesn't pass traffic any
> longer.
> 
> > 3 netdev model breaks this configuration starting with the creation
> > and naming of the 2 devices to udev needing to be aware of master and
> > slave virtio-net devices.  
> 
> I don't understand this comment.  There is one virtio-net device and
> one "virtio-bond" netdev.  And user space has to be aware of the special
> automatic arrangement anyway, because it can't touch the VF.  It
> doesn't make any difference whether it ignores the VF or PV and VF.
> It simply can't touch the slaves, no matter how many there are.
> 
> > Also, from a user experience point of view, loading a virtio-net with
> > BACKUP feature enabled will now show 2 virtio-net netdevs.  
> 
> One virtio-net and one virtio-bond, which represents what's happening.
> 
> > For live migration with advanced usecases that Siwei is suggesting, i 
> > think we need a new driver with a new device type that can track the
> > VF specific feature settings even when the VF driver is unloaded.  

I see no added value of the 3 netdev model, there is no need for a bond
device.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Alexander Duyck @ 2018-01-29  0:58 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: Alexander Duyck, virtio-dev, Michael S. Tsirkin, Jakub Kicinski,
	Netdev, virtualization, Siwei Liu, David Miller
In-Reply-To: <b6bf20f8-8881-4d25-db4c-24f93d5e6cba@intel.com>

On Sun, Jan 28, 2018 at 1:01 PM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
>
>
> On 1/28/2018 12:18 PM, Alexander Duyck wrote:
>>
>> On Sun, Jan 28, 2018 at 11:18 AM, Samudrala, Sridhar
>> <sridhar.samudrala@intel.com> wrote:
>>>
>>> On 1/28/2018 9:35 AM, Alexander Duyck wrote:
>>>>
>>>> On Fri, Jan 26, 2018 at 9:58 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>>>>>
>>>>> On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
>>>>>>>>
>>>>>>>> 3 netdev model breaks this configuration starting with the creation
>>>>>>>> and naming of the 2 devices to udev needing to be aware of master
>>>>>>>> and
>>>>>>>> slave virtio-net devices.
>>>>>>>
>>>>>>> I don't understand this comment.  There is one virtio-net device and
>>>>>>> one "virtio-bond" netdev.  And user space has to be aware of the
>>>>>>> special
>>>>>>> automatic arrangement anyway, because it can't touch the VF.  It
>>>>>>> doesn't make any difference whether it ignores the VF or PV and VF.
>>>>>>> It simply can't touch the slaves, no matter how many there are.
>>>>>>
>>>>>> If the userspace is not expected to touch the slaves, then why do we
>>>>>> need to
>>>>>> take extra effort to expose a netdev that is just not really useful.
>>>>>
>>>>> You said:
>>>>> "[user space] needs to be aware of master and slave virtio-net
>>>>> devices."
>>>>>
>>>>> I'm saying:
>>>>> It has to be aware of the special arrangement whether there is an
>>>>> explicit bond netdev or not.
>>>>
>>>> To clarify here the kernel should be aware that there is a device that
>>>> is an aggregate of 2 other devices. It isn't as if we need to insert
>>>> the new device "above" the virtio.
>>>>
>>>> I have been doing a bit of messing around with a few ideas and it
>>>> seems like it would be better if we could replace the virtio interface
>>>> with the virtio-bond, renaming my virt-bond concept to this since it
>>>> is now supposedly going to live in the virtio driver, interface, and
>>>> then push the original virtio down one layer and call it a
>>>> virtio-backup. If I am not mistaken we could assign the same dev
>>>> pointer used by the virtio netdev to the virtio-bond, and if we
>>>> register it first with the "eth%d" name then udev will assume that the
>>>> virtio-bond device is the original virtio and all existing scripts
>>>> should just work with that. We then would want to change the name of
>>>> the virtio interface with the backup feature bit set, maybe call it
>>>> something like bkup-00:00:00 where the 00:00:00 would be the last 3
>>>> octets of the MAC address. It should solve the issue of inserting an
>>>> interface "above" the virtio by making the virtio-bond become the
>>>> virtio. The only limitation is that we will probably need to remove
>>>> the back-up if the virtio device is removed, however that was already
>>>> a limitation of this solution and others like the netvsc solution
>>>> anyway.
>>>
>>>
>>> With 3 netdev model, if we make the the master virtio-net associated with
>>> the
>>> real virtio pci device,  i think  the userspace scripts would not break.
>>> If we go this route, i am still not clear on the purpose of exposing the
>>> bkup netdev.
>>> Can we start with the 2 netdev model and move to 3 netdev model later if
>>> we
>>> find out that there are limitiations with the 2 netdev model? I don't
>>> think
>>> this will
>>> break any user API as the userspace is not expected to use the bkup
>>> netdev.
>>
>> The 2 netdev model breaks a large number of expectations of user
>> space. Among them is XDP since we cannot guarantee a symmetric setup
>> between any entity and the virtio. How does it make sense that
>> enabling XDP on virtio shows zero Rx packets, and in the meantime you
>> are getting all of the packets coming in off of the VF?
>
> Sure we cannot support XDP in this model and it needs to be disabled.
>>
>>
>> In addition we would need to rewrite the VLAN and MAC address
>> filtering ndo operations since we likely cannot add any VLANs since in
>> most cases VFs are VLAN locked due to things like port VLAN and we
>> cannot change the MAC address since the whole bonding concept is built
>> around it.
>>
>> The last bit is the netpoll packet routing which the current code
>> assumes is using the virtio only, but I don't know if that is a valid
>> assumption since the VF is expected to be the default route for
>> everything else when it is available.
>>
>> The point is by the time you are done you will have rewritten pretty
>> much all the network device ops. With that being the case why add all
>> the code to virtio itself instead of just coming up with a brand new
>> set of ndo_ops that belong to this new device, and you could leave the
>> existing virtio code in place and save yourself a bunch of time by
>> just accessing it as an existing call as a separate netdev.
>
>
> When the BACKUP feature is enabled, we can simply disable most of these ndo
> ops
> that cannot be supported. Not sure we need an additional netdev and ndo_ops.

The point is feature bits have to be changed, and network
device/ethtool ops have to be rewritten. At that point you have
essentially forked the virtio interface. What I don't understand is
what your resistance is to doing this as 3 netdevs? I thought the
original issue was the fact that the bond-like interface would come up
with a different name/ID. I think we have a solution for that at this
point. I think the bigger issue is that this 2 netdev approach is far
to invasive to be done in the virtio driver itself and would require
massive changes to it. It makes more sense to just put together a
small new netdev that basically does the "dumb-bond" solution.

> When we can support all these usecases along with live migration we can move
> to the 3 netdev model and i think we will need a new feature bit so that the
> hypervisor can allow VM to use both datapaths and configure PF accordingly.

Right now you aren't supporting any of the use cases you claim to. The
code as it stands is full of corner cases. I have already pointed out
several holes in the code that were overlooked that are going to sink
the whole concept if we were to try to push something like this into
production.

On Thursday I tried seeing how hard it would be to make bonding fit
into the model we need. The bonding driver is carrying a TON of extra
bloat and technical debt that we don't need at this point since it has
all the MII/ARP code, sysfs, procfs, debugfs, notifiers, and modes
that we don't want. Trying to cut out that code and make something
that would be usable for this is probably going to be too expensive in
the long run. In the meantime though I think I have developed an
appreciation of where both solutions are currently lacking.

I think implementing another netdev at this point is the only way to
go forward, otherwise what is going to happen is that virtio is going
to quickly bloat up with a bunch of code that has to disable almost
everything in it if BACKUP is enabled as a feature. So here are the
list of netdev ops for virtio:
        .ndo_open            = virtnet_open,
        .ndo_stop            = virtnet_close,

I assume these first 2 will need some changes. I suspect there would
be threads responsible for doing things like maintaining the link
statistics that need to be collected periodically. Also I don't know
if you want to keep the slaves up if the master is up or not. In
addition I don't see any code in your current solution that calls
dev_open. I am assuming something was probably bringing up the
interface for you and then you enslaved it instead of the virtio
interface bringing up the interface itself.

        .ndo_start_xmit      = start_xmit,

Your original patch included a small set of changes for this function.
One thing you appear to have overlooked was the fact that
qdisc_skb_cb(skb)->slave_dev_queue_mapping relies on ndo_select_queue.
It means you are missing at least one new ndo_op that is not currently
provided by virtio. In addition we may want to look at providing a
change of the queue mapping for the combined interface since the VF
may support more queues than the virtio currently does. In my mind we
probably don't even need a qdisc for the upper level device anyway. We
could probably improve our performance if we could make the device not
have a qdisc and even more if we could make it run with LLTX since the
lower device has both covered for us already.

        .ndo_validate_addr   = eth_validate_addr,
        .ndo_set_mac_address = virtnet_set_mac_address,
        .ndo_set_rx_mode     = virtnet_set_rx_mode,

I already mentioned most of the L2 address/filtering code would
probably have to go since the VF drivers aren't usually allowed to do
things like go into full promiscuous or change their MAC address
anyway, and we don't want the address of a MAC address based bond to
be changed.

        .ndo_get_stats64     = virtnet_stats,

You already modified this in your first rev of the patch.  Your
solution is okay I suppose, though it might be more interesting if you
could just provide a merged version of the stats of the 2 interfaces
instead of just capturing some of the Tx/Rx stats.

        .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,

These can both go since we could assume the combined interface is more
than likely "VLAN challenged" due to the fact that the primary use
case is VF migration. Otherwise we need to be able to guarantee that
any VLAN added to the interface is present on both the VF and the
virtio. Since that is extra work just calling the interface "VLAN
challenged" for now is easier.

#ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller = virtnet_netpoll,
#endif

The polling logic needs to be updated so that the polling follows the
Tx packet path instead of just assuming everything is running through
virtio since a backup interface may or may not be capable of
transmitting packets when not in use.

        .ndo_bpf                = virtnet_xdp,
        .ndo_xdp_xmit           = virtnet_xdp_xmit,
        .ndo_xdp_flush          = virtnet_xdp_flush,

This whole block would have to be dropped for the combined interface.
That way we will at least get symmetric behavior between a VF and the
virtio data paths.

        .ndo_features_check     = passthru_features_check,

This part can mostly stay, but as I said the feature bits are going to
have to significantly change for the interface that is sitting on top
of the VF and virtio since the feature functionality likely differs
quite a bit.



I would go through the ethtool ops, but the fact is most of them just
don't apply. All bond implements is the get_drvinfo, get_link, and
get_link_ksettings and that is probably all that would be needed for
the virtio-bond interface.



Hopefully that clarifies things. In my mind the 2 netdev approach
would be the approach to consider for later. For now it is probably
easier to just have the virtio-bond, virtio-backup, and the VF all
present as it makes it easier to get away with just reusing existing
code since all the needed ops are exposed by the netdevs. Hiding them
is going to make things more difficult for debugging so I am strongly
opposed to hiding things.

I realize I told you I was willing to disagree and commit on this, but
after spending a full day reviewing the solution proposed and spending
some time crawling through the bonding code to get a better grasp of
things I am going to have to say the 2 netdev approach just isn't
feasible. We need to have 3 separate netdevs exposing 3 separate sets
of ndo/ethtool ops in order for things to work correctly. Anything
else is going to end up severely impacting features and/or
performance.

- Alex

^ permalink raw reply

* Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Jason Wang @ 2018-01-29  3:32 UTC (permalink / raw)
  To: Samudrala, Sridhar, mst, stephen, davem, netdev, virtualization,
	virtio-dev, jesse.brandeburg, alexander.h.duyck, kubakici
In-Reply-To: <f6d80eca-8f95-861c-6717-7df53af870fe@intel.com>



On 2018年01月24日 00:03, Samudrala, Sridhar wrote:
>
>
> On 1/23/2018 2:33 AM, Jason Wang wrote:
>>
>>
>> On 2018年01月12日 13:58, Sridhar Samudrala wrote:
>>>   static netdev_tx_t start_xmit(struct sk_buff *skb, struct 
>>> net_device *dev)
>>>   {
>>>       struct virtnet_info *vi = netdev_priv(dev);
>>>       int qnum = skb_get_queue_mapping(skb);
>>>       struct send_queue *sq = &vi->sq[qnum];
>>> +    struct net_device *vf_netdev;
>>>       int err;
>>>       struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>>>       bool kick = !skb->xmit_more;
>>>       bool use_napi = sq->napi.weight;
>>>   +    /* If VF is present and up then redirect packets
>>> +     * called with rcu_read_lock_bh
>>> +     */
>>> +    vf_netdev = rcu_dereference_bh(vi->vf_netdev);
>>> +    if (vf_netdev && netif_running(vf_netdev) &&
>>> +        !netpoll_tx_running(dev) &&
>>> +        is_unicast_ether_addr(eth_hdr(skb)->h_dest))
>>> +        return virtnet_vf_xmit(dev, vf_netdev, skb);
>>> +
>>
>> A question here.
>>
>> If I read the code correctly, all features were validated against 
>> virtio instead VF before transmitting. This assumes VF's feature is a 
>> superset of virtio, does this really work? Do we need to sanitize the 
>> feature before joining? (e.g at last NETIF_R_GSO_ROBUST needs to be 
>> removed).
>
> Actually, virtnet_vf_xmit() calls dev_queue_xmit() after updating 
> skb->dev to vf netdev.
> So the features get validated against VF features and the right tx 
> queue is selected
> before the real transmit.

I see, the the packet will go through qdiscs twice which is suboptimal. 
And the device may lose some ability of VF like tunnel GSO.

Thanks

>
> Thanks
> Sridhar
>
>
>

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [virtio-dev] Re: [RFC PATCH net-next v2 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Alexander Duyck @ 2018-01-29  4:26 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Alexander Duyck, virtio-dev, Michael S. Tsirkin, Jakub Kicinski,
	Samudrala, Sridhar, virtualization, Siwei Liu, Netdev,
	David Miller
In-Reply-To: <20180128150237.35237d84@xeon-e3>

On Sun, Jan 28, 2018 at 3:02 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Fri, 26 Jan 2018 18:30:03 -0800
> Jakub Kicinski <kubakici@wp.pl> wrote:
>
>> On Fri, 26 Jan 2018 15:30:35 -0800, Samudrala, Sridhar wrote:
>> > On 1/26/2018 2:47 PM, Jakub Kicinski wrote:
>> > > On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:
>> > >> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:
>> > >>>> and the VM is not expected to do any tuning/optimizations on the VF driver
>> > >>>> directly,
>> > >>>> i think the current patch that follows the netvsc model of 2 netdevs(virtio
>> > >>>> and vf) should
>> > >>>> work fine.
>> > >>> OK. For your use case that's fine. But that's too specific scenario
>> > >>> with lots of restrictions IMHO, perhaps very few users will benefit
>> > >>> from it, I'm not sure. If you're unwilling to move towards it, we'd
>> > >>> take this one and come back with a generic solution that is able to
>> > >>> address general use cases for VF/PT live migration .
>> > >> I think that's a fine approach. Scratch your own itch!  I imagine a very
>> > >> generic virtio-switchdev providing host routing info to guests could
>> > >> address lots of usecases. A driver could bind to that one and enslave
>> > >> arbitrary other devices.  Sounds reasonable.
>> > >>
>> > >> But given the fundamental idea of a failover was floated at least as
>> > >> early as 2013, and made 0 progress since precisely because it kept
>> > >> trying to address more and more features, and given netvsc is already
>> > >> using the basic solution with some success, I'm not inclined to block
>> > >> this specific effort waiting for the generic one.
>> > > I think there is an agreement that the extra netdev will be useful for
>> > > more advanced use cases, and is generally preferable.  What is the
>> > > argument for not doing that from the start?  If it was made I must have
>> > > missed it.  Is it just unwillingness to write the extra 300 lines of
>> > > code?  Sounds like a pretty weak argument when adding kernel ABI is at
>> > > stake...
>> >
>> > I am still not clear on the need for the extra netdev created by
>> > virtio_net. The only advantage i can see is that the stats can be
>> > broken between VF and virtio datapaths compared to the aggregrated
>> > stats on virtio netdev as seen with the 2 netdev approach.
>>
>> Maybe you're not convinced but multiple arguments were made.
>>
>> > With 2 netdev model, any VM image that has a working network
>> > configuration will transparently get VF based acceleration without
>> > any changes.
>>
>> Nothing happens transparently.  Things may happen automatically.  The
>> VF netdev doesn't disappear with netvsc.  The PV netdev transforms into
>> something it did not use to be.  And configures and reports some
>> information from the PV (e.g. speed) but PV doesn't pass traffic any
>> longer.
>>
>> > 3 netdev model breaks this configuration starting with the creation
>> > and naming of the 2 devices to udev needing to be aware of master and
>> > slave virtio-net devices.
>>
>> I don't understand this comment.  There is one virtio-net device and
>> one "virtio-bond" netdev.  And user space has to be aware of the special
>> automatic arrangement anyway, because it can't touch the VF.  It
>> doesn't make any difference whether it ignores the VF or PV and VF.
>> It simply can't touch the slaves, no matter how many there are.
>>
>> > Also, from a user experience point of view, loading a virtio-net with
>> > BACKUP feature enabled will now show 2 virtio-net netdevs.
>>
>> One virtio-net and one virtio-bond, which represents what's happening.
>>
>> > For live migration with advanced usecases that Siwei is suggesting, i
>> > think we need a new driver with a new device type that can track the
>> > VF specific feature settings even when the VF driver is unloaded.
>
> I see no added value of the 3 netdev model, there is no need for a bond
> device.

I agree a full-blown bond isn't what is needed. However, just forking
traffic out from virtio to a VF doesn't really solve things either.

One of the issues as I see it is the fact that the qdisc model with
the merged device gets to be pretty ugly. There is the fact that every
packet that goes to the VF has to pass through the qdisc code twice.
There is the dual nature of the 2 netdev solution that also introduces
the potential for head-of-line blocking since the virtio could put
back pressure on the upper qdisc layer which could stall the VF
traffic when switching over. I hope we could avoid issues like that by
maintaining qdiscs per device queue instead of on an upper device that
is half software interface and half not. Ideally the virtio-bond
device could operate without a qdisc and without needing any
additional locks so there shouldn't be head of line blocking occurring
between the two interfaces and overhead could be kept minimal.

Also in the case of virtio there is support for in-driver XDP. As
Sridhar stated, when using the 2 netdev model "we cannot support XDP
in this model and it needs to be disabled". That sounds like a step
backwards instead of forwards. I would much rather leave the XDP
enabled at the lower dev level, and then if we want we can use the
generic XDP at the virtio-bond level to capture traffic on both
interfaces at the same time.

In the case of netvsc you have control of both sides of a given link
so you can match up the RSS tables, queue configuration and everything
is somewhat symmetric since you are running the PF and all the HyperV
subchannels. Most of the complexity is pushed down into the host and
your subchannel management is synchronized there if I am not mistaken.
We don't have this in the case of this virtio-bond setup. Instead a
single bit is set indicating "backup" without indicating what that
means to topology other than the fact that this virtio interface is
the backup for some other interface. We are essentially blind other
than having the link status for the VF and virtio and knowing that the
virtio is intended to be the backup.

We might be able to get to a 2 or maybe even a 1 netdev solution at
some point in the future, but  I don't think that time is now. For now
a virtio-bond type solution would allow us to address most of the use
cases with minimal modification to the existing virtio and can deal
with feature and/or resource asymmetry.

- Alex

^ permalink raw reply

* Re: [PATCH net] vhost_net: stop device during reset owner
From: David Miller @ 2018-01-29 17:35 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, virtualization, linux-kernel, mst
In-Reply-To: <1516889032-23510-1-git-send-email-jasowang@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Thu, 25 Jan 2018 22:03:52 +0800

> We don't stop device before reset owner, this means we could try to
> serve any virtqueue kick before reset dev->worker. This will result a
> warn since the work was pending at llist during owner resetting. Fix
> this by stopping device during owner reset.
> 
> Reported-by: syzbot+eb17c6162478cc50632c@syzkaller.appspotmail.com
> Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server")
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Applied and queued up for -stable.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox