* [RFC PATCH v3 3/3] virtio_net: Enable alternate datapath without creating an additional netdev
From: Sridhar Samudrala @ 2018-02-16 18:11 UTC (permalink / raw)
To: mst, stephen, davem, netdev, virtualization, virtio-dev,
jesse.brandeburg, alexander.h.duyck, kubakici, sridhar.samudrala,
jasowang, loseweigh
In-Reply-To: <1518804682-16881-1-git-send-email-sridhar.samudrala@intel.com>
This patch addresses the issues that were seen with the 3 netdev model by
avoiding the creation of an additional netdev. Instead the bypass state
information is tracked in the original netdev and a different set of
ndo_ops and ethtool_ops are used when BACKUP feature is enabled.
Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Reviewed-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
drivers/net/virtio_net.c | 283 +++++++++++++++++------------------------------
1 file changed, 101 insertions(+), 182 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 14679806c1b1..c85b2949f151 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -154,7 +154,7 @@ struct virtnet_bypass_info {
struct net_device __rcu *active_netdev;
/* virtio_net netdev */
- struct net_device __rcu *backup_netdev;
+ struct net_device *backup_netdev;
/* active netdev stats */
struct rtnl_link_stats64 active_stats;
@@ -229,8 +229,8 @@ struct virtnet_info {
unsigned long guest_offloads;
- /* upper netdev created when BACKUP feature enabled */
- struct net_device *bypass_netdev;
+ /* bypass state maintained when BACKUP feature is enabled */
+ struct virtnet_bypass_info *vbi;
};
struct padded_vnet_hdr {
@@ -2285,6 +2285,22 @@ static bool virtnet_bypass_xmit_ready(struct net_device *dev)
return netif_running(dev) && netif_carrier_ok(dev);
}
+static bool virtnet_bypass_active_ready(struct net_device *dev)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct virtnet_bypass_info *vbi = vi->vbi;
+ struct net_device *active;
+
+ if (!vbi)
+ return false;
+
+ active = rcu_dereference(vbi->active_netdev);
+ if (!active || !virtnet_bypass_xmit_ready(active))
+ return false;
+
+ return true;
+}
+
static void virtnet_config_changed_work(struct work_struct *work)
{
struct virtnet_info *vi =
@@ -2312,7 +2328,7 @@ static void virtnet_config_changed_work(struct work_struct *work)
virtnet_update_settings(vi);
netif_carrier_on(vi->dev);
netif_tx_wake_all_queues(vi->dev);
- } else {
+ } else if (!virtnet_bypass_active_ready(vi->dev)) {
netif_carrier_off(vi->dev);
netif_tx_stop_all_queues(vi->dev);
}
@@ -2501,7 +2517,8 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
if (vi->has_cvq) {
vi->cvq = vqs[total_vqs - 1];
- if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
+ if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN) &&
+ !virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
}
@@ -2690,62 +2707,54 @@ virtnet_bypass_child_open(struct net_device *dev,
static int virtnet_bypass_open(struct net_device *dev)
{
- struct virtnet_bypass_info *vbi = netdev_priv(dev);
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct virtnet_bypass_info *vbi = vi->vbi;
struct net_device *child_netdev;
-
- netif_carrier_off(dev);
- netif_tx_wake_all_queues(dev);
+ int err;
child_netdev = rtnl_dereference(vbi->active_netdev);
if (child_netdev)
virtnet_bypass_child_open(dev, child_netdev);
- child_netdev = rtnl_dereference(vbi->backup_netdev);
- if (child_netdev)
- virtnet_bypass_child_open(dev, child_netdev);
+ err = virtnet_open(dev);
+ if (err < 0) {
+ dev_close(child_netdev);
+ return err;
+ }
return 0;
}
static int virtnet_bypass_close(struct net_device *dev)
{
- struct virtnet_bypass_info *vi = netdev_priv(dev);
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct virtnet_bypass_info *vbi = vi->vbi;
struct net_device *child_netdev;
- netif_tx_disable(dev);
+ virtnet_close(dev);
- child_netdev = rtnl_dereference(vi->active_netdev);
- if (child_netdev)
- dev_close(child_netdev);
+ if (!vbi)
+ goto done;
- child_netdev = rtnl_dereference(vi->backup_netdev);
+ child_netdev = rtnl_dereference(vbi->active_netdev);
if (child_netdev)
dev_close(child_netdev);
+done:
return 0;
}
-static netdev_tx_t
-virtnet_bypass_drop_xmit(struct sk_buff *skb, struct net_device *dev)
-{
- atomic_long_inc(&dev->tx_dropped);
- dev_kfree_skb_any(skb);
- return NETDEV_TX_OK;
-}
-
static netdev_tx_t
virtnet_bypass_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
- struct virtnet_bypass_info *vbi = netdev_priv(dev);
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct virtnet_bypass_info *vbi = vi->vbi;
struct net_device *xmit_dev;
/* Try xmit via active netdev followed by backup netdev */
xmit_dev = rcu_dereference_bh(vbi->active_netdev);
- if (!xmit_dev || !virtnet_bypass_xmit_ready(xmit_dev)) {
- xmit_dev = rcu_dereference_bh(vbi->backup_netdev);
- if (!xmit_dev || !virtnet_bypass_xmit_ready(xmit_dev))
- return virtnet_bypass_drop_xmit(skb, dev);
- }
+ if (!xmit_dev || !virtnet_bypass_xmit_ready(xmit_dev))
+ return start_xmit(skb, dev);
skb->dev = xmit_dev;
skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
@@ -2810,7 +2819,8 @@ static void
virtnet_bypass_get_stats(struct net_device *dev,
struct rtnl_link_stats64 *stats)
{
- struct virtnet_bypass_info *vbi = netdev_priv(dev);
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct virtnet_bypass_info *vbi = vi->vbi;
const struct rtnl_link_stats64 *new;
struct rtnl_link_stats64 temp;
struct net_device *child_netdev;
@@ -2827,12 +2837,10 @@ virtnet_bypass_get_stats(struct net_device *dev,
memcpy(&vbi->active_stats, new, sizeof(*new));
}
- child_netdev = rcu_dereference(vbi->backup_netdev);
- if (child_netdev) {
- new = dev_get_stats(child_netdev, &temp);
- virtnet_bypass_fold_stats(stats, new, &vbi->backup_stats);
- memcpy(&vbi->backup_stats, new, sizeof(*new));
- }
+ memset(&temp, 0, sizeof(temp));
+ virtnet_stats(vbi->backup_netdev, &temp);
+ virtnet_bypass_fold_stats(stats, &temp, &vbi->backup_stats);
+ memcpy(&vbi->backup_stats, &temp, sizeof(temp));
rcu_read_unlock();
@@ -2842,7 +2850,8 @@ virtnet_bypass_get_stats(struct net_device *dev,
static int virtnet_bypass_change_mtu(struct net_device *dev, int new_mtu)
{
- struct virtnet_bypass_info *vbi = netdev_priv(dev);
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct virtnet_bypass_info *vbi = vi->vbi;
struct net_device *child_netdev;
int ret = 0;
@@ -2853,15 +2862,6 @@ static int virtnet_bypass_change_mtu(struct net_device *dev, int new_mtu)
return ret;
}
- child_netdev = rcu_dereference(vbi->backup_netdev);
- if (child_netdev) {
- ret = dev_set_mtu(child_netdev, new_mtu);
- if (ret)
- netdev_err(child_netdev,
- "Unexpected failure to set mtu to %d\n",
- new_mtu);
- }
-
dev->mtu = new_mtu;
return 0;
}
@@ -2881,20 +2881,13 @@ static int
virtnet_bypass_ethtool_get_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings *cmd)
{
- struct virtnet_bypass_info *vbi = netdev_priv(dev);
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct virtnet_bypass_info *vbi = vi->vbi;
struct net_device *child_netdev;
child_netdev = rtnl_dereference(vbi->active_netdev);
- if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
- child_netdev = rtnl_dereference(vbi->backup_netdev);
- if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
- cmd->base.duplex = DUPLEX_UNKNOWN;
- cmd->base.port = PORT_OTHER;
- cmd->base.speed = SPEED_UNKNOWN;
-
- return 0;
- }
- }
+ if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev))
+ return virtnet_get_link_ksettings(dev, cmd);
return __ethtool_get_link_ksettings(child_netdev, cmd);
}
@@ -2944,14 +2937,15 @@ get_virtnet_bypass_byref(struct net_device *child_netdev)
for_each_netdev(net, dev) {
struct virtnet_bypass_info *vbi;
+ struct virtnet_info *vi;
if (dev->netdev_ops != &virtnet_bypass_netdev_ops)
continue; /* not a virtnet_bypass device */
- vbi = netdev_priv(dev);
+ vi = netdev_priv(dev);
+ vbi = vi->vbi;
- if ((rtnl_dereference(vbi->active_netdev) == child_netdev) ||
- (rtnl_dereference(vbi->backup_netdev) == child_netdev))
+ if (rtnl_dereference(vbi->active_netdev) == child_netdev)
return dev; /* a match */
}
@@ -2974,9 +2968,9 @@ static rx_handler_result_t virtnet_bypass_handle_frame(struct sk_buff **pskb)
static int virtnet_bypass_register_child(struct net_device *child_netdev)
{
+ struct net_device *dev, *active;
struct virtnet_bypass_info *vbi;
- struct net_device *dev;
- bool backup;
+ struct virtnet_info *vi;
int ret;
if (child_netdev->addr_len != ETH_ALEN)
@@ -2991,14 +2985,14 @@ static int virtnet_bypass_register_child(struct net_device *child_netdev)
if (!dev)
return NOTIFY_DONE;
- vbi = netdev_priv(dev);
- backup = (child_netdev->dev.parent == dev->dev.parent);
- if (backup ? rtnl_dereference(vbi->backup_netdev) :
- rtnl_dereference(vbi->active_netdev)) {
+ vi = netdev_priv(dev);
+ vbi = vi->vbi;
+
+ active = rtnl_dereference(vbi->active_netdev);
+ if (active) {
netdev_info(dev,
"%s attempting to join bypass dev when %s already present\n",
- child_netdev->name,
- backup ? "backup" : "active");
+ child_netdev->name, active->name);
return NOTIFY_DONE;
}
@@ -3030,7 +3024,7 @@ static int virtnet_bypass_register_child(struct net_device *child_netdev)
}
}
- /* Align MTU of child with master */
+ /* Align MTU of child with virtio */
ret = dev_set_mtu(child_netdev, dev->mtu);
if (ret) {
netdev_err(dev,
@@ -3044,15 +3038,10 @@ static int virtnet_bypass_register_child(struct net_device *child_netdev)
netdev_info(dev, "registering %s\n", child_netdev->name);
dev_hold(child_netdev);
- if (backup) {
- rcu_assign_pointer(vbi->backup_netdev, child_netdev);
- dev_get_stats(vbi->backup_netdev, &vbi->backup_stats);
- } else {
- rcu_assign_pointer(vbi->active_netdev, child_netdev);
- dev_get_stats(vbi->active_netdev, &vbi->active_stats);
- dev->min_mtu = child_netdev->min_mtu;
- dev->max_mtu = child_netdev->max_mtu;
- }
+ rcu_assign_pointer(vbi->active_netdev, child_netdev);
+ dev_get_stats(vbi->active_netdev, &vbi->active_stats);
+ dev->min_mtu = child_netdev->min_mtu;
+ dev->max_mtu = child_netdev->max_mtu;
return NOTIFY_OK;
@@ -3070,13 +3059,15 @@ static int virtnet_bypass_register_child(struct net_device *child_netdev)
static int virtnet_bypass_unregister_child(struct net_device *child_netdev)
{
struct virtnet_bypass_info *vbi;
- struct net_device *dev, *backup;
+ struct virtnet_info *vi;
+ struct net_device *dev;
dev = get_virtnet_bypass_byref(child_netdev);
if (!dev)
return NOTIFY_DONE;
- vbi = netdev_priv(dev);
+ vi = netdev_priv(dev);
+ vbi = vi->vbi;
netdev_info(dev, "unregistering %s\n", child_netdev->name);
@@ -3084,41 +3075,35 @@ static int virtnet_bypass_unregister_child(struct net_device *child_netdev)
netdev_upper_dev_unlink(child_netdev, dev);
child_netdev->flags &= ~IFF_SLAVE;
- if (child_netdev->dev.parent == dev->dev.parent) {
- RCU_INIT_POINTER(vbi->backup_netdev, NULL);
- } else {
- RCU_INIT_POINTER(vbi->active_netdev, NULL);
- backup = rtnl_dereference(vbi->backup_netdev);
- if (backup) {
- dev->min_mtu = backup->min_mtu;
- dev->max_mtu = backup->max_mtu;
- }
- }
+ RCU_INIT_POINTER(vbi->active_netdev, NULL);
+ dev->min_mtu = MIN_MTU;
+ dev->max_mtu = MAX_MTU;
dev_put(child_netdev);
+ if (!(vi->status & VIRTIO_NET_S_LINK_UP)) {
+ netif_carrier_off(dev);
+ netif_tx_stop_all_queues(dev);
+ }
+
return NOTIFY_OK;
}
static int virtnet_bypass_update_link(struct net_device *child_netdev)
{
- struct net_device *dev, *active, *backup;
- struct virtnet_bypass_info *vbi;
+ struct virtnet_info *vi;
+ struct net_device *dev;
dev = get_virtnet_bypass_byref(child_netdev);
- if (!dev || !netif_running(dev))
+ if (!dev)
return NOTIFY_DONE;
- vbi = netdev_priv(dev);
-
- active = rtnl_dereference(vbi->active_netdev);
- backup = rtnl_dereference(vbi->backup_netdev);
+ vi = netdev_priv(dev);
- if ((active && virtnet_bypass_xmit_ready(active)) ||
- (backup && virtnet_bypass_xmit_ready(backup))) {
+ if (virtnet_bypass_xmit_ready(child_netdev)) {
netif_carrier_on(dev);
netif_tx_wake_all_queues(dev);
- } else {
+ } else if (!(vi->status & VIRTIO_NET_S_LINK_UP)) {
netif_carrier_off(dev);
netif_tx_stop_all_queues(dev);
}
@@ -3169,107 +3154,41 @@ static struct notifier_block virtnet_bypass_notifier = {
static int virtnet_bypass_create(struct virtnet_info *vi)
{
- struct net_device *backup_netdev = vi->dev;
- struct device *dev = &vi->vdev->dev;
- struct net_device *bypass_netdev;
- int res;
+ struct net_device *dev = vi->dev;
+ struct virtnet_bypass_info *vbi;
- /* Alloc at least 2 queues, for now we are going with 16 assuming
- * that most devices being bonded won't have too many queues.
- */
- bypass_netdev = alloc_etherdev_mq(sizeof(struct virtnet_bypass_info),
- 16);
- if (!bypass_netdev) {
- dev_err(dev, "Unable to allocate bypass_netdev!\n");
+ vbi = kzalloc(sizeof(*vbi), GFP_KERNEL);
+ if (!vbi)
return -ENOMEM;
- }
-
- dev_net_set(bypass_netdev, dev_net(backup_netdev));
- SET_NETDEV_DEV(bypass_netdev, dev);
-
- bypass_netdev->netdev_ops = &virtnet_bypass_netdev_ops;
- bypass_netdev->ethtool_ops = &virtnet_bypass_ethtool_ops;
-
- /* Initialize the device options */
- bypass_netdev->flags |= IFF_MASTER;
- bypass_netdev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT |
- IFF_NO_QUEUE;
- bypass_netdev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
- IFF_TX_SKB_SHARING);
-
- /* don't acquire bypass netdev's netif_tx_lock when transmitting */
- bypass_netdev->features |= NETIF_F_LLTX;
-
- /* Don't allow bypass devices to change network namespaces. */
- bypass_netdev->features |= NETIF_F_NETNS_LOCAL;
-
- bypass_netdev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
- NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
- NETIF_F_HIGHDMA | NETIF_F_LRO;
-
- bypass_netdev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
- bypass_netdev->features |= bypass_netdev->hw_features;
-
- /* For now treat bypass netdev as VLAN challenged since we
- * cannot assume VLAN functionality with a VF
- */
- bypass_netdev->features |= NETIF_F_VLAN_CHALLENGED;
-
- memcpy(bypass_netdev->dev_addr, backup_netdev->dev_addr,
- bypass_netdev->addr_len);
- bypass_netdev->min_mtu = backup_netdev->min_mtu;
- bypass_netdev->max_mtu = backup_netdev->max_mtu;
+ dev->netdev_ops = &virtnet_bypass_netdev_ops;
+ dev->ethtool_ops = &virtnet_bypass_ethtool_ops;
- res = register_netdev(bypass_netdev);
- if (res < 0) {
- dev_err(dev, "Unable to register bypass_netdev!\n");
- free_netdev(bypass_netdev);
- return res;
- }
-
- netif_carrier_off(bypass_netdev);
-
- vi->bypass_netdev = bypass_netdev;
-
- /* Change the name of the backup interface to vbkup0
- * we may need to revisit naming later but this gets it out
- * of the way for now.
- */
- strcpy(backup_netdev->name, "vbkup%d");
+ vbi->backup_netdev = dev;
+ virtnet_stats(vbi->backup_netdev, &vbi->backup_stats);
+ vi->vbi = vbi;
return 0;
}
static void virtnet_bypass_destroy(struct virtnet_info *vi)
{
- struct net_device *bypass_netdev = vi->bypass_netdev;
- struct virtnet_bypass_info *vbi;
+ struct virtnet_bypass_info *vbi = vi->vbi;
struct net_device *child_netdev;
- /* no device found, nothing to free */
- if (!bypass_netdev)
+ if (!vbi)
return;
- vbi = netdev_priv(bypass_netdev);
-
- netif_device_detach(bypass_netdev);
-
rtnl_lock();
child_netdev = rtnl_dereference(vbi->active_netdev);
if (child_netdev)
virtnet_bypass_unregister_child(child_netdev);
- child_netdev = rtnl_dereference(vbi->backup_netdev);
- if (child_netdev)
- virtnet_bypass_unregister_child(child_netdev);
-
- unregister_netdevice(bypass_netdev);
-
rtnl_unlock();
- free_netdev(bypass_netdev);
+ kfree(vbi);
+ vi->vbi = NULL;
}
static int virtnet_probe(struct virtio_device *vdev)
--
2.14.3
^ permalink raw reply related
* Re: [PATCH v21 1/5] xbitmap: Introduce xbitmap
From: Matthew Wilcox @ 2018-02-16 18:30 UTC (permalink / raw)
To: Andy Shevchenko
Cc: yang.zhang.wz, kvm, Michael S. Tsirkin, Tetsuo Handa,
liliang.opensource, qemu-devel, virtualization, linux-mm,
aarcange, virtio-dev, Matthew Wilcox, nilal, riel, cornelia.huck,
mhocko, quan.xu0, Linux Kernel Mailing List, amit.shah,
Paolo Bonzini, Andrew Morton, mgorman
In-Reply-To: <CAHp75Ve-1-TOVJUZ4anhwkkeq-RhpSg3EmN3N0r09rj6sFrQZQ@mail.gmail.com>
On Fri, Feb 16, 2018 at 07:44:50PM +0200, Andy Shevchenko wrote:
> On Tue, Jan 9, 2018 at 1:10 PM, Wei Wang <wei.w.wang@intel.com> wrote:
> > From: Matthew Wilcox <mawilcox@microsoft.com>
> >
> > The eXtensible Bitmap is a sparse bitmap representation which is
> > efficient for set bits which tend to cluster. It supports up to
> > 'unsigned long' worth of bits.
>
> > lib/xbitmap.c | 444 +++++++++++++++++++++++++++++++
>
> Please, split tests to a separate module.
Hah, I just did this two days ago! I didn't publish it yet, but I also made
it compile both in userspace and as a kernel module.
It's the top two commits here:
http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/xarray-2018-02-12
Note this is a complete rewrite compared to the version presented here; it
sits on top of the XArray and no longer has a preload interface. It has a
superset of the IDA functionality.
^ permalink raw reply
* Re: [PATCH v21 1/5] xbitmap: Introduce xbitmap
From: Andy Shevchenko @ 2018-02-16 21:45 UTC (permalink / raw)
To: Matthew Wilcox
Cc: yang.zhang.wz, kvm, Michael S. Tsirkin, Tetsuo Handa,
liliang.opensource, qemu-devel, virtualization, linux-mm,
aarcange, virtio-dev, Matthew Wilcox, nilal, riel, cornelia.huck,
mhocko, quan.xu0, Linux Kernel Mailing List, amit.shah,
Paolo Bonzini, Andrew Morton, mgorman
In-Reply-To: <20180216183032.GA7439@bombadil.infradead.org>
On Fri, Feb 16, 2018 at 8:30 PM, Matthew Wilcox <willy@infradead.org> wrote:
> On Fri, Feb 16, 2018 at 07:44:50PM +0200, Andy Shevchenko wrote:
>> On Tue, Jan 9, 2018 at 1:10 PM, Wei Wang <wei.w.wang@intel.com> wrote:
>> > From: Matthew Wilcox <mawilcox@microsoft.com>
>> >
>> > The eXtensible Bitmap is a sparse bitmap representation which is
>> > efficient for set bits which tend to cluster. It supports up to
>> > 'unsigned long' worth of bits.
>>
>> > lib/xbitmap.c | 444 +++++++++++++++++++++++++++++++
>>
>> Please, split tests to a separate module.
>
> Hah, I just did this two days ago! I didn't publish it yet, but I also made
> it compile both in userspace and as a kernel module.
>
> It's the top two commits here:
>
> http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/xarray-2018-02-12
>
Thanks!
> Note this is a complete rewrite compared to the version presented here; it
> sits on top of the XArray and no longer has a preload interface. It has a
> superset of the IDA functionality.
Noted.
Now, the question about test case. Why do you heavily use BUG_ON?
Isn't resulting statistics enough?
See how other lib/test_* modules do.
--
With Best Regards,
Andy Shevchenko
^ permalink raw reply
* Re: [PATCH v21 1/5] xbitmap: Introduce xbitmap
From: Matthew Wilcox @ 2018-02-16 21:58 UTC (permalink / raw)
To: Andy Shevchenko
Cc: yang.zhang.wz, kvm, Michael S. Tsirkin, Tetsuo Handa,
liliang.opensource, qemu-devel, virtualization, linux-mm,
aarcange, virtio-dev, Matthew Wilcox, nilal, riel, cornelia.huck,
mhocko, quan.xu0, Linux Kernel Mailing List, amit.shah,
Paolo Bonzini, Andrew Morton, mgorman
In-Reply-To: <CAHp75Vd_tt0bV_OqAOwc=_uWrsF2zP9pMSbxPw_AxF_s9zj-pw@mail.gmail.com>
On Fri, Feb 16, 2018 at 11:45:51PM +0200, Andy Shevchenko wrote:
> Now, the question about test case. Why do you heavily use BUG_ON?
> Isn't resulting statistics enough?
No. If any of those tests fail, we want to stop dead. They'll lead to
horrendous bugs throughout the kernel if they're wrong. I think more of
the in-kernel test suite should stop dead instead of printing a warning.
Would you want to boot a machine which has a known bug in the page cache,
for example?
^ permalink raw reply
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jakub Kicinski @ 2018-02-17 2:38 UTC (permalink / raw)
To: Sridhar Samudrala
Cc: alexander.h.duyck, virtio-dev, mst, netdev, virtualization,
loseweigh, davem
In-Reply-To: <1518804682-16881-1-git-send-email-sridhar.samudrala@intel.com>
On Fri, 16 Feb 2018 10:11:19 -0800, Sridhar Samudrala wrote:
> Ppatch 2 is in response to the community request for a 3 netdev
> solution. However, it creates some issues we'll get into in a moment.
> It extends virtio_net to use alternate datapath when available and
> registered. When BACKUP feature is enabled, virtio_net driver creates
> an additional 'bypass' netdev that acts as a master device and controls
> 2 slave devices. The original virtio_net netdev is registered as
> 'backup' netdev and a passthru/vf device with the same MAC gets
> registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
> associated with the same 'pci' device. The user accesses the network
> interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
> as default for transmits when it is available with link up and running.
Thank you do doing this.
> We noticed a couple of issues with this approach during testing.
> - As both 'bypass' and 'backup' netdevs are associated with the same
> virtio pci device, udev tries to rename both of them with the same name
> and the 2nd rename will fail. This would be OK as long as the first netdev
> to be renamed is the 'bypass' netdev, but the order in which udev gets
> to rename the 2 netdevs is not reliable.
Out of curiosity - why do you link the master netdev to the virtio
struct device?
FWIW two solutions that immediately come to mind is to export "backup"
as phys_port_name of the backup virtio link and/or assign a name to the
master like you are doing already. I think team uses team%d and bond
uses bond%d, soft naming of master devices seems quite natural in this
case.
IMHO phys_port_name == "backup" if BACKUP bit is set on slave virtio
link is quite neat.
> - When the 'active' netdev is unplugged OR not present on a destination
> system after live migration, the user will see 2 virtio_net netdevs.
That's necessary and expected, all configuration applies to the master
so master must exist.
^ permalink raw reply
* Re: [RFC PATCH v3 2/3] virtio_net: Extend virtio to use VF datapath when available
From: Jakub Kicinski @ 2018-02-17 3:04 UTC (permalink / raw)
To: Sridhar Samudrala
Cc: alexander.h.duyck, Or Gerlitz, mst, netdev, virtualization,
loseweigh, davem
In-Reply-To: <1518804682-16881-3-git-send-email-sridhar.samudrala@intel.com>
On Fri, 16 Feb 2018 10:11:21 -0800, Sridhar Samudrala wrote:
> This patch enables virtio_net to switch over to a VF datapath when a VF
> netdev is present with the same MAC address. It allows live migration
> of a VM with a direct attached VF without the need to setup a bond/team
> between a VF and virtio net device in the guest.
>
> The hypervisor needs to enable only one datapath at any time so that
> packets don't get looped back to the VM over the other datapath. When a VF
> is plugged, the virtio datapath link state can be marked as down. The
> hypervisor needs to unplug the VF device from the guest on the source host
> and reset the MAC filter of the VF to initiate failover of datapath to
> virtio before starting the migration. After the migration is completed,
> the destination hypervisor sets the MAC filter on the VF and plugs it back
> to the guest to switch over to VF datapath.
>
> When BACKUP feature is enabled, an additional netdev(bypass netdev) is
> created that acts as a master device and tracks the state of the 2 lower
> netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
> passthru device with the same MAC is registered as 'active' netdev.
>
> This patch is based on the discussion initiated by Jesse on this thread.
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> +static void
> +virtnet_bypass_get_stats(struct net_device *dev,
> + struct rtnl_link_stats64 *stats)
> +{
> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
> + const struct rtnl_link_stats64 *new;
> + struct rtnl_link_stats64 temp;
> + struct net_device *child_netdev;
> +
> + spin_lock(&vbi->stats_lock);
> + memcpy(stats, &vbi->bypass_stats, sizeof(*stats));
> +
> + rcu_read_lock();
> +
> + child_netdev = rcu_dereference(vbi->active_netdev);
> + if (child_netdev) {
> + new = dev_get_stats(child_netdev, &temp);
> + virtnet_bypass_fold_stats(stats, new, &vbi->active_stats);
> + memcpy(&vbi->active_stats, new, sizeof(*new));
> + }
> +
> + child_netdev = rcu_dereference(vbi->backup_netdev);
> + if (child_netdev) {
> + new = dev_get_stats(child_netdev, &temp);
> + virtnet_bypass_fold_stats(stats, new, &vbi->backup_stats);
> + memcpy(&vbi->backup_stats, new, sizeof(*new));
> + }
> +
> + rcu_read_unlock();
> +
> + memcpy(&vbi->bypass_stats, stats, sizeof(*stats));
> + spin_unlock(&vbi->stats_lock);
> +}
> +
> +static int virtnet_bypass_change_mtu(struct net_device *dev, int new_mtu)
> +{
> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
> + struct net_device *child_netdev;
> + int ret = 0;
> +
> + child_netdev = rcu_dereference(vbi->active_netdev);
> + if (child_netdev) {
> + ret = dev_set_mtu(child_netdev, new_mtu);
> + if (ret)
> + return ret;
> + }
> +
> + child_netdev = rcu_dereference(vbi->backup_netdev);
> + if (child_netdev) {
> + ret = dev_set_mtu(child_netdev, new_mtu);
> + if (ret)
> + netdev_err(child_netdev,
> + "Unexpected failure to set mtu to %d\n",
> + new_mtu);
You should probably unwind if set fails on one of the legs.
> + }
> +
> + dev->mtu = new_mtu;
> + return 0;
> +}
nit: stats, mtu, all those mundane things are implemented in team
already. If we had this as kernel-internal team mode we wouldn't
have to reimplement them... You probably did investigate that
option, for my edification, would you mind saying what the
challenges/downsides were?
> +static struct net_device *
> +get_virtnet_bypass_bymac(struct net *net, const u8 *mac)
> +{
> + struct net_device *dev;
> +
> + ASSERT_RTNL();
> +
> + for_each_netdev(net, dev) {
> + if (dev->netdev_ops != &virtnet_bypass_netdev_ops)
> + continue; /* not a virtnet_bypass device */
Is there anything inherently wrong with enslaving another virtio dev
now? I was expecting something like a hash map to map MAC addr ->
master and then one can check if dev is already enslaved to that master.
Just a random thought, I'm probably missing something...
> + if (ether_addr_equal(mac, dev->perm_addr))
> + return dev;
> + }
> +
> + return NULL;
> +}
> +
> +static struct net_device *
> +get_virtnet_bypass_byref(struct net_device *child_netdev)
> +{
> + struct net *net = dev_net(child_netdev);
> + struct net_device *dev;
> +
> + ASSERT_RTNL();
> +
> + for_each_netdev(net, dev) {
> + struct virtnet_bypass_info *vbi;
> +
> + if (dev->netdev_ops != &virtnet_bypass_netdev_ops)
> + continue; /* not a virtnet_bypass device */
> +
> + vbi = netdev_priv(dev);
> +
> + if ((rtnl_dereference(vbi->active_netdev) == child_netdev) ||
> + (rtnl_dereference(vbi->backup_netdev) == child_netdev))
nit: parens not needed
> + return dev; /* a match */
> + }
> +
> + return NULL;
> +}
> +static int virtnet_bypass_create(struct virtnet_info *vi)
> +{
> + struct net_device *backup_netdev = vi->dev;
> + struct device *dev = &vi->vdev->dev;
> + struct net_device *bypass_netdev;
> + int res;
> +
> + /* Alloc at least 2 queues, for now we are going with 16 assuming
> + * that most devices being bonded won't have too many queues.
> + */
> + bypass_netdev = alloc_etherdev_mq(sizeof(struct virtnet_bypass_info),
> + 16);
> + if (!bypass_netdev) {
> + dev_err(dev, "Unable to allocate bypass_netdev!\n");
> + return -ENOMEM;
> + }
Maybe it's just me but referring to master as bypass seems slightly
confusing. I know you don't like team and bond, but perhaps we can
come up with a better name? For me bypass device is the other leg,
i.e. the VF, not the master. Perhaps others disagree.
> + dev_net_set(bypass_netdev, dev_net(backup_netdev));
> + SET_NETDEV_DEV(bypass_netdev, dev);
> +
> + bypass_netdev->netdev_ops = &virtnet_bypass_netdev_ops;
> + bypass_netdev->ethtool_ops = &virtnet_bypass_ethtool_ops;
Thanks!
^ permalink raw reply
* Re: [PATCH v2 0/6] crypto: engine - Permit to enqueue all async requests
From: Herbert Xu @ 2018-02-17 4:42 UTC (permalink / raw)
To: Corentin Labbe
Cc: alexandre.torgue, corbet, mst, linux-doc, linux-kernel,
fabien.dessenne, virtualization, linux-sunxi, linux-crypto,
mcoquelin.stm32, davem, linux-arm-kernel
In-Reply-To: <20180216153656.GA22492@Red>
On Fri, Feb 16, 2018 at 04:36:56PM +0100, Corentin Labbe wrote:
>
> As mentionned in the cover letter, all patchs (except documentation one) should be squashed.
> A kbuild robot reported a build error on cryptodev due to this.
It's too late now. In future if you want the patches to be squashed
then please send them in one email.
Thanks,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply
* [PATCH] virtio_balloon: export huge page allocation statistics
From: Jonathan Helman @ 2018-02-17 5:44 UTC (permalink / raw)
To: mst, jasowang, virtualization; +Cc: linux-kernel
Export statistics for successful and failed huge page allocations
from the virtio balloon driver. These 2 stats come directly from
the vm_events HTLB_BUDDY_PGALLOC and HTLB_BUDDY_PGALLOC_FAIL.
Signed-off-by: Jonathan Helman <jonathan.helman@oracle.com>
---
drivers/virtio/virtio_balloon.c | 6 ++++++
include/uapi/linux/virtio_balloon.h | 4 +++-
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index dfe5684..6b237e3 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -272,6 +272,12 @@ static unsigned int update_balloon_stats(struct virtio_balloon *vb)
pages_to_bytes(events[PSWPOUT]));
update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]);
update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]);
+#ifdef CONFIG_HUGETLB_PAGE
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
+ events[HTLB_BUDDY_PGALLOC]);
+ update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL,
+ events[HTLB_BUDDY_PGALLOC_FAIL]);
+#endif
#endif
update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE,
pages_to_bytes(i.freeram));
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 4e8b830..e3e8071 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -53,7 +53,9 @@ struct virtio_balloon_config {
#define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */
#define VIRTIO_BALLOON_S_AVAIL 6 /* Available memory as in /proc */
#define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */
-#define VIRTIO_BALLOON_S_NR 8
+#define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Number of htlb pgalloc successes */
+#define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Number of htlb pgalloc failures */
+#define VIRTIO_BALLOON_S_NR 10
/*
* Memory statistics structure.
--
1.8.3.1
^ permalink raw reply related
* (unknown),
From: Solen win @ 2018-02-17 8:41 UTC (permalink / raw)
To: Virtualization
[-- Attachment #1.1: Type: text/plain, Size: 8 bytes --]
Confirm
[-- Attachment #1.2: Type: text/html, Size: 30 bytes --]
[-- Attachment #2: Type: text/plain, Size: 183 bytes --]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Alexander Duyck @ 2018-02-17 17:12 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
Sridhar Samudrala, virtualization, Siwei Liu, Netdev,
David Miller
In-Reply-To: <20180216183817.42b07af6@cakuba.netronome.com>
On Fri, Feb 16, 2018 at 6:38 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
> On Fri, 16 Feb 2018 10:11:19 -0800, Sridhar Samudrala wrote:
>> Ppatch 2 is in response to the community request for a 3 netdev
>> solution. However, it creates some issues we'll get into in a moment.
>> It extends virtio_net to use alternate datapath when available and
>> registered. When BACKUP feature is enabled, virtio_net driver creates
>> an additional 'bypass' netdev that acts as a master device and controls
>> 2 slave devices. The original virtio_net netdev is registered as
>> 'backup' netdev and a passthru/vf device with the same MAC gets
>> registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>> associated with the same 'pci' device. The user accesses the network
>> interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>> as default for transmits when it is available with link up and running.
>
> Thank you do doing this.
>
>> We noticed a couple of issues with this approach during testing.
>> - As both 'bypass' and 'backup' netdevs are associated with the same
>> virtio pci device, udev tries to rename both of them with the same name
>> and the 2nd rename will fail. This would be OK as long as the first netdev
>> to be renamed is the 'bypass' netdev, but the order in which udev gets
>> to rename the 2 netdevs is not reliable.
>
> Out of curiosity - why do you link the master netdev to the virtio
> struct device?
The basic idea of all this is that we wanted this to work with an
existing VM image that was using virtio. As such we were trying to
make it so that the bypass interface takes the place of the original
virtio and get udev to rename the bypass to what the original
virtio_net was.
> FWIW two solutions that immediately come to mind is to export "backup"
> as phys_port_name of the backup virtio link and/or assign a name to the
> master like you are doing already. I think team uses team%d and bond
> uses bond%d, soft naming of master devices seems quite natural in this
> case.
I figured I had overlooked something like that.. Thanks for pointing
this out. Okay so I think the phys_port_name approach might resolve
the original issue. If I am reading things correctly what we end up
with is the master showing up as "ens1" for example and the backup
showing up as "ens1nbackup". Am I understanding that right?
The problem with the team/bond%d approach is that it creates a new
netdevice and so it would require guest configuration changes.
> IMHO phys_port_name == "backup" if BACKUP bit is set on slave virtio
> link is quite neat.
I agree. For non-"backup" virio_net devices would it be okay for us to
just return -EOPNOTSUPP? I assume it would be and that way the legacy
behavior could be maintained although the function still exists.
>> - When the 'active' netdev is unplugged OR not present on a destination
>> system after live migration, the user will see 2 virtio_net netdevs.
>
> That's necessary and expected, all configuration applies to the master
> so master must exist.
With the naming issue resolved this is the only item left outstanding.
This becomes a matter of form vs function.
The main complaint about the "3 netdev" solution is a bit confusing to
have the 2 netdevs present if the VF isn't there. The idea is that
having the extra "master" netdev there if there isn't really a bond is
a bit ugly.
The downside of the "2 netdev" solution is that you have to deal with
an extra layer of locking/queueing to get to the VF and you lose some
functionality since things like in-driver XDP have to be disabled in
order to maintain the same functionality when the VF is present or
not. However it looks more like classic virtio_net when the VF is not
present.
^ permalink raw reply
* Re: [RFC PATCH v3 2/3] virtio_net: Extend virtio to use VF datapath when available
From: Alexander Duyck @ 2018-02-17 17:41 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Duyck, Alexander H, Or Gerlitz, Michael S. Tsirkin,
Sridhar Samudrala, virtualization, Siwei Liu, Netdev,
David Miller
In-Reply-To: <20180216190455.2a0ea023@cakuba.netronome.com>
On Fri, Feb 16, 2018 at 7:04 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
> On Fri, 16 Feb 2018 10:11:21 -0800, Sridhar Samudrala wrote:
>> This patch enables virtio_net to switch over to a VF datapath when a VF
>> netdev is present with the same MAC address. It allows live migration
>> of a VM with a direct attached VF without the need to setup a bond/team
>> between a VF and virtio net device in the guest.
>>
>> The hypervisor needs to enable only one datapath at any time so that
>> packets don't get looped back to the VM over the other datapath. When a VF
>> is plugged, the virtio datapath link state can be marked as down. The
>> hypervisor needs to unplug the VF device from the guest on the source host
>> and reset the MAC filter of the VF to initiate failover of datapath to
>> virtio before starting the migration. After the migration is completed,
>> the destination hypervisor sets the MAC filter on the VF and plugs it back
>> to the guest to switch over to VF datapath.
>>
>> When BACKUP feature is enabled, an additional netdev(bypass netdev) is
>> created that acts as a master device and tracks the state of the 2 lower
>> netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
>> passthru device with the same MAC is registered as 'active' netdev.
>>
>> This patch is based on the discussion initiated by Jesse on this thread.
>> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>>
>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
>
>> +static void
>> +virtnet_bypass_get_stats(struct net_device *dev,
>> + struct rtnl_link_stats64 *stats)
>> +{
>> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
>> + const struct rtnl_link_stats64 *new;
>> + struct rtnl_link_stats64 temp;
>> + struct net_device *child_netdev;
>> +
>> + spin_lock(&vbi->stats_lock);
>> + memcpy(stats, &vbi->bypass_stats, sizeof(*stats));
>> +
>> + rcu_read_lock();
>> +
>> + child_netdev = rcu_dereference(vbi->active_netdev);
>> + if (child_netdev) {
>> + new = dev_get_stats(child_netdev, &temp);
>> + virtnet_bypass_fold_stats(stats, new, &vbi->active_stats);
>> + memcpy(&vbi->active_stats, new, sizeof(*new));
>> + }
>> +
>> + child_netdev = rcu_dereference(vbi->backup_netdev);
>> + if (child_netdev) {
>> + new = dev_get_stats(child_netdev, &temp);
>> + virtnet_bypass_fold_stats(stats, new, &vbi->backup_stats);
>> + memcpy(&vbi->backup_stats, new, sizeof(*new));
>> + }
>> +
>> + rcu_read_unlock();
>> +
>> + memcpy(&vbi->bypass_stats, stats, sizeof(*stats));
>> + spin_unlock(&vbi->stats_lock);
>> +}
>> +
>> +static int virtnet_bypass_change_mtu(struct net_device *dev, int new_mtu)
>> +{
>> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
>> + struct net_device *child_netdev;
>> + int ret = 0;
>> +
>> + child_netdev = rcu_dereference(vbi->active_netdev);
>> + if (child_netdev) {
>> + ret = dev_set_mtu(child_netdev, new_mtu);
>> + if (ret)
>> + return ret;
>> + }
>> +
>> + child_netdev = rcu_dereference(vbi->backup_netdev);
>> + if (child_netdev) {
>> + ret = dev_set_mtu(child_netdev, new_mtu);
>> + if (ret)
>> + netdev_err(child_netdev,
>> + "Unexpected failure to set mtu to %d\n",
>> + new_mtu);
>
> You should probably unwind if set fails on one of the legs.
Actually if we know that the backup is always going to be a virtio I
don't know if we even need to worry about the call failing. Last I
knew virtio_net doesn't implement ndo_change_mtu so I don't think it
is an issue. Unless a notifier blows up about it somewhere I don't
think there is anything that should prevent us from updating the MTU.
One interesting thing we may want to take a look at would be to tweak
the ordering of things based on if we are increasing or decreasing the
MTU. In the case of a increase we need to work from the bottom up, but
in the case of a decrease I wonder if we shouldn't be working from the
top down in order to guarantee we don't create an MTU mismatch
somewhere in the path.
>> + }
>> +
>> + dev->mtu = new_mtu;
>> + return 0;
>> +}
>
> nit: stats, mtu, all those mundane things are implemented in team
> already. If we had this as kernel-internal team mode we wouldn't
> have to reimplement them... You probably did investigate that
> option, for my edification, would you mind saying what the
> challenges/downsides were?
So I tried working with the bonding driver to get down to what we
needed. The issue is there are so many controls and such exposed that
trying to pull them out and generate a simple bond became very
difficult to get done. It was just much easier to start over versus
trying to take an existing interface and pare it down to just what we
needed. What may make more sense is to in the future create either a
lib or some sort of file in net/core/ that we could consolidate the
core functionality of these type of devices into and leave the
user-space interfaces, debugfs, ioctls, and such out of and leave
those driver specific.
>> +static struct net_device *
>> +get_virtnet_bypass_bymac(struct net *net, const u8 *mac)
>> +{
>> + struct net_device *dev;
>> +
>> + ASSERT_RTNL();
>> +
>> + for_each_netdev(net, dev) {
>> + if (dev->netdev_ops != &virtnet_bypass_netdev_ops)
>> + continue; /* not a virtnet_bypass device */
>
> Is there anything inherently wrong with enslaving another virtio dev
> now? I was expecting something like a hash map to map MAC addr ->
> master and then one can check if dev is already enslaved to that master.
> Just a random thought, I'm probably missing something...
This isn't about enslaving, this is just finding the master device.
Basically the virtnet_bypass uses a separate set of netdev ops so we
just are using that instead of maintaining a global or per-net hash.
You could have two virtio devices enslaved by the same virtnet_bypass
and it shouldn't be an issue.
>> + if (ether_addr_equal(mac, dev->perm_addr))
>> + return dev;
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> +static struct net_device *
>> +get_virtnet_bypass_byref(struct net_device *child_netdev)
>> +{
>> + struct net *net = dev_net(child_netdev);
>> + struct net_device *dev;
>> +
>> + ASSERT_RTNL();
>> +
>> + for_each_netdev(net, dev) {
>> + struct virtnet_bypass_info *vbi;
>> +
>> + if (dev->netdev_ops != &virtnet_bypass_netdev_ops)
>> + continue; /* not a virtnet_bypass device */
>> +
>> + vbi = netdev_priv(dev);
>> +
>> + if ((rtnl_dereference(vbi->active_netdev) == child_netdev) ||
>> + (rtnl_dereference(vbi->backup_netdev) == child_netdev))
>
> nit: parens not needed
Yeah, it is a habit of mine since I do that for readability more than
anything else. Some people can't track the order of operations.
If they need to go they can.
>> + return dev; /* a match */
>> + }
>> +
>> + return NULL;
>> +}
>
>> +static int virtnet_bypass_create(struct virtnet_info *vi)
>> +{
>> + struct net_device *backup_netdev = vi->dev;
>> + struct device *dev = &vi->vdev->dev;
>> + struct net_device *bypass_netdev;
>> + int res;
>> +
>> + /* Alloc at least 2 queues, for now we are going with 16 assuming
>> + * that most devices being bonded won't have too many queues.
>> + */
>> + bypass_netdev = alloc_etherdev_mq(sizeof(struct virtnet_bypass_info),
>> + 16);
>> + if (!bypass_netdev) {
>> + dev_err(dev, "Unable to allocate bypass_netdev!\n");
>> + return -ENOMEM;
>> + }
>
> Maybe it's just me but referring to master as bypass seems slightly
> confusing. I know you don't like team and bond, but perhaps we can
> come up with a better name? For me bypass device is the other leg,
> i.e. the VF, not the master. Perhaps others disagree.
The choice of naming is based on some basic plumbing ideas. You could
almost think of the bypass netdev as more of a bypass valve. Basically
what we are doing is rerouting the traffic at the bypass interface and
sending it to the VF instead of routing it to the virtio_net. That was
why I thought it would be a fitting term. It gets us out of the
"bond", "team", and other such concepts because it isn't really any of
those. This is supposed to be a very simple device that will shunt the
traffic off of the virtio_net and re-route it through the VF. In
addition this isn't a complete solution by itself either since there
will still be some configuration required on the host to take care of
applying filters on the PF and/or tap interface to make certain we
don't end up with any loops in the traffic and that receives are
directed to the VF.
>> + dev_net_set(bypass_netdev, dev_net(backup_netdev));
>> + SET_NETDEV_DEV(bypass_netdev, dev);
>> +
>> + bypass_netdev->netdev_ops = &virtnet_bypass_netdev_ops;
>> + bypass_netdev->ethtool_ops = &virtnet_bypass_ethtool_ops;
>
> Thanks!
^ permalink raw reply
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jakub Kicinski @ 2018-02-19 6:11 UTC (permalink / raw)
To: Alexander Duyck
Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
Sridhar Samudrala, virtualization, Siwei Liu, Netdev,
David Miller
In-Reply-To: <CAKgT0UfU9gV2Y7hWFQi0vJoD9kYeoq+7cXmwAjXeV5zxXHa3dQ@mail.gmail.com>
On Sat, 17 Feb 2018 09:12:01 -0800, Alexander Duyck wrote:
> >> We noticed a couple of issues with this approach during testing.
> >> - As both 'bypass' and 'backup' netdevs are associated with the same
> >> virtio pci device, udev tries to rename both of them with the same name
> >> and the 2nd rename will fail. This would be OK as long as the first netdev
> >> to be renamed is the 'bypass' netdev, but the order in which udev gets
> >> to rename the 2 netdevs is not reliable.
> >
> > Out of curiosity - why do you link the master netdev to the virtio
> > struct device?
>
> The basic idea of all this is that we wanted this to work with an
> existing VM image that was using virtio. As such we were trying to
> make it so that the bypass interface takes the place of the original
> virtio and get udev to rename the bypass to what the original
> virtio_net was.
That makes sense. Is it udev/naming that you're most concerned about
here? I.e. what's the user space app that expects the netdev to be
linked? This is just out of curiosity, the linking of netdevs to
devices is a bit of a PITA in the switchdev eswitch mode world, with
libvirt expecting only certain devices to be there.. Right now we're
not linking VF reprs, which breaks naming. I wanted to revisit that.
> > FWIW two solutions that immediately come to mind is to export "backup"
> > as phys_port_name of the backup virtio link and/or assign a name to the
> > master like you are doing already. I think team uses team%d and bond
> > uses bond%d, soft naming of master devices seems quite natural in this
> > case.
>
> I figured I had overlooked something like that.. Thanks for pointing
> this out. Okay so I think the phys_port_name approach might resolve
> the original issue. If I am reading things correctly what we end up
> with is the master showing up as "ens1" for example and the backup
> showing up as "ens1nbackup". Am I understanding that right?
Yes, provided systemd is new enough.
> The problem with the team/bond%d approach is that it creates a new
> netdevice and so it would require guest configuration changes.
>
> > IMHO phys_port_name == "backup" if BACKUP bit is set on slave virtio
> > link is quite neat.
>
> I agree. For non-"backup" virio_net devices would it be okay for us to
> just return -EOPNOTSUPP? I assume it would be and that way the legacy
> behavior could be maintained although the function still exists.
That's my understanding too.
^ permalink raw reply
* Re: [PATCH] drm/bochs: make structure bochs_bo_driver static
From: Daniel Vetter @ 2018-02-19 9:43 UTC (permalink / raw)
To: Colin King
Cc: David Airlie, kernel-janitors, linux-kernel, dri-devel,
virtualization
In-Reply-To: <20180207111353.30261-1-colin.king@canonical.com>
On Wed, Feb 07, 2018 at 11:13:53AM +0000, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
>
> The structure bochs_bo_driver is local to the source and does not need to
> be in global scope, so make it static.
>
> Cleans up sparse warning:
> drivers/gpu/drm/bochs/bochs_mm.c:197:22: warning: symbol 'bochs_bo_driver'
> was not declared. Should it be static?
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
Applied to drm-misc-next, thanks.
-Daniel
> ---
> drivers/gpu/drm/bochs/bochs_mm.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/bochs/bochs_mm.c b/drivers/gpu/drm/bochs/bochs_mm.c
> index 704e879711e4..b1d5aee46316 100644
> --- a/drivers/gpu/drm/bochs/bochs_mm.c
> +++ b/drivers/gpu/drm/bochs/bochs_mm.c
> @@ -194,7 +194,7 @@ static struct ttm_tt *bochs_ttm_tt_create(struct ttm_bo_device *bdev,
> return tt;
> }
>
> -struct ttm_bo_driver bochs_bo_driver = {
> +static struct ttm_bo_driver bochs_bo_driver = {
> .ttm_tt_create = bochs_ttm_tt_create,
> .ttm_tt_populate = ttm_pool_populate,
> .ttm_tt_unpopulate = ttm_pool_unpopulate,
> --
> 2.15.1
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
^ permalink raw reply
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jiri Pirko @ 2018-02-20 10:42 UTC (permalink / raw)
To: Sridhar Samudrala
Cc: alexander.h.duyck, virtio-dev, mst, kubakici, netdev,
virtualization, loseweigh, davem
In-Reply-To: <1518804682-16881-1-git-send-email-sridhar.samudrala@intel.com>
Fri, Feb 16, 2018 at 07:11:19PM CET, sridhar.samudrala@intel.com wrote:
>Patch 1 introduces a new feature bit VIRTIO_NET_F_BACKUP that can be
>used by hypervisor to indicate that virtio_net interface should act as
>a backup for another device with the same MAC address.
>
>Ppatch 2 is in response to the community request for a 3 netdev
>solution. However, it creates some issues we'll get into in a moment.
>It extends virtio_net to use alternate datapath when available and
>registered. When BACKUP feature is enabled, virtio_net driver creates
>an additional 'bypass' netdev that acts as a master device and controls
>2 slave devices. The original virtio_net netdev is registered as
>'backup' netdev and a passthru/vf device with the same MAC gets
>registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>associated with the same 'pci' device. The user accesses the network
>interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>as default for transmits when it is available with link up and running.
Sorry, but this is ridiculous. You are apparently re-implemeting part
of bonding driver as a part of NIC driver. Bond and team drivers
are mature solutions, well tested, broadly used, with lots of issues
resolved in the past. What you try to introduce is a weird shortcut
that already has couple of issues as you mentioned and will certanly
have many more. Also, I'm pretty sure that in future, someone comes up
with ideas like multiple VFs, LACP and similar bonding things.
What is the reason for this abomination? According to:
https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
The reason is quite weak.
User in the vm sees 2 (or more) netdevices, he puts them in bond/team
and that's it. This works now! If the vm lacks some userspace features,
let's fix it there! For example the MAC changes is something that could
be easily handled in teamd userspace deamon.
^ permalink raw reply
* Re: [PATCH 1/4] iommu: Add virtio-iommu driver
From: Jean-Philippe Brucker @ 2018-02-20 11:30 UTC (permalink / raw)
To: Tomasz Nowicki, iommu@lists.linux-foundation.org,
kvm@vger.kernel.org, virtualization@lists.linux-foundation.org,
virtio-dev@lists.oasis-open.org, kvmarm@lists.cs.columbia.edu
Cc: jayachandran.nair@cavium.com, Lorenzo Pieralisi,
tnowicki@caviumnetworks.com, mst@redhat.com, Marc Zyngier,
Will Deacon, jintack@cs.columbia.edu, eric.auger@redhat.com,
Robin Murphy, joro@8bytes.org, eric.auger.pro@gmail.com
In-Reply-To: <417e1617-e5ff-7b13-64e3-a8a98d30bcb9@semihalf.com>
On 19/02/18 12:23, Tomasz Nowicki wrote:
[...]
>> +static int viommu_receive_resp(struct viommu_dev *viommu, int nr_sent,
>> + struct list_head *sent)
>> +{
>> +
>> + unsigned int len;
>> + int nr_received = 0;
>> + struct viommu_request *req, *pending;
>> +
>> + pending = list_first_entry_or_null(sent, struct viommu_request, list);
>> + if (WARN_ON(!pending))
>> + return 0;
>> +
>> + while ((req = virtqueue_get_buf(viommu->vq, &len)) != NULL) {
>> + if (req != pending) {
>> + dev_warn(viommu->dev, "discarding stale request\n");
>> + continue;
>> + }
>> +
>> + pending->written = len;
>> +
>> + if (++nr_received == nr_sent) {
>> + WARN_ON(!list_is_last(&pending->list, sent));
>> + break;
>> + } else if (WARN_ON(list_is_last(&pending->list, sent))) {
>> + break;
>> + }
>> +
>> + pending = list_next_entry(pending, list);
>
> We should remove current element from the pending list. There is no
> guarantee we get response for each while loop so when we get back for
> more the _viommu_send_reqs_sync() caller will pass pointer to the out of
> date head next time.
Right, I'll fix this
Thanks,
Jean
^ permalink raw reply
* [4.4-stable 12/22] virtio_balloon: prevent uninitialized variable use
From: Arnd Bergmann @ 2018-02-20 11:54 UTC (permalink / raw)
To: stable
Cc: Arnd Bergmann, Michael S . Tsirkin, Greg KH, Ladi Prosek,
linux-kernel, virtualization
In-Reply-To: <20180220115527.1806578-1-arnd@arndb.de>
commit f0bb2d50dfcc519f06f901aac88502be6ff1df2c upstream.
The latest gcc-7.0.1 snapshot reports a new warning:
virtio/virtio_balloon.c: In function 'update_balloon_stats':
virtio/virtio_balloon.c:258:26: error: 'events[2]' is used uninitialized in this function [-Werror=uninitialized]
virtio/virtio_balloon.c:260:26: error: 'events[3]' is used uninitialized in this function [-Werror=uninitialized]
virtio/virtio_balloon.c:261:56: error: 'events[18]' is used uninitialized in this function [-Werror=uninitialized]
virtio/virtio_balloon.c:262:56: error: 'events[17]' is used uninitialized in this function [-Werror=uninitialized]
This seems absolutely right, so we should add an extra check to
prevent copying uninitialized stack data into the statistics.
From all I can tell, this has been broken since the statistics code
was originally added in 2.6.34.
Fixes: 9564e138b1f6 ("virtio: Add memory statistics reporting to the balloon driver (V4)")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Ladi Prosek <lprosek@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
[arnd: backported to 4.4]
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
drivers/virtio/virtio_balloon.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 01d15dca940e..26d0dff069f0 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -239,12 +239,15 @@ static void update_balloon_stats(struct virtio_balloon *vb)
all_vm_events(events);
si_meminfo(&i);
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN,
pages_to_bytes(events[PSWPIN]));
update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT,
pages_to_bytes(events[PSWPOUT]));
update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]);
update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]);
+#endif
update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE,
pages_to_bytes(i.freeram));
update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT,
--
2.9.0
^ permalink raw reply related
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Alexander Duyck @ 2018-02-20 16:04 UTC (permalink / raw)
To: Jiri Pirko
Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
Jakub Kicinski, Sridhar Samudrala, virtualization, Siwei Liu,
Netdev, David Miller
In-Reply-To: <20180220104224.GA2031@nanopsycho>
On Tue, Feb 20, 2018 at 2:42 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Fri, Feb 16, 2018 at 07:11:19PM CET, sridhar.samudrala@intel.com wrote:
>>Patch 1 introduces a new feature bit VIRTIO_NET_F_BACKUP that can be
>>used by hypervisor to indicate that virtio_net interface should act as
>>a backup for another device with the same MAC address.
>>
>>Ppatch 2 is in response to the community request for a 3 netdev
>>solution. However, it creates some issues we'll get into in a moment.
>>It extends virtio_net to use alternate datapath when available and
>>registered. When BACKUP feature is enabled, virtio_net driver creates
>>an additional 'bypass' netdev that acts as a master device and controls
>>2 slave devices. The original virtio_net netdev is registered as
>>'backup' netdev and a passthru/vf device with the same MAC gets
>>registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>>associated with the same 'pci' device. The user accesses the network
>>interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>>as default for transmits when it is available with link up and running.
>
> Sorry, but this is ridiculous. You are apparently re-implemeting part
> of bonding driver as a part of NIC driver. Bond and team drivers
> are mature solutions, well tested, broadly used, with lots of issues
> resolved in the past. What you try to introduce is a weird shortcut
> that already has couple of issues as you mentioned and will certanly
> have many more. Also, I'm pretty sure that in future, someone comes up
> with ideas like multiple VFs, LACP and similar bonding things.
The problem with the bond and team drivers is they are too large and
have too many interfaces available for configuration so as a result
they can really screw this interface up.
Essentially this is meant to be a bond that is more-or-less managed by
the host, not the guest. We want the host to be able to configure it
and have it automatically kick in on the guest. For now we want to
avoid adding too much complexity as this is meant to be just the first
step. Trying to go in and implement the whole solution right from the
start based on existing drivers is going to be a massive time sink and
will likely never get completed due to the fact that there is always
going to be some other thing that will interfere.
My personal hope is that we can look at doing a virtio-bond sort of
device that will handle all this as well as providing a communication
channel, but that is much further down the road. For now we only have
a single bit so the goal for now is trying to keep this as simple as
possible.
> What is the reason for this abomination? According to:
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
> The reason is quite weak.
> User in the vm sees 2 (or more) netdevices, he puts them in bond/team
> and that's it. This works now! If the vm lacks some userspace features,
> let's fix it there! For example the MAC changes is something that could
> be easily handled in teamd userspace deamon.
I think you might have missed the point of this. This is meant to be a
simple interface so the guest should not be able to change the MAC
address, and it shouldn't require any userspace daemon to setup or
tear down. Ideally with this solution the virtio bypass will come up
and be assigned the name of the original virtio, and the "backup"
interface will come up and be assigned the name of the original virtio
with an additional "nbackup" tacked on via the phys_port_name, and
then whenever a VF is added it will automatically be enslaved by the
bypass interface, and it will be removed when the VF is hotplugged
out.
In my mind the difference between this and bond or team is where the
configuration interface lies. In the case of bond it is in the kernel.
If my understanding is correct team is mostly in user space. With this
the configuration interface is really down in the hypervisor and
requests are communicated up to the guest. I would prefer not to make
virtio_net dependent on the bonding or team drivers, or worse yet a
userspace daemon in the guest. For now I would argue we should keep
this as simple as possible just to support basic live migration. There
has already been discussions of refactoring this after it is in so
that we can start to combine the functionality here with what is there
in bonding/team, but the differences in configuration interface and
the size of the code bases will make it challenging to outright merge
this into something like that.
^ permalink raw reply
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Samudrala, Sridhar @ 2018-02-20 16:26 UTC (permalink / raw)
To: Jakub Kicinski, Alexander Duyck
Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin, Netdev,
virtualization, Siwei Liu, David Miller
In-Reply-To: <20180218221140.6c7fb32e@cakuba.netronome.com>
On 2/18/2018 10:11 PM, Jakub Kicinski wrote:
> On Sat, 17 Feb 2018 09:12:01 -0800, Alexander Duyck wrote:
>>>> We noticed a couple of issues with this approach during testing.
>>>> - As both 'bypass' and 'backup' netdevs are associated with the same
>>>> virtio pci device, udev tries to rename both of them with the same name
>>>> and the 2nd rename will fail. This would be OK as long as the first netdev
>>>> to be renamed is the 'bypass' netdev, but the order in which udev gets
>>>> to rename the 2 netdevs is not reliable.
>>> Out of curiosity - why do you link the master netdev to the virtio
>>> struct device?
>> The basic idea of all this is that we wanted this to work with an
>> existing VM image that was using virtio. As such we were trying to
>> make it so that the bypass interface takes the place of the original
>> virtio and get udev to rename the bypass to what the original
>> virtio_net was.
> That makes sense. Is it udev/naming that you're most concerned about
> here? I.e. what's the user space app that expects the netdev to be
> linked? This is just out of curiosity, the linking of netdevs to
> devices is a bit of a PITA in the switchdev eswitch mode world, with
> libvirt expecting only certain devices to be there.. Right now we're
> not linking VF reprs, which breaks naming. I wanted to revisit that.
For live migration usecase, userspace is only aware of one virtio_net
interface and
it doesn't expect it to be linked with any lower dev. So it should be
fine even if the
lower netdev is not present. Only the master netdev should be assigned
the same
name so that userspace configuration scripts in the VM don't need to
change.
>
>>> FWIW two solutions that immediately come to mind is to export "backup"
>>> as phys_port_name of the backup virtio link and/or assign a name to the
>>> master like you are doing already. I think team uses team%d and bond
>>> uses bond%d, soft naming of master devices seems quite natural in this
>>> case.
>> I figured I had overlooked something like that.. Thanks for pointing
>> this out. Okay so I think the phys_port_name approach might resolve
>> the original issue. If I am reading things correctly what we end up
>> with is the master showing up as "ens1" for example and the backup
>> showing up as "ens1nbackup". Am I understanding that right?
> Yes, provided systemd is new enough.
Yes. I did a quick test to confirm that adding ndo_phys_port_name() to
virtio_net
ndo_ops fixes the udev naming issue with 2 virtio netdevs. This is on
fedora27.
>
>> The problem with the team/bond%d approach is that it creates a new
>> netdevice and so it would require guest configuration changes.
>>
>>> IMHO phys_port_name == "backup" if BACKUP bit is set on slave virtio
>>> link is quite neat.
>> I agree. For non-"backup" virio_net devices would it be okay for us to
>> just return -EOPNOTSUPP? I assume it would be and that way the legacy
>> behavior could be maintained although the function still exists.
> That's my understanding too.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jiri Pirko @ 2018-02-20 16:29 UTC (permalink / raw)
To: Alexander Duyck
Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
Jakub Kicinski, Sridhar Samudrala, virtualization, Siwei Liu,
Netdev, David Miller
In-Reply-To: <CAKgT0UdU8PDXduzxp4kKfur-DLeFQSJj7-fhW_eTgVzd+AcViw@mail.gmail.com>
Tue, Feb 20, 2018 at 05:04:29PM CET, alexander.duyck@gmail.com wrote:
>On Tue, Feb 20, 2018 at 2:42 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> Fri, Feb 16, 2018 at 07:11:19PM CET, sridhar.samudrala@intel.com wrote:
>>>Patch 1 introduces a new feature bit VIRTIO_NET_F_BACKUP that can be
>>>used by hypervisor to indicate that virtio_net interface should act as
>>>a backup for another device with the same MAC address.
>>>
>>>Ppatch 2 is in response to the community request for a 3 netdev
>>>solution. However, it creates some issues we'll get into in a moment.
>>>It extends virtio_net to use alternate datapath when available and
>>>registered. When BACKUP feature is enabled, virtio_net driver creates
>>>an additional 'bypass' netdev that acts as a master device and controls
>>>2 slave devices. The original virtio_net netdev is registered as
>>>'backup' netdev and a passthru/vf device with the same MAC gets
>>>registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>>>associated with the same 'pci' device. The user accesses the network
>>>interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>>>as default for transmits when it is available with link up and running.
>>
>> Sorry, but this is ridiculous. You are apparently re-implemeting part
>> of bonding driver as a part of NIC driver. Bond and team drivers
>> are mature solutions, well tested, broadly used, with lots of issues
>> resolved in the past. What you try to introduce is a weird shortcut
>> that already has couple of issues as you mentioned and will certanly
>> have many more. Also, I'm pretty sure that in future, someone comes up
>> with ideas like multiple VFs, LACP and similar bonding things.
>
>The problem with the bond and team drivers is they are too large and
>have too many interfaces available for configuration so as a result
>they can really screw this interface up.
What? Too large is which sense? Why "too many interfaces" is a problem?
Also, team has only one interface to userspace team-generic-netlink.
>
>Essentially this is meant to be a bond that is more-or-less managed by
>the host, not the guest. We want the host to be able to configure it
How is it managed by the host? In your usecase the guest has 2 netdevs:
virtio_net, pci vf.
I don't see how host can do any managing of that, other than the
obvious. But still, the active/backup decision is done in guest. This is
a simple bond/team usecase. As I said, there is something needed to be
implemented in userspace in order to handle re-appear of vf netdev.
But that should be fairly easy to do in teamd.
>and have it automatically kick in on the guest. For now we want to
>avoid adding too much complexity as this is meant to be just the first
That's what I fear, "for now"..
>step. Trying to go in and implement the whole solution right from the
>start based on existing drivers is going to be a massive time sink and
>will likely never get completed due to the fact that there is always
>going to be some other thing that will interfere.
"implement the whole solution right from the start based on existing
drivers" - what solution are you talking about? I don't understand this
para.
>
>My personal hope is that we can look at doing a virtio-bond sort of
>device that will handle all this as well as providing a communication
>channel, but that is much further down the road. For now we only have
>a single bit so the goal for now is trying to keep this as simple as
>possible.
Oh. So there is really intention to do re-implementation of bonding
in virtio. That is plain-wrong in my opinion.
Could you just use bond/team, please, and don't reinvent the wheel with
this abomination?
>
>> What is the reason for this abomination? According to:
>> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>> The reason is quite weak.
>> User in the vm sees 2 (or more) netdevices, he puts them in bond/team
>> and that's it. This works now! If the vm lacks some userspace features,
>> let's fix it there! For example the MAC changes is something that could
>> be easily handled in teamd userspace deamon.
>
>I think you might have missed the point of this. This is meant to be a
>simple interface so the guest should not be able to change the MAC
>address, and it shouldn't require any userspace daemon to setup or
>tear down. Ideally with this solution the virtio bypass will come up
>and be assigned the name of the original virtio, and the "backup"
>interface will come up and be assigned the name of the original virtio
>with an additional "nbackup" tacked on via the phys_port_name, and
>then whenever a VF is added it will automatically be enslaved by the
>bypass interface, and it will be removed when the VF is hotplugged
>out.
>
>In my mind the difference between this and bond or team is where the
>configuration interface lies. In the case of bond it is in the kernel.
>If my understanding is correct team is mostly in user space. With this
>the configuration interface is really down in the hypervisor and
>requests are communicated up to the guest. I would prefer not to make
>virtio_net dependent on the bonding or team drivers, or worse yet a
>userspace daemon in the guest. For now I would argue we should keep
>this as simple as possible just to support basic live migration. There
>has already been discussions of refactoring this after it is in so
>that we can start to combine the functionality here with what is there
>in bonding/team, but the differences in configuration interface and
>the size of the code bases will make it challenging to outright merge
>this into something like that.
^ permalink raw reply
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Samudrala, Sridhar @ 2018-02-20 17:14 UTC (permalink / raw)
To: Jiri Pirko, Alexander Duyck
Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
Jakub Kicinski, Netdev, virtualization, Siwei Liu, David Miller
In-Reply-To: <20180220162933.GD2031@nanopsycho>
On 2/20/2018 8:29 AM, Jiri Pirko wrote:
> Tue, Feb 20, 2018 at 05:04:29PM CET, alexander.duyck@gmail.com wrote:
>> On Tue, Feb 20, 2018 at 2:42 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>> Fri, Feb 16, 2018 at 07:11:19PM CET, sridhar.samudrala@intel.com wrote:
>>>> Patch 1 introduces a new feature bit VIRTIO_NET_F_BACKUP that can be
>>>> used by hypervisor to indicate that virtio_net interface should act as
>>>> a backup for another device with the same MAC address.
>>>>
>>>> Ppatch 2 is in response to the community request for a 3 netdev
>>>> solution. However, it creates some issues we'll get into in a moment.
>>>> It extends virtio_net to use alternate datapath when available and
>>>> registered. When BACKUP feature is enabled, virtio_net driver creates
>>>> an additional 'bypass' netdev that acts as a master device and controls
>>>> 2 slave devices. The original virtio_net netdev is registered as
>>>> 'backup' netdev and a passthru/vf device with the same MAC gets
>>>> registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>>>> associated with the same 'pci' device. The user accesses the network
>>>> interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>>>> as default for transmits when it is available with link up and running.
>>> Sorry, but this is ridiculous. You are apparently re-implemeting part
>>> of bonding driver as a part of NIC driver. Bond and team drivers
>>> are mature solutions, well tested, broadly used, with lots of issues
>>> resolved in the past. What you try to introduce is a weird shortcut
>>> that already has couple of issues as you mentioned and will certanly
>>> have many more. Also, I'm pretty sure that in future, someone comes up
>>> with ideas like multiple VFs, LACP and similar bonding things.
>> The problem with the bond and team drivers is they are too large and
>> have too many interfaces available for configuration so as a result
>> they can really screw this interface up.
> What? Too large is which sense? Why "too many interfaces" is a problem?
> Also, team has only one interface to userspace team-generic-netlink.
>
>
>> Essentially this is meant to be a bond that is more-or-less managed by
>> the host, not the guest. We want the host to be able to configure it
> How is it managed by the host? In your usecase the guest has 2 netdevs:
> virtio_net, pci vf.
> I don't see how host can do any managing of that, other than the
> obvious. But still, the active/backup decision is done in guest. This is
> a simple bond/team usecase. As I said, there is something needed to be
> implemented in userspace in order to handle re-appear of vf netdev.
> But that should be fairly easy to do in teamd.
The host manages the active/backup decision by
- assigning the same MAC address to both VF and virtio interfaces
- setting a BACKUP feature bit on virtio that enables virtio to
transparently take
over the VFs datapath.
- only enable one datapath at anytime so that packets don't get looped back
- during live migration enable virtio datapth, unplug vf on the source
and replug
vf on the destination.
The VM is not expected and doesn't have any control of setting the MAC
address
or bringing up/down the links.
This is the model that is currently supported with netvsc driver on Azure.
>
>
>> and have it automatically kick in on the guest. For now we want to
>> avoid adding too much complexity as this is meant to be just the first
> That's what I fear, "for now"..
>
>
>> step. Trying to go in and implement the whole solution right from the
>> start based on existing drivers is going to be a massive time sink and
>> will likely never get completed due to the fact that there is always
>> going to be some other thing that will interfere.
> "implement the whole solution right from the start based on existing
> drivers" - what solution are you talking about? I don't understand this
> para.
>
>
>> My personal hope is that we can look at doing a virtio-bond sort of
>> device that will handle all this as well as providing a communication
>> channel, but that is much further down the road. For now we only have
>> a single bit so the goal for now is trying to keep this as simple as
>> possible.
> Oh. So there is really intention to do re-implementation of bonding
> in virtio. That is plain-wrong in my opinion.
>
> Could you just use bond/team, please, and don't reinvent the wheel with
> this abomination?
>
>>> What is the reason for this abomination? According to:
>>> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>>> The reason is quite weak.
>>> User in the vm sees 2 (or more) netdevices, he puts them in bond/team
>>> and that's it. This works now! If the vm lacks some userspace features,
>>> let's fix it there! For example the MAC changes is something that could
>>> be easily handled in teamd userspace deamon.
>> I think you might have missed the point of this. This is meant to be a
>> simple interface so the guest should not be able to change the MAC
>> address, and it shouldn't require any userspace daemon to setup or
>> tear down. Ideally with this solution the virtio bypass will come up
>> and be assigned the name of the original virtio, and the "backup"
>> interface will come up and be assigned the name of the original virtio
>> with an additional "nbackup" tacked on via the phys_port_name, and
>> then whenever a VF is added it will automatically be enslaved by the
>> bypass interface, and it will be removed when the VF is hotplugged
>> out.
>>
>> In my mind the difference between this and bond or team is where the
>> configuration interface lies. In the case of bond it is in the kernel.
>> If my understanding is correct team is mostly in user space. With this
>> the configuration interface is really down in the hypervisor and
>> requests are communicated up to the guest. I would prefer not to make
>> virtio_net dependent on the bonding or team drivers, or worse yet a
>> userspace daemon in the guest. For now I would argue we should keep
>> this as simple as possible just to support basic live migration. There
>> has already been discussions of refactoring this after it is in so
>> that we can start to combine the functionality here with what is there
>> in bonding/team, but the differences in configuration interface and
>> the size of the code bases will make it challenging to outright merge
>> this into something like that.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Alexander Duyck @ 2018-02-20 17:23 UTC (permalink / raw)
To: Jiri Pirko
Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
Jakub Kicinski, Sridhar Samudrala, virtualization, Siwei Liu,
Netdev, David Miller
In-Reply-To: <20180220162933.GD2031@nanopsycho>
On Tue, Feb 20, 2018 at 8:29 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Tue, Feb 20, 2018 at 05:04:29PM CET, alexander.duyck@gmail.com wrote:
>>On Tue, Feb 20, 2018 at 2:42 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>> Fri, Feb 16, 2018 at 07:11:19PM CET, sridhar.samudrala@intel.com wrote:
>>>>Patch 1 introduces a new feature bit VIRTIO_NET_F_BACKUP that can be
>>>>used by hypervisor to indicate that virtio_net interface should act as
>>>>a backup for another device with the same MAC address.
>>>>
>>>>Ppatch 2 is in response to the community request for a 3 netdev
>>>>solution. However, it creates some issues we'll get into in a moment.
>>>>It extends virtio_net to use alternate datapath when available and
>>>>registered. When BACKUP feature is enabled, virtio_net driver creates
>>>>an additional 'bypass' netdev that acts as a master device and controls
>>>>2 slave devices. The original virtio_net netdev is registered as
>>>>'backup' netdev and a passthru/vf device with the same MAC gets
>>>>registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>>>>associated with the same 'pci' device. The user accesses the network
>>>>interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>>>>as default for transmits when it is available with link up and running.
>>>
>>> Sorry, but this is ridiculous. You are apparently re-implemeting part
>>> of bonding driver as a part of NIC driver. Bond and team drivers
>>> are mature solutions, well tested, broadly used, with lots of issues
>>> resolved in the past. What you try to introduce is a weird shortcut
>>> that already has couple of issues as you mentioned and will certanly
>>> have many more. Also, I'm pretty sure that in future, someone comes up
>>> with ideas like multiple VFs, LACP and similar bonding things.
>>
>>The problem with the bond and team drivers is they are too large and
>>have too many interfaces available for configuration so as a result
>>they can really screw this interface up.
>
> What? Too large is which sense? Why "too many interfaces" is a problem?
> Also, team has only one interface to userspace team-generic-netlink.
Specifically I was working with bond. I had overlooked team for the
most part since it required an additional userspace daemon which
basically broke our requirement of no user-space intervention.
I was trying to focus on just doing an active/backup setup. The
problem is there are debugfs, sysfs, and procfs interfaces exposed
that we don't need and/or want. Adding any sort of interface to
exclude these would just bloat up the bonding driver, and leaving them
in would just be confusing since they would all need to be ignored. In
addition the steps needed to get the name to come out the same as the
original virtio interface would just bloat up bonding.
>>
>>Essentially this is meant to be a bond that is more-or-less managed by
>>the host, not the guest. We want the host to be able to configure it
>
> How is it managed by the host? In your usecase the guest has 2 netdevs:
> virtio_net, pci vf.
> I don't see how host can do any managing of that, other than the
> obvious. But still, the active/backup decision is done in guest. This is
> a simple bond/team usecase. As I said, there is something needed to be
> implemented in userspace in order to handle re-appear of vf netdev.
> But that should be fairly easy to do in teamd.
>
>
>>and have it automatically kick in on the guest. For now we want to
>>avoid adding too much complexity as this is meant to be just the first
>
> That's what I fear, "for now"..
I used the expression "for now" as I see this being the first stage of
a multi-stage process.
Step 1 is to get a basic virtio-bypass driver added to virtio so that
it is at least comparable to netvsc in terms of feature set and
enables basic network live migration.
Step 2 is adding some sort of dirty page tracking, preferably via
something like a paravirtual iommu interface. Once we have that we can
defer the eviction of the VF until the very last moment of the live
migration. For now I need to work on testing a modification to allow
mapping the entire guest as being pass-through for DMA to the device,
and requiring dynamic for any DMA that is bidirectional or from the
device.
Step 3 will be to start looking at advanced configuration. That is
where we drop the implementation in step 1 and instead look at
spawning something that looks more like the team type interface,
however instead of working with a user-space daemon we would likely
need to work with some sort of mailbox or message queue coming up from
the hypervisor. Then we can start looking at doing things like passing
up blocks of eBPF code to handle Tx port selection or whatever we
need.
>
>>step. Trying to go in and implement the whole solution right from the
>>start based on existing drivers is going to be a massive time sink and
>>will likely never get completed due to the fact that there is always
>>going to be some other thing that will interfere.
>
> "implement the whole solution right from the start based on existing
> drivers" - what solution are you talking about? I don't understand this
> para.
You started mentioning much more complex configurations such as
multi-VF, LACP, and other such things. I fully own that this cannot
support that. My understanding is that the netvsc solution that is out
there cannot support anything like that either. The idea for now is to
keep this as simple as possible. It makes things like the possibility
of porting this to other OSes much easier.
>>
>>My personal hope is that we can look at doing a virtio-bond sort of
>>device that will handle all this as well as providing a communication
>>channel, but that is much further down the road. For now we only have
>>a single bit so the goal for now is trying to keep this as simple as
>>possible.
>
> Oh. So there is really intention to do re-implementation of bonding
> in virtio. That is plain-wrong in my opinion.
>
> Could you just use bond/team, please, and don't reinvent the wheel with
> this abomination?
So I have a question for you. Why did you create the team driver? The
bonding code was already there and does almost exactly the same thing.
I would think it has to do with where things are managed. That is the
same situation we have with this.
In my mind I don't see this something where we can just fit it into
one of these two drivers because of the same reason the bonding and
team drivers are split. We want to manage this interface somewhere
else. In my mind what we probably need to do is look at refactoring
the code since the control paths are in different locations for each
of these drivers, but much of the datapath is the same. That is where
I see things going eventually for this "virtio-bond" interface I
referenced, but for now this interface is not that since there isn't
really any communication channel present at all.
>>
>>> What is the reason for this abomination? According to:
>>> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>>> The reason is quite weak.
>>> User in the vm sees 2 (or more) netdevices, he puts them in bond/team
>>> and that's it. This works now! If the vm lacks some userspace features,
>>> let's fix it there! For example the MAC changes is something that could
>>> be easily handled in teamd userspace deamon.
>>
>>I think you might have missed the point of this. This is meant to be a
>>simple interface so the guest should not be able to change the MAC
>>address, and it shouldn't require any userspace daemon to setup or
>>tear down. Ideally with this solution the virtio bypass will come up
>>and be assigned the name of the original virtio, and the "backup"
>>interface will come up and be assigned the name of the original virtio
>>with an additional "nbackup" tacked on via the phys_port_name, and
>>then whenever a VF is added it will automatically be enslaved by the
>>bypass interface, and it will be removed when the VF is hotplugged
>>out.
>>
>>In my mind the difference between this and bond or team is where the
>>configuration interface lies. In the case of bond it is in the kernel.
>>If my understanding is correct team is mostly in user space. With this
>>the configuration interface is really down in the hypervisor and
>>requests are communicated up to the guest. I would prefer not to make
>>virtio_net dependent on the bonding or team drivers, or worse yet a
>>userspace daemon in the guest. For now I would argue we should keep
>>this as simple as possible just to support basic live migration. There
>>has already been discussions of refactoring this after it is in so
>>that we can start to combine the functionality here with what is there
>>in bonding/team, but the differences in configuration interface and
>>the size of the code bases will make it challenging to outright merge
>>this into something like that.
^ permalink raw reply
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jiri Pirko @ 2018-02-20 19:53 UTC (permalink / raw)
To: Alexander Duyck
Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
Jakub Kicinski, Sridhar Samudrala, virtualization, Siwei Liu,
Netdev, David Miller
In-Reply-To: <CAKgT0UehiC9O6JW9kjc+h9oHiHnCsqfExzawq+N51yV2tb=zwA@mail.gmail.com>
Tue, Feb 20, 2018 at 06:23:49PM CET, alexander.duyck@gmail.com wrote:
>On Tue, Feb 20, 2018 at 8:29 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> Tue, Feb 20, 2018 at 05:04:29PM CET, alexander.duyck@gmail.com wrote:
>>>On Tue, Feb 20, 2018 at 2:42 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>> Fri, Feb 16, 2018 at 07:11:19PM CET, sridhar.samudrala@intel.com wrote:
>>>>>Patch 1 introduces a new feature bit VIRTIO_NET_F_BACKUP that can be
>>>>>used by hypervisor to indicate that virtio_net interface should act as
>>>>>a backup for another device with the same MAC address.
>>>>>
>>>>>Ppatch 2 is in response to the community request for a 3 netdev
>>>>>solution. However, it creates some issues we'll get into in a moment.
>>>>>It extends virtio_net to use alternate datapath when available and
>>>>>registered. When BACKUP feature is enabled, virtio_net driver creates
>>>>>an additional 'bypass' netdev that acts as a master device and controls
>>>>>2 slave devices. The original virtio_net netdev is registered as
>>>>>'backup' netdev and a passthru/vf device with the same MAC gets
>>>>>registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>>>>>associated with the same 'pci' device. The user accesses the network
>>>>>interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>>>>>as default for transmits when it is available with link up and running.
>>>>
>>>> Sorry, but this is ridiculous. You are apparently re-implemeting part
>>>> of bonding driver as a part of NIC driver. Bond and team drivers
>>>> are mature solutions, well tested, broadly used, with lots of issues
>>>> resolved in the past. What you try to introduce is a weird shortcut
>>>> that already has couple of issues as you mentioned and will certanly
>>>> have many more. Also, I'm pretty sure that in future, someone comes up
>>>> with ideas like multiple VFs, LACP and similar bonding things.
>>>
>>>The problem with the bond and team drivers is they are too large and
>>>have too many interfaces available for configuration so as a result
>>>they can really screw this interface up.
>>
>> What? Too large is which sense? Why "too many interfaces" is a problem?
>> Also, team has only one interface to userspace team-generic-netlink.
>
>Specifically I was working with bond. I had overlooked team for the
>most part since it required an additional userspace daemon which
>basically broke our requirement of no user-space intervention.
Why? That sound artificial. Why the userspace cannot be part of the
solution?
>
>I was trying to focus on just doing an active/backup setup. The
>problem is there are debugfs, sysfs, and procfs interfaces exposed
>that we don't need and/or want. Adding any sort of interface to
>exclude these would just bloat up the bonding driver, and leaving them
>in would just be confusing since they would all need to be ignored. In
>addition the steps needed to get the name to come out the same as the
>original virtio interface would just bloat up bonding.
Why to you care about "name"? it's a netdev, isn't it all that matters?
The viewpoint of the user inside vm boils down to:
1) I have 2 netdevs
2) One is preferred
3) I setup team on top of them
That's should be it. It is the users responsibility to do it this way.
>
>>>
>>>Essentially this is meant to be a bond that is more-or-less managed by
>>>the host, not the guest. We want the host to be able to configure it
>>
>> How is it managed by the host? In your usecase the guest has 2 netdevs:
>> virtio_net, pci vf.
>> I don't see how host can do any managing of that, other than the
>> obvious. But still, the active/backup decision is done in guest. This is
>> a simple bond/team usecase. As I said, there is something needed to be
>> implemented in userspace in order to handle re-appear of vf netdev.
>> But that should be fairly easy to do in teamd.
>>
>>
>>>and have it automatically kick in on the guest. For now we want to
>>>avoid adding too much complexity as this is meant to be just the first
>>
>> That's what I fear, "for now"..
>
>I used the expression "for now" as I see this being the first stage of
>a multi-stage process.
That is what I fear...
>
>Step 1 is to get a basic virtio-bypass driver added to virtio so that
>it is at least comparable to netvsc in terms of feature set and
>enables basic network live migration.
>
>Step 2 is adding some sort of dirty page tracking, preferably via
>something like a paravirtual iommu interface. Once we have that we can
>defer the eviction of the VF until the very last moment of the live
>migration. For now I need to work on testing a modification to allow
>mapping the entire guest as being pass-through for DMA to the device,
>and requiring dynamic for any DMA that is bidirectional or from the
>device.
That is purely on the host side. Does not really matter if your solution
or standard bond/team is in use, right?
>
>Step 3 will be to start looking at advanced configuration. That is
>where we drop the implementation in step 1 and instead look at
>spawning something that looks more like the team type interface,
>however instead of working with a user-space daemon we would likely
>need to work with some sort of mailbox or message queue coming up from
>the hypervisor. Then we can start looking at doing things like passing
>up blocks of eBPF code to handle Tx port selection or whatever we
>need.
:O
>
>>
>>>step. Trying to go in and implement the whole solution right from the
>>>start based on existing drivers is going to be a massive time sink and
>>>will likely never get completed due to the fact that there is always
>>>going to be some other thing that will interfere.
>>
>> "implement the whole solution right from the start based on existing
>> drivers" - what solution are you talking about? I don't understand this
>> para.
>
>You started mentioning much more complex configurations such as
>multi-VF, LACP, and other such things. I fully own that this cannot
>support that. My understanding is that the netvsc solution that is out
>there cannot support anything like that either. The idea for now is to
>keep this as simple as possible. It makes things like the possibility
>of porting this to other OSes much easier.
Easier solution is team and teamd with linimal modifications in order to
make your usecase work. Btw, do you have the needs for your usecase
written down somewhere, so we are on the same page?
>
>>>
>>>My personal hope is that we can look at doing a virtio-bond sort of
>>>device that will handle all this as well as providing a communication
>>>channel, but that is much further down the road. For now we only have
>>>a single bit so the goal for now is trying to keep this as simple as
>>>possible.
>>
>> Oh. So there is really intention to do re-implementation of bonding
>> in virtio. That is plain-wrong in my opinion.
>>
>> Could you just use bond/team, please, and don't reinvent the wheel with
>> this abomination?
>
>So I have a question for you. Why did you create the team driver? The
>bonding code was already there and does almost exactly the same thing.
Please do go down the git log memory lane. Team was introduced in 2011.
At that time bonding was not in a good shape. I decided to rewrite it
with minimal parts being in kernel to allow the flexibility user needs
to be done in userspace. By the way, the usecase you are trying to
resolve by this patchset is something that can benefit from the team
driver kernel-userspace architecture. Easily.
>I would think it has to do with where things are managed. That is the
>same situation we have with this.
>
>In my mind I don't see this something where we can just fit it into
>one of these two drivers because of the same reason the bonding and
>team drivers are split. We want to manage this interface somewhere
>else. In my mind what we probably need to do is look at refactoring
>the code since the control paths are in different locations for each
>of these drivers, but much of the datapath is the same. That is where
This is where you try to twist the universe, in my opinion. You want to
move the responsibilities of the user inside the guest to the user
ourside it. And you use this weird mechanisms to do so. It feel very
wrong. Look at it from non-virtualized point of view. It's like you
would have HW that would configure the kernel which runs on it. It is
supposed to be the other way around.
>I see things going eventually for this "virtio-bond" interface I
>referenced, but for now this interface is not that since there isn't
>really any communication channel present at all.
That is exactly what I fear of. Thanks for ensuring me that is the final
vision :/
>
>>>
>>>> What is the reason for this abomination? According to:
>>>> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>>>> The reason is quite weak.
>>>> User in the vm sees 2 (or more) netdevices, he puts them in bond/team
>>>> and that's it. This works now! If the vm lacks some userspace features,
>>>> let's fix it there! For example the MAC changes is something that could
>>>> be easily handled in teamd userspace deamon.
>>>
>>>I think you might have missed the point of this. This is meant to be a
>>>simple interface so the guest should not be able to change the MAC
>>>address, and it shouldn't require any userspace daemon to setup or
>>>tear down. Ideally with this solution the virtio bypass will come up
>>>and be assigned the name of the original virtio, and the "backup"
>>>interface will come up and be assigned the name of the original virtio
>>>with an additional "nbackup" tacked on via the phys_port_name, and
>>>then whenever a VF is added it will automatically be enslaved by the
>>>bypass interface, and it will be removed when the VF is hotplugged
>>>out.
>>>
>>>In my mind the difference between this and bond or team is where the
>>>configuration interface lies. In the case of bond it is in the kernel.
>>>If my understanding is correct team is mostly in user space. With this
>>>the configuration interface is really down in the hypervisor and
>>>requests are communicated up to the guest. I would prefer not to make
>>>virtio_net dependent on the bonding or team drivers, or worse yet a
>>>userspace daemon in the guest. For now I would argue we should keep
>>>this as simple as possible just to support basic live migration. There
>>>has already been discussions of refactoring this after it is in so
>>>that we can start to combine the functionality here with what is there
>>>in bonding/team, but the differences in configuration interface and
>>>the size of the code bases will make it challenging to outright merge
>>>this into something like that.
^ permalink raw reply
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jiri Pirko @ 2018-02-20 20:14 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
Jakub Kicinski, Netdev, Alexander Duyck, virtualization,
Siwei Liu, David Miller
In-Reply-To: <509bbbf9-4db7-dc78-a05e-403452a7407a@intel.com>
Tue, Feb 20, 2018 at 06:14:32PM CET, sridhar.samudrala@intel.com wrote:
>On 2/20/2018 8:29 AM, Jiri Pirko wrote:
>> Tue, Feb 20, 2018 at 05:04:29PM CET, alexander.duyck@gmail.com wrote:
>> > On Tue, Feb 20, 2018 at 2:42 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> > > Fri, Feb 16, 2018 at 07:11:19PM CET, sridhar.samudrala@intel.com wrote:
>> > > > Patch 1 introduces a new feature bit VIRTIO_NET_F_BACKUP that can be
>> > > > used by hypervisor to indicate that virtio_net interface should act as
>> > > > a backup for another device with the same MAC address.
>> > > >
>> > > > Ppatch 2 is in response to the community request for a 3 netdev
>> > > > solution. However, it creates some issues we'll get into in a moment.
>> > > > It extends virtio_net to use alternate datapath when available and
>> > > > registered. When BACKUP feature is enabled, virtio_net driver creates
>> > > > an additional 'bypass' netdev that acts as a master device and controls
>> > > > 2 slave devices. The original virtio_net netdev is registered as
>> > > > 'backup' netdev and a passthru/vf device with the same MAC gets
>> > > > registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>> > > > associated with the same 'pci' device. The user accesses the network
>> > > > interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>> > > > as default for transmits when it is available with link up and running.
>> > > Sorry, but this is ridiculous. You are apparently re-implemeting part
>> > > of bonding driver as a part of NIC driver. Bond and team drivers
>> > > are mature solutions, well tested, broadly used, with lots of issues
>> > > resolved in the past. What you try to introduce is a weird shortcut
>> > > that already has couple of issues as you mentioned and will certanly
>> > > have many more. Also, I'm pretty sure that in future, someone comes up
>> > > with ideas like multiple VFs, LACP and similar bonding things.
>> > The problem with the bond and team drivers is they are too large and
>> > have too many interfaces available for configuration so as a result
>> > they can really screw this interface up.
>> What? Too large is which sense? Why "too many interfaces" is a problem?
>> Also, team has only one interface to userspace team-generic-netlink.
>>
>>
>> > Essentially this is meant to be a bond that is more-or-less managed by
>> > the host, not the guest. We want the host to be able to configure it
>> How is it managed by the host? In your usecase the guest has 2 netdevs:
>> virtio_net, pci vf.
>> I don't see how host can do any managing of that, other than the
>> obvious. But still, the active/backup decision is done in guest. This is
>> a simple bond/team usecase. As I said, there is something needed to be
>> implemented in userspace in order to handle re-appear of vf netdev.
>> But that should be fairly easy to do in teamd.
>
>The host manages the active/backup decision by
>- assigning the same MAC address to both VF and virtio interfaces
>- setting a BACKUP feature bit on virtio that enables virtio to transparently
>take
> over the VFs datapath.
>- only enable one datapath at anytime so that packets don't get looped back
>- during live migration enable virtio datapth, unplug vf on the source and
>replug
> vf on the destination.
>
>The VM is not expected and doesn't have any control of setting the MAC
>address
>or bringing up/down the links.
>
>This is the model that is currently supported with netvsc driver on Azure.
Yeah, I can see it now :( I guess that the ship has sailed and we are
stuck with this ugly thing forever...
Could you at least make some common code that is shared in between
netvsc and virtio_net so this is handled in exacly the same way in both?
The fact that the netvsc/virtio_net kidnaps a netdev only because it
has the same mac is going to give me some serious nighmares...
I think we need to introduce some more strict checks.
^ permalink raw reply
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Alexander Duyck @ 2018-02-20 21:02 UTC (permalink / raw)
To: Jiri Pirko
Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
Jakub Kicinski, Samudrala, Sridhar, virtualization, Siwei Liu,
Netdev, David Miller
In-Reply-To: <20180220201410.GF2031@nanopsycho>
On Tue, Feb 20, 2018 at 12:14 PM, Jiri Pirko <jiri@resnulli.us> wrote:
> Tue, Feb 20, 2018 at 06:14:32PM CET, sridhar.samudrala@intel.com wrote:
>>On 2/20/2018 8:29 AM, Jiri Pirko wrote:
>>> Tue, Feb 20, 2018 at 05:04:29PM CET, alexander.duyck@gmail.com wrote:
>>> > On Tue, Feb 20, 2018 at 2:42 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>> > > Fri, Feb 16, 2018 at 07:11:19PM CET, sridhar.samudrala@intel.com wrote:
>>> > > > Patch 1 introduces a new feature bit VIRTIO_NET_F_BACKUP that can be
>>> > > > used by hypervisor to indicate that virtio_net interface should act as
>>> > > > a backup for another device with the same MAC address.
>>> > > >
>>> > > > Ppatch 2 is in response to the community request for a 3 netdev
>>> > > > solution. However, it creates some issues we'll get into in a moment.
>>> > > > It extends virtio_net to use alternate datapath when available and
>>> > > > registered. When BACKUP feature is enabled, virtio_net driver creates
>>> > > > an additional 'bypass' netdev that acts as a master device and controls
>>> > > > 2 slave devices. The original virtio_net netdev is registered as
>>> > > > 'backup' netdev and a passthru/vf device with the same MAC gets
>>> > > > registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
>>> > > > associated with the same 'pci' device. The user accesses the network
>>> > > > interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
>>> > > > as default for transmits when it is available with link up and running.
>>> > > Sorry, but this is ridiculous. You are apparently re-implemeting part
>>> > > of bonding driver as a part of NIC driver. Bond and team drivers
>>> > > are mature solutions, well tested, broadly used, with lots of issues
>>> > > resolved in the past. What you try to introduce is a weird shortcut
>>> > > that already has couple of issues as you mentioned and will certanly
>>> > > have many more. Also, I'm pretty sure that in future, someone comes up
>>> > > with ideas like multiple VFs, LACP and similar bonding things.
>>> > The problem with the bond and team drivers is they are too large and
>>> > have too many interfaces available for configuration so as a result
>>> > they can really screw this interface up.
>>> What? Too large is which sense? Why "too many interfaces" is a problem?
>>> Also, team has only one interface to userspace team-generic-netlink.
>>>
>>>
>>> > Essentially this is meant to be a bond that is more-or-less managed by
>>> > the host, not the guest. We want the host to be able to configure it
>>> How is it managed by the host? In your usecase the guest has 2 netdevs:
>>> virtio_net, pci vf.
>>> I don't see how host can do any managing of that, other than the
>>> obvious. But still, the active/backup decision is done in guest. This is
>>> a simple bond/team usecase. As I said, there is something needed to be
>>> implemented in userspace in order to handle re-appear of vf netdev.
>>> But that should be fairly easy to do in teamd.
>>
>>The host manages the active/backup decision by
>>- assigning the same MAC address to both VF and virtio interfaces
>>- setting a BACKUP feature bit on virtio that enables virtio to transparently
>>take
>> over the VFs datapath.
>>- only enable one datapath at anytime so that packets don't get looped back
>>- during live migration enable virtio datapth, unplug vf on the source and
>>replug
>> vf on the destination.
>>
>>The VM is not expected and doesn't have any control of setting the MAC
>>address
>>or bringing up/down the links.
>>
>>This is the model that is currently supported with netvsc driver on Azure.
>
> Yeah, I can see it now :( I guess that the ship has sailed and we are
> stuck with this ugly thing forever...
>
> Could you at least make some common code that is shared in between
> netvsc and virtio_net so this is handled in exacly the same way in both?
>
> The fact that the netvsc/virtio_net kidnaps a netdev only because it
> has the same mac is going to give me some serious nighmares...
> I think we need to introduce some more strict checks.
In order for that to work we need to settle on a model for these. The
issue is that netvsc is using what we refer to as the "2 netdev" model
where they don't expose the paravirtual interface as its own netdev.
The opinion of Jakub and others has been that we should do a "3
netdev" model in the case of virtio_net since otherwise we will lose
functionality such as in-driver XDP and have to deal with an extra set
of qdiscs and Tx queue locks on transmit path.
Really at this point I am good either way, but we need to probably
have Stephen, Jakub, and whoever else had an opinion on the matter
sort out the 2 vs 3 argument before we could proceed on that. Most of
patch 2 in the set can easily be broken out into a separate file later
if we decide to go that route.
Thanks.
- Alex
^ permalink raw reply
* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jakub Kicinski @ 2018-02-20 22:33 UTC (permalink / raw)
To: Jiri Pirko, Samudrala, Sridhar, Alexander Duyck
Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin, Netdev,
virtualization, Siwei Liu, David Miller
In-Reply-To: <20180220201410.GF2031@nanopsycho>
On Tue, 20 Feb 2018 21:14:10 +0100, Jiri Pirko wrote:
> Yeah, I can see it now :( I guess that the ship has sailed and we are
> stuck with this ugly thing forever...
>
> Could you at least make some common code that is shared in between
> netvsc and virtio_net so this is handled in exacly the same way in both?
IMHO netvsc is a vendor specific driver which made a mistake on what
behaviour it provides (or tried to align itself with Windows SR-IOV).
Let's not make a far, far more commonly deployed and important driver
(virtio) bug-compatible with netvsc.
To Jiri's initial comments, I feel the same way, in fact I've talked to
the NetworkManager guys to get auto-bonding based on MACs handled in
user space. I think it may very well get done in next versions of NM,
but isn't done yet. Stephen also raised the point that not everybody is
using NM.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox