* [PATCH 2/2] qxl: keep separate release_bo pointer
From: Gerd Hoffmann @ 2018-04-18 5:42 UTC (permalink / raw)
To: dri-devel
Cc: David Airlie, Dave Airlie, open list,
open list:DRM DRIVER FOR QXL VIRTUAL GPU
In-Reply-To: <20180418054257.15388-1-kraxel@redhat.com>
qxl expects that list_first_entry(release->bos) returns the first
element qxl added to the list. ttm_eu_reserve_buffers() may reorder
the list though.
Add a release_bo field to struct qxl_release and use that instead.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
drivers/gpu/drm/qxl/qxl_drv.h | 1 +
drivers/gpu/drm/qxl/qxl_cmd.c | 6 ++----
drivers/gpu/drm/qxl/qxl_release.c | 12 ++++++------
3 files changed, 9 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/qxl/qxl_drv.h b/drivers/gpu/drm/qxl/qxl_drv.h
index 00a1a66b05..864b456080 100644
--- a/drivers/gpu/drm/qxl/qxl_drv.h
+++ b/drivers/gpu/drm/qxl/qxl_drv.h
@@ -167,6 +167,7 @@ struct qxl_release {
int id;
int type;
+ struct qxl_bo *release_bo;
uint32_t release_offset;
uint32_t surface_release_id;
struct ww_acquire_ctx ticket;
diff --git a/drivers/gpu/drm/qxl/qxl_cmd.c b/drivers/gpu/drm/qxl/qxl_cmd.c
index c0fb52c6d4..01665b98c5 100644
--- a/drivers/gpu/drm/qxl/qxl_cmd.c
+++ b/drivers/gpu/drm/qxl/qxl_cmd.c
@@ -179,10 +179,9 @@ qxl_push_command_ring_release(struct qxl_device *qdev, struct qxl_release *relea
uint32_t type, bool interruptible)
{
struct qxl_command cmd;
- struct qxl_bo_list *entry = list_first_entry(&release->bos, struct qxl_bo_list, tv.head);
cmd.type = type;
- cmd.data = qxl_bo_physical_address(qdev, to_qxl_bo(entry->tv.bo), release->release_offset);
+ cmd.data = qxl_bo_physical_address(qdev, release->release_bo, release->release_offset);
return qxl_ring_push(qdev->command_ring, &cmd, interruptible);
}
@@ -192,10 +191,9 @@ qxl_push_cursor_ring_release(struct qxl_device *qdev, struct qxl_release *releas
uint32_t type, bool interruptible)
{
struct qxl_command cmd;
- struct qxl_bo_list *entry = list_first_entry(&release->bos, struct qxl_bo_list, tv.head);
cmd.type = type;
- cmd.data = qxl_bo_physical_address(qdev, to_qxl_bo(entry->tv.bo), release->release_offset);
+ cmd.data = qxl_bo_physical_address(qdev, release->release_bo, release->release_offset);
return qxl_ring_push(qdev->cursor_ring, &cmd, interruptible);
}
diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c
index a0b4244d28..7cb2145772 100644
--- a/drivers/gpu/drm/qxl/qxl_release.c
+++ b/drivers/gpu/drm/qxl/qxl_release.c
@@ -173,6 +173,7 @@ qxl_release_free_list(struct qxl_release *release)
list_del(&entry->tv.head);
kfree(entry);
}
+ release->release_bo = NULL;
}
void
@@ -296,7 +297,6 @@ int qxl_alloc_surface_release_reserved(struct qxl_device *qdev,
{
if (surface_cmd_type == QXL_SURFACE_CMD_DESTROY && create_rel) {
int idr_ret;
- struct qxl_bo_list *entry = list_first_entry(&create_rel->bos, struct qxl_bo_list, tv.head);
struct qxl_bo *bo;
union qxl_release_info *info;
@@ -304,8 +304,9 @@ int qxl_alloc_surface_release_reserved(struct qxl_device *qdev,
idr_ret = qxl_release_alloc(qdev, QXL_RELEASE_SURFACE_CMD, release);
if (idr_ret < 0)
return idr_ret;
- bo = to_qxl_bo(entry->tv.bo);
+ bo = create_rel->release_bo;
+ (*release)->release_bo = bo;
(*release)->release_offset = create_rel->release_offset + 64;
qxl_release_list_add(*release, bo);
@@ -365,6 +366,7 @@ int qxl_alloc_release_reserved(struct qxl_device *qdev, unsigned long size,
bo = qxl_bo_ref(qdev->current_release_bo[cur_idx]);
+ (*release)->release_bo = bo;
(*release)->release_offset = qdev->current_release_bo_offset[cur_idx] * release_size_per_bo[cur_idx];
qdev->current_release_bo_offset[cur_idx]++;
@@ -408,8 +410,7 @@ union qxl_release_info *qxl_release_map(struct qxl_device *qdev,
{
void *ptr;
union qxl_release_info *info;
- struct qxl_bo_list *entry = list_first_entry(&release->bos, struct qxl_bo_list, tv.head);
- struct qxl_bo *bo = to_qxl_bo(entry->tv.bo);
+ struct qxl_bo *bo = release->release_bo;
ptr = qxl_bo_kmap_atomic_page(qdev, bo, release->release_offset & PAGE_MASK);
if (!ptr)
@@ -422,8 +423,7 @@ void qxl_release_unmap(struct qxl_device *qdev,
struct qxl_release *release,
union qxl_release_info *info)
{
- struct qxl_bo_list *entry = list_first_entry(&release->bos, struct qxl_bo_list, tv.head);
- struct qxl_bo *bo = to_qxl_bo(entry->tv.bo);
+ struct qxl_bo *bo = release->release_bo;
void *ptr;
ptr = ((void *)info) - (release->release_offset & ~PAGE_MASK);
--
2.9.3
^ permalink raw reply related
* Re: [PATCH net-next 0/5] virtio-net: Add SCTP checksum offload support
From: Xin Long @ 2018-04-18 6:57 UTC (permalink / raw)
To: Marcelo Ricardo Leitner
Cc: Neil Horman, Michael S. Tsirkin, network dev, Vladislav Yasevich,
virtualization, linux-sctp
In-Reply-To: <20180418013333.GO4716@localhost.localdomain>
On Wed, Apr 18, 2018 at 9:33 AM, Marcelo Ricardo Leitner
<marcelo.leitner@gmail.com> wrote:
> On Tue, Apr 17, 2018 at 04:35:18PM -0400, Vlad Yasevich wrote:
>> On 04/02/2018 10:47 AM, Marcelo Ricardo Leitner wrote:
>> > On Mon, Apr 02, 2018 at 09:40:01AM -0400, Vladislav Yasevich wrote:
>> >> Now that we have SCTP offload capabilities in the kernel, we can add
>> >> them to virtio as well. First step is SCTP checksum.
>> >
>> > Thanks.
>> >
>> >> As for GSO, the way sctp GSO is currently implemented buys us nothing
>> >> in added support to virtio. To add true GSO, would require a lot of
>> >> re-work inside of SCTP and would require extensions to the virtio
>> >> net header to carry extra sctp data.
>> >
>> > Can you please elaborate more on this? Is this because SCTP GSO relies
>> > on the gso skb format for knowing how to segment it instead of having
>> > a list of sizes?
>> >
>>
>> it's mainly because all the true segmentation, placing data into chunks,
>> has already happened. All that GSO does is allow for higher bundling
>> rate between VMs. If that is all SCTP GSO ever going to do, that fine,
>> but the goal is to do real GSO eventually and potentially reduce the
>> amount of memory copying we are doing.
The memory copying for bundling can't be avoided, as the chunks in
one packet may be from some different messages, as Marcelo said below.
>> If we do that, any current attempt at GSO in virtio would have to be
>> depricated and we'd need GSO2 or something like that.
Why would it be depreciated? virtio_net actually also supports frag_list,
all we need to do is to enable it.
I already have a patch for sctp gso over virtio_net in my local tree:
patch on qemu-2.9.0(el7):
https://paste.fedoraproject.org/paste/IgnSbW74L6P9aUZpnWHBsA
patch on kernel-4.16(linus):
https://paste.fedoraproject.org/paste/L7AhFIVVp3k8VfEV29QMiw
The performance has been improved quite a lot with this:
testing over virtio-net(guest):
https://paste.fedoraproject.org/paste/cxQMnrDQDf9AoNgBpA-twA
testing over tap(host):
https://paste.fedoraproject.org/paste/QPdc22pgvF-x68oKDjWa9g
>>
>> This is why, after doing the GSO support, I decided not to include it.
>
> Gotcha. I don't think it will ever go further than what we have now.
> Placing data into chunks later is not really feasible/wanted,
> especially now with stream schedulers and idata chunks. Doesn't seem
> worth the hassle... we would have to support things like, "segment
> half of this message plus a third of this other one from that other
> stream." (in case it's round robin).
>
> Marcelo
^ permalink raw reply
* Re: [RFC PATCH net-next v6 2/4] net: Introduce generic bypass module
From: Jiri Pirko @ 2018-04-18 9:25 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: alexander.h.duyck, virtio-dev, mst, kubakici, netdev,
virtualization, loseweigh, davem
In-Reply-To: <6a8c1ff5-153a-e40a-91b3-48532b8d3a38@intel.com>
Wed, Apr 11, 2018 at 09:13:52PM CEST, sridhar.samudrala@intel.com wrote:
>On 4/11/2018 8:51 AM, Jiri Pirko wrote:
>> Tue, Apr 10, 2018 at 08:59:48PM CEST, sridhar.samudrala@intel.com wrote:
>> > This provides a generic interface for paravirtual drivers to listen
>> > for netdev register/unregister/link change events from pci ethernet
>> > devices with the same MAC and takeover their datapath. The notifier and
>> > event handling code is based on the existing netvsc implementation.
>> >
>> > It exposes 2 sets of interfaces to the paravirtual drivers.
>> > 1. existing netvsc driver that uses 2 netdev model. In this model, no
>> > master netdev is created. The paravirtual driver registers each bypass
>> > instance along with a set of ops to manage the slave events.
>> > bypass_master_register()
>> > bypass_master_unregister()
>> > 2. new virtio_net based solution that uses 3 netdev model. In this model,
>> > the bypass module provides interfaces to create/destroy additional master
>> > netdev and all the slave events are managed internally.
>> > bypass_master_create()
>> > bypass_master_destroy()
>> >
>> > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> > ---
>> > include/linux/netdevice.h | 14 +
>> > include/net/bypass.h | 96 ++++++
>> > net/Kconfig | 18 +
>> > net/core/Makefile | 1 +
>> > net/core/bypass.c | 844 ++++++++++++++++++++++++++++++++++++++++++++++
>> > 5 files changed, 973 insertions(+)
>> > create mode 100644 include/net/bypass.h
>> > create mode 100644 net/core/bypass.c
>> >
>> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> > index cf44503ea81a..587293728f70 100644
>> > --- a/include/linux/netdevice.h
>> > +++ b/include/linux/netdevice.h
>> > @@ -1430,6 +1430,8 @@ enum netdev_priv_flags {
>> > IFF_PHONY_HEADROOM = 1<<24,
>> > IFF_MACSEC = 1<<25,
>> > IFF_NO_RX_HANDLER = 1<<26,
>> > + IFF_BYPASS = 1 << 27,
>> > + IFF_BYPASS_SLAVE = 1 << 28,
>> I wonder, why you don't follow the existing coding style... Also, please
>> add these to into the comment above.
>
>To avoid checkpatch warnings. If it is OK to ignore these warnings, I can switch back
>to the existing coding style to be consistent.
Please do.
>
>>
>>
>> > };
>> >
>> > #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
>> > @@ -1458,6 +1460,8 @@ enum netdev_priv_flags {
>> > #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
>> > #define IFF_MACSEC IFF_MACSEC
>> > #define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
>> > +#define IFF_BYPASS IFF_BYPASS
>> > +#define IFF_BYPASS_SLAVE IFF_BYPASS_SLAVE
>> >
>> > /**
>> > * struct net_device - The DEVICE structure.
>> > @@ -4308,6 +4312,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev)
>> > return dev->priv_flags & IFF_RXFH_CONFIGURED;
>> > }
>> >
>> > +static inline bool netif_is_bypass_master(const struct net_device *dev)
>> > +{
>> > + return dev->priv_flags & IFF_BYPASS;
>> > +}
>> > +
>> > +static inline bool netif_is_bypass_slave(const struct net_device *dev)
>> > +{
>> > + return dev->priv_flags & IFF_BYPASS_SLAVE;
>> > +}
>> > +
>> > /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
>> > static inline void netif_keep_dst(struct net_device *dev)
>> > {
>> > diff --git a/include/net/bypass.h b/include/net/bypass.h
>> > new file mode 100644
>> > index 000000000000..86b02cb894cf
>> > --- /dev/null
>> > +++ b/include/net/bypass.h
>> > @@ -0,0 +1,96 @@
>> > +// SPDX-License-Identifier: GPL-2.0
>> > +/* Copyright (c) 2018, Intel Corporation. */
>> > +
>> > +#ifndef _NET_BYPASS_H
>> > +#define _NET_BYPASS_H
>> > +
>> > +#include <linux/netdevice.h>
>> > +
>> > +struct bypass_ops {
>> > + int (*slave_pre_register)(struct net_device *slave_netdev,
>> > + struct net_device *bypass_netdev);
>> > + int (*slave_join)(struct net_device *slave_netdev,
>> > + struct net_device *bypass_netdev);
>> > + int (*slave_pre_unregister)(struct net_device *slave_netdev,
>> > + struct net_device *bypass_netdev);
>> > + int (*slave_release)(struct net_device *slave_netdev,
>> > + struct net_device *bypass_netdev);
>> > + int (*slave_link_change)(struct net_device *slave_netdev,
>> > + struct net_device *bypass_netdev);
>> > + rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
>> > +};
>> > +
>> > +struct bypass_master {
>> > + struct list_head list;
>> > + struct net_device __rcu *bypass_netdev;
>> > + struct bypass_ops __rcu *ops;
>> > +};
>> > +
>> > +/* bypass state */
>> > +struct bypass_info {
>> > + /* passthru netdev with same MAC */
>> > + struct net_device __rcu *active_netdev;
>> You still use "active"/"backup" names which is highly misleading as
>> it has completely different meaning that in bond for example.
>> I noted that in my previous review already. Please change it.
>
>I guess the issue is with only the 'active' name. 'backup' should be fine as it also
>matches with the BACKUP feature bit we are adding to virtio_net.
I think that "backup" is also misleading. Both "active" and "backup"
mean a *state* of slaves. This should be named differently.
>
>With regards to alternate names for 'active', you suggested 'stolen', but i
>am not too happy with it.
>netvsc uses vf_netdev, are you OK with this? Or another option is 'passthru'
No. The netdev could be any netdevice. It does not have to be a "VF".
I think "stolen" is quite appropriate since it describes the modus
operandi. The bypass master steals some netdevice according to some
match.
But I don't insist on "stolen". Just sounds right.
>
>
>
>>
>>
>> > +
>> > + /* virtio_net netdev */
>> > + struct net_device __rcu *backup_netdev;
>> > +
>> > + /* active netdev stats */
>> > + struct rtnl_link_stats64 active_stats;
>> > +
>> > + /* backup netdev stats */
>> > + struct rtnl_link_stats64 backup_stats;
>> > +
>> > + /* aggregated stats */
>> > + struct rtnl_link_stats64 bypass_stats;
>> > +
>> > + /* spinlock while updating stats */
>> > + spinlock_t stats_lock;
>> > +};
>> > +
>> > +#if IS_ENABLED(CONFIG_NET_BYPASS)
>> > +
>> > +int bypass_master_create(struct net_device *backup_netdev,
>> > + struct bypass_master **pbypass_master);
>> > +void bypass_master_destroy(struct bypass_master *bypass_master);
>> > +
>> > +int bypass_master_register(struct net_device *dev, struct bypass_ops *ops,
>> > + struct bypass_master **pbypass_master);
>> > +void bypass_master_unregister(struct bypass_master *bypass_master);
>> > +
>> > +int bypass_slave_unregister(struct net_device *slave_netdev);
>> > +
>> > +#else
>> > +
>> > +static inline
>> > +int bypass_master_create(struct net_device *backup_netdev,
>> > + struct bypass_master **pbypass_master);
>> > +{
>> > + return 0;
>> > +}
>> > +
>> > +static inline
>> > +void bypass_master_destroy(struct bypass_master *bypass_master)
>> > +{
>> > +}
>> > +
>> > +static inline
>> > +int bypass_master_register(struct net_device *dev, struct bypass_ops *ops,
>> > + struct pbypass_master **pbypass_master);
>> > +{
>> > + return 0;
>> > +}
>> > +
>> > +static inline
>> > +void bypass_master_unregister(struct bypass_master *bypass_master)
>> > +{
>> > +}
>> > +
>> > +static inline
>> > +int bypass_slave_unregister(struct net_device *slave_netdev)
>> > +{
>> > + return 0;
>> > +}
>> > +
>> > +#endif
>> > +
>> > +#endif /* _NET_BYPASS_H */
>> > diff --git a/net/Kconfig b/net/Kconfig
>> > index 0428f12c25c2..994445f4a96a 100644
>> > --- a/net/Kconfig
>> > +++ b/net/Kconfig
>> > @@ -423,6 +423,24 @@ config MAY_USE_DEVLINK
>> > on MAY_USE_DEVLINK to ensure they do not cause link errors when
>> > devlink is a loadable module and the driver using it is built-in.
>> >
>> > +config NET_BYPASS
>> > + tristate "Bypass interface"
>> > + ---help---
>> > + This provides a generic interface for paravirtual drivers to listen
>> > + for netdev register/unregister/link change events from pci ethernet
>> > + devices with the same MAC and takeover their datapath. This also
>> > + enables live migration of a VM with direct attached VF by failing
>> > + over to the paravirtual datapath when the VF is unplugged.
>> > +
>> > +config MAY_USE_BYPASS
>> > + tristate
>> > + default m if NET_BYPASS=m
>> > + default y if NET_BYPASS=y || NET_BYPASS=n
>> > + help
>> > + Drivers using the bypass infrastructure should have a dependency
>> > + on MAY_USE_BYPASS to ensure they do not cause link errors when
>> > + bypass is a loadable module and the driver using it is built-in.
>> > +
>> > endif # if NET
>> >
>> > # Used by archs to tell that they support BPF JIT compiler plus which flavour.
>> > diff --git a/net/core/Makefile b/net/core/Makefile
>> > index 6dbbba8c57ae..a9727ed1c8fc 100644
>> > --- a/net/core/Makefile
>> > +++ b/net/core/Makefile
>> > @@ -30,3 +30,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
>> > obj-$(CONFIG_HWBM) += hwbm.o
>> > obj-$(CONFIG_NET_DEVLINK) += devlink.o
>> > obj-$(CONFIG_GRO_CELLS) += gro_cells.o
>> > +obj-$(CONFIG_NET_BYPASS) += bypass.o
>> > diff --git a/net/core/bypass.c b/net/core/bypass.c
>> > new file mode 100644
>> > index 000000000000..b5b9cb554c3f
>> > --- /dev/null
>> > +++ b/net/core/bypass.c
>> > @@ -0,0 +1,844 @@
>> > +// SPDX-License-Identifier: GPL-2.0
>> > +/* Copyright (c) 2018, Intel Corporation. */
>> > +
>> > +/* A common module to handle registrations and notifications for paravirtual
>> > + * drivers to enable accelerated datapath and support VF live migration.
>> > + *
>> > + * The notifier and event handling code is based on netvsc driver.
>> > + */
>> > +
>> > +#include <linux/netdevice.h>
>> > +#include <linux/etherdevice.h>
>> > +#include <linux/ethtool.h>
>> > +#include <linux/module.h>
>> > +#include <linux/slab.h>
>> > +#include <linux/netdevice.h>
>> > +#include <linux/netpoll.h>
>> > +#include <linux/rtnetlink.h>
>> > +#include <linux/if_vlan.h>
>> > +#include <linux/pci.h>
>> > +#include <net/sch_generic.h>
>> > +#include <uapi/linux/if_arp.h>
>> > +#include <net/bypass.h>
>> > +
>> > +static LIST_HEAD(bypass_master_list);
>> > +static DEFINE_SPINLOCK(bypass_lock);
>> > +
>> > +static int bypass_slave_pre_register(struct net_device *slave_netdev,
>> > + struct net_device *bypass_netdev,
>> > + struct bypass_ops *bypass_ops)
>> > +{
>> > + struct bypass_info *bi;
>> > + bool backup;
>> > +
>> > + if (bypass_ops) {
>> > + if (!bypass_ops->slave_pre_register)
>> > + return -EINVAL;
>> > +
>> > + return bypass_ops->slave_pre_register(slave_netdev,
>> > + bypass_netdev);
>> > + }
>> > +
>> > + bi = netdev_priv(bypass_netdev);
>> > + backup = (slave_netdev->dev.parent == bypass_netdev->dev.parent);
>> > + if (backup ? rtnl_dereference(bi->backup_netdev) :
>> > + rtnl_dereference(bi->active_netdev)) {
>> > + netdev_err(bypass_netdev, "%s attempting to register as slave dev when %s already present\n",
>> > + slave_netdev->name, backup ? "backup" : "active");
>> > + return -EEXIST;
>> > + }
>> > +
>> > + /* Avoid non pci devices as active netdev */
>> > + if (!backup && (!slave_netdev->dev.parent ||
>> > + !dev_is_pci(slave_netdev->dev.parent)))
>> > + return -EINVAL;
>> > +
>> > + return 0;
>> > +}
>> > +
>> > +static int bypass_slave_join(struct net_device *slave_netdev,
>> > + struct net_device *bypass_netdev,
>> > + struct bypass_ops *bypass_ops)
>> > +{
>> > + struct bypass_info *bi;
>> > + bool backup;
>> > +
>> > + if (bypass_ops) {
>> > + if (!bypass_ops->slave_join)
>> > + return -EINVAL;
>> > +
>> > + return bypass_ops->slave_join(slave_netdev, bypass_netdev);
>> > + }
>> > +
>> > + bi = netdev_priv(bypass_netdev);
>> > + backup = (slave_netdev->dev.parent == bypass_netdev->dev.parent);
>> > +
>> > + dev_hold(slave_netdev);
>> > +
>> > + if (backup) {
>> > + rcu_assign_pointer(bi->backup_netdev, slave_netdev);
>> > + dev_get_stats(bi->backup_netdev, &bi->backup_stats);
>> > + } else {
>> > + rcu_assign_pointer(bi->active_netdev, slave_netdev);
>> > + dev_get_stats(bi->active_netdev, &bi->active_stats);
>> > + bypass_netdev->min_mtu = slave_netdev->min_mtu;
>> > + bypass_netdev->max_mtu = slave_netdev->max_mtu;
>> > + }
>> > +
>> > + netdev_info(bypass_netdev, "bypass slave:%s joined\n",
>> > + slave_netdev->name);
>> > +
>> > + return 0;
>> > +}
>> > +
>> > +/* Called when slave dev is injecting data into network stack.
>> > + * Change the associated network device from lower dev to virtio.
>> > + * note: already called with rcu_read_lock
>> > + */
>> > +static rx_handler_result_t bypass_handle_frame(struct sk_buff **pskb)
>> > +{
>> > + struct sk_buff *skb = *pskb;
>> > + struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
>> > +
>> > + skb->dev = ndev;
>> > +
>> > + return RX_HANDLER_ANOTHER;
>> > +}
>> > +
>> > +static struct net_device *bypass_master_get_bymac(u8 *mac,
>> > + struct bypass_ops **ops)
>> > +{
>> > + struct bypass_master *bypass_master;
>> > + struct net_device *bypass_netdev;
>> > +
>> > + spin_lock(&bypass_lock);
>> > + list_for_each_entry(bypass_master, &bypass_master_list, list) {
>> As I wrote the last time, you don't need this list, spinlock.
>> You can do just something like:
>> for_each_net(net) {
>> for_each_netdev(net, dev) {
>> if (netif_is_bypass_master(dev)) {
>
>This function returns the upper netdev as well as the ops associated
>with that netdev.
>bypass_master_list is a list of 'struct bypass_master' that associates
Well, can't you have it in netdev priv?
>'bypass_netdev' with 'bypass_ops' and gets added via bypass_master_register().
>We need 'ops' only to support the 2 netdev model of netvsc. ops will be
>NULL for 3-netdev model.
I see :(
>
>
>>
>>
>>
>>
>> > + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
>> > + if (ether_addr_equal(bypass_netdev->perm_addr, mac)) {
>> > + *ops = rcu_dereference(bypass_master->ops);
>> I don't see how rcu_dereference is ok here.
>> 1) I don't see rcu_read_lock taken
>> 2) Looks like bypass_master->ops has the same value across the whole
>> existence.
>
>We hold rtnl_lock(), i think i need to change this to rtnl_dereference.
>Yes. ops doesn't change.
If it does not change, you can just access it directly.
>
>>
>>
>> > + spin_unlock(&bypass_lock);
>> > + return bypass_netdev;
>> > + }
>> > + }
>> > + spin_unlock(&bypass_lock);
>> > + return NULL;
>> > +}
>> > +
>> > +static int bypass_slave_register(struct net_device *slave_netdev)
>> > +{
>> > + struct net_device *bypass_netdev;
>> > + struct bypass_ops *bypass_ops;
>> > + int ret, orig_mtu;
>> > +
>> > + ASSERT_RTNL();
>> > +
>> > + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> > + &bypass_ops);
>> For master, could you use word "master" in the variables so it is clear?
>> Also, "dev" is fine instead of "netdev".
>> Something like "bpmaster_dev"
>
>bypass_master is of type struct bypass_master, bypass_netdev is of type struct net_device.
I was trying to point out, that "bypass_netdev" represents a "master"
netdev, yet it does not say master. That is why I suggested
"bpmaster_dev"
>I can change all _netdev suffixes to _dev to make the names shorter.
ok.
>
>
>>
>>
>> > + if (!bypass_netdev)
>> > + goto done;
>> > +
>> > + ret = bypass_slave_pre_register(slave_netdev, bypass_netdev,
>> > + bypass_ops);
>> > + if (ret != 0)
>> Just "if (ret)" will do. You have this on more places.
>
>OK.
>
>
>>
>>
>> > + goto done;
>> > +
>> > + ret = netdev_rx_handler_register(slave_netdev,
>> > + bypass_ops ? bypass_ops->handle_frame :
>> > + bypass_handle_frame, bypass_netdev);
>> > + if (ret != 0) {
>> > + netdev_err(slave_netdev, "can not register bypass rx handler (err = %d)\n",
>> > + ret);
>> > + goto done;
>> > + }
>> > +
>> > + ret = netdev_upper_dev_link(slave_netdev, bypass_netdev, NULL);
>> > + if (ret != 0) {
>> > + netdev_err(slave_netdev, "can not set master device %s (err = %d)\n",
>> > + bypass_netdev->name, ret);
>> > + goto upper_link_failed;
>> > + }
>> > +
>> > + slave_netdev->priv_flags |= IFF_BYPASS_SLAVE;
>> > +
>> > + if (netif_running(bypass_netdev)) {
>> > + ret = dev_open(slave_netdev);
>> > + if (ret && (ret != -EBUSY)) {
>> > + netdev_err(bypass_netdev, "Opening slave %s failed ret:%d\n",
>> > + slave_netdev->name, ret);
>> > + goto err_interface_up;
>> > + }
>> > + }
>> > +
>> > + /* Align MTU of slave with master */
>> > + orig_mtu = slave_netdev->mtu;
>> > + ret = dev_set_mtu(slave_netdev, bypass_netdev->mtu);
>> > + if (ret != 0) {
>> > + netdev_err(bypass_netdev, "unable to change mtu of %s to %u register failed\n",
>> > + slave_netdev->name, bypass_netdev->mtu);
>> > + goto err_set_mtu;
>> > + }
>> > +
>> > + ret = bypass_slave_join(slave_netdev, bypass_netdev, bypass_ops);
>> > + if (ret != 0)
>> > + goto err_join;
>> > +
>> > + call_netdevice_notifiers(NETDEV_JOIN, slave_netdev);
>> > +
>> > + netdev_info(bypass_netdev, "bypass slave:%s registered\n",
>> > + slave_netdev->name);
>> > +
>> > + goto done;
>> > +
>> > +err_join:
>> > + dev_set_mtu(slave_netdev, orig_mtu);
>> > +err_set_mtu:
>> > + dev_close(slave_netdev);
>> > +err_interface_up:
>> > + netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
>> > + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
>> > +upper_link_failed:
>> > + netdev_rx_handler_unregister(slave_netdev);
>> > +done:
>> > + return NOTIFY_DONE;
>> > +}
>> > +
>> > +static int bypass_slave_pre_unregister(struct net_device *slave_netdev,
>> > + struct net_device *bypass_netdev,
>> > + struct bypass_ops *bypass_ops)
>> > +{
>> > + struct net_device *backup_netdev, *active_netdev;
>> > + struct bypass_info *bi;
>> > +
>> > + if (bypass_ops) {
>> > + if (!bypass_ops->slave_pre_unregister)
>> > + return -EINVAL;
>> > +
>> > + return bypass_ops->slave_pre_unregister(slave_netdev,
>> > + bypass_netdev);
>> > + }
>> > +
>> > + bi = netdev_priv(bypass_netdev);
>> > + active_netdev = rtnl_dereference(bi->active_netdev);
>> > + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> > +
>> > + if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
>> > + return -EINVAL;
>> > +
>> > + return 0;
>> > +}
>> > +
>> > +static int bypass_slave_release(struct net_device *slave_netdev,
>> > + struct net_device *bypass_netdev,
>> > + struct bypass_ops *bypass_ops)
>> > +{
>> > + struct net_device *backup_netdev, *active_netdev;
>> > + struct bypass_info *bi;
>> > +
>> > + if (bypass_ops) {
>> > + if (!bypass_ops->slave_release)
>> > + return -EINVAL;
>> I think it would be good to make the API to the driver more strict and
>> have a separate set of ops for "active" and "backup" netdevices.
>> That should stop people thinking about extending this to more slaves in
>> the future.
>
>We have checks in slave_pre_register() that allows only 1 'backup' and 1
>'active' slave.
I'm very well aware of that. I just thought that explicit ops for the
two slaves would make this more clear.
>
>
>>
>>
>>
>> > +
>> > + return bypass_ops->slave_release(slave_netdev, bypass_netdev);
>> > + }
>> > +
>> > + bi = netdev_priv(bypass_netdev);
>> > + active_netdev = rtnl_dereference(bi->active_netdev);
>> > + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> > +
>> > + if (slave_netdev == backup_netdev) {
>> > + RCU_INIT_POINTER(bi->backup_netdev, NULL);
>> > + } else {
>> > + RCU_INIT_POINTER(bi->active_netdev, NULL);
>> > + if (backup_netdev) {
>> > + bypass_netdev->min_mtu = backup_netdev->min_mtu;
>> > + bypass_netdev->max_mtu = backup_netdev->max_mtu;
>> > + }
>> > + }
>> > +
>> > + dev_put(slave_netdev);
>> > +
>> > + netdev_info(bypass_netdev, "bypass slave:%s released\n",
>> > + slave_netdev->name);
>> > +
>> > + return 0;
>> > +}
>> > +
>> > +int bypass_slave_unregister(struct net_device *slave_netdev)
>> > +{
>> > + struct net_device *bypass_netdev;
>> > + struct bypass_ops *bypass_ops;
>> > + int ret;
>> > +
>> > + if (!netif_is_bypass_slave(slave_netdev))
>> > + goto done;
>> > +
>> > + ASSERT_RTNL();
>> > +
>> > + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> > + &bypass_ops);
>> > + if (!bypass_netdev)
>> > + goto done;
>> > +
>> > + ret = bypass_slave_pre_unregister(slave_netdev, bypass_netdev,
>> > + bypass_ops);
>> > + if (ret != 0)
>> > + goto done;
>> > +
>> > + netdev_rx_handler_unregister(slave_netdev);
>> > + netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
>> > + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
>> > +
>> > + bypass_slave_release(slave_netdev, bypass_netdev, bypass_ops);
>> > +
>> > + netdev_info(bypass_netdev, "bypass slave:%s unregistered\n",
>> > + slave_netdev->name);
>> > +
>> > +done:
>> > + return NOTIFY_DONE;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_slave_unregister);
>> > +
>> > +static bool bypass_xmit_ready(struct net_device *dev)
>> > +{
>> > + return netif_running(dev) && netif_carrier_ok(dev);
>> > +}
>> > +
>> > +static int bypass_slave_link_change(struct net_device *slave_netdev)
>> > +{
>> > + struct net_device *bypass_netdev, *active_netdev, *backup_netdev;
>> > + struct bypass_ops *bypass_ops;
>> > + struct bypass_info *bi;
>> > +
>> > + if (!netif_is_bypass_slave(slave_netdev))
>> > + goto done;
>> > +
>> > + ASSERT_RTNL();
>> > +
>> > + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> > + &bypass_ops);
>> > + if (!bypass_netdev)
>> > + goto done;
>> > +
>> > + if (bypass_ops) {
>> > + if (!bypass_ops->slave_link_change)
>> > + goto done;
>> > +
>> > + return bypass_ops->slave_link_change(slave_netdev,
>> > + bypass_netdev);
>> > + }
>> > +
>> > + if (!netif_running(bypass_netdev))
>> > + return 0;
>> > +
>> > + bi = netdev_priv(bypass_netdev);
>> > +
>> > + active_netdev = rtnl_dereference(bi->active_netdev);
>> > + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> > +
>> > + if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
>> > + goto done;
>> You don't need this check. "if (!netif_is_bypass_slave(slave_netdev))"
>> above is enough.
>
>I think we need this check to not allow events from a slave that is not
>attached to this master but has the same MAC.
Why do we need such events? Seems wrong to me. Consider:
bp1 bp2
a1 b1 a2 b2
a1 and a2 have the same mac and bp1 and bp2 have the same mac.
Now bypass_master_get_bymac() will return always bp1 or bp2 - depending on
the order of creation.
Let's say it will return bp1. Then when we have event for a2, the
bypass_ops->slave_link_change is called with (a2, bp1). That is wrong.
You cannot use bypass_master_get_bymac() here.
>
>>
>>
>> > +
>> > + if ((active_netdev && bypass_xmit_ready(active_netdev)) ||
>> > + (backup_netdev && bypass_xmit_ready(backup_netdev))) {
>> > + netif_carrier_on(bypass_netdev);
>> > + netif_tx_wake_all_queues(bypass_netdev);
>> > + } else {
>> > + netif_carrier_off(bypass_netdev);
>> > + netif_tx_stop_all_queues(bypass_netdev);
>> > + }
>> > +
>> > +done:
>> > + return NOTIFY_DONE;
>> > +}
>> > +
>> > +static bool bypass_validate_event_dev(struct net_device *dev)
>> > +{
>> > + /* Skip parent events */
>> > + if (netif_is_bypass_master(dev))
>> > + return false;
>> > +
>> > + /* Avoid non-Ethernet type devices */
>> > + if (dev->type != ARPHRD_ETHER)
>> > + return false;
>> > +
>> > + /* Avoid Vlan dev with same MAC registering as VF */
>> > + if (is_vlan_dev(dev))
>> > + return false;
>> > +
>> > + /* Avoid Bonding master dev with same MAC registering as slave dev */
>> > + if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
>> Yeah, this is certainly incorrect. One thing is, you should be using the
>> helpers netif_is_bond_master().
>> But what about the rest? macsec, macvlan, team, bridge, ovs and others?
>>
>> You need to do it not by blacklisting, but with whitelisting. You need
>> to whitelist VF devices. My port flavours patchset might help with this.
>
>May be i can use netdev_has_lower_dev() helper to make sure that the slave
I don't see such function in the code.
>device is not an upper dev.
>Can you point to your port flavours patchset? Is it upstream?
I sent rfc couple of weeks ago:
[patch net-next RFC 00/12] devlink: introduce port flavours and common phys_port_name generation
>
>>
>>
>> > + return false;
>> > +
>> > + return true;
>> > +}
>> > +
>> > +static int
>> > +bypass_event(struct notifier_block *this, unsigned long event, void *ptr)
>> > +{
>> > + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
>> > +
>> > + if (!bypass_validate_event_dev(event_dev))
>> > + return NOTIFY_DONE;
>> > +
>> > + switch (event) {
>> > + case NETDEV_REGISTER:
>> > + return bypass_slave_register(event_dev);
>> > + case NETDEV_UNREGISTER:
>> > + return bypass_slave_unregister(event_dev);
>> > + case NETDEV_UP:
>> > + case NETDEV_DOWN:
>> > + case NETDEV_CHANGE:
>> > + return bypass_slave_link_change(event_dev);
>> > + default:
>> > + return NOTIFY_DONE;
>> > + }
>> > +}
>> > +
>> > +static struct notifier_block bypass_notifier = {
>> > + .notifier_call = bypass_event,
>> > +};
>> > +
>> > +int bypass_open(struct net_device *dev)
>> > +{
>> > + struct bypass_info *bi = netdev_priv(dev);
>> > + struct net_device *active_netdev, *backup_netdev;
>> > + int err;
>> > +
>> > + netif_carrier_off(dev);
>> > + netif_tx_wake_all_queues(dev);
>> > +
>> > + active_netdev = rtnl_dereference(bi->active_netdev);
>> > + if (active_netdev) {
>> > + err = dev_open(active_netdev);
>> > + if (err)
>> > + goto err_active_open;
>> > + }
>> > +
>> > + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> > + if (backup_netdev) {
>> > + err = dev_open(backup_netdev);
>> > + if (err)
>> > + goto err_backup_open;
>> > + }
>> > +
>> > + return 0;
>> > +
>> > +err_backup_open:
>> > + dev_close(active_netdev);
>> > +err_active_open:
>> > + netif_tx_disable(dev);
>> > + return err;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_open);
>> > +
>> > +int bypass_close(struct net_device *dev)
>> > +{
>> > + struct bypass_info *vi = netdev_priv(dev);
>> This should be probably "bi"
>
>Yes.
>
>
>>
>>
>> > + struct net_device *slave_netdev;
>> > +
>> > + netif_tx_disable(dev);
>> > +
>> > + slave_netdev = rtnl_dereference(vi->active_netdev);
>> > + if (slave_netdev)
>> > + dev_close(slave_netdev);
>> > +
>> > + slave_netdev = rtnl_dereference(vi->backup_netdev);
>> > + if (slave_netdev)
>> > + dev_close(slave_netdev);
>> > +
>> > + return 0;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_close);
>> > +
>> > +static netdev_tx_t bypass_drop_xmit(struct sk_buff *skb, struct net_device *dev)
>> > +{
>> > + atomic_long_inc(&dev->tx_dropped);
>> > + dev_kfree_skb_any(skb);
>> > + return NETDEV_TX_OK;
>> > +}
>> > +
>> > +netdev_tx_t bypass_start_xmit(struct sk_buff *skb, struct net_device *dev)
>> > +{
>> > + struct bypass_info *bi = netdev_priv(dev);
>> If you rename the other variable to "bpmaster_dev", it would be nice to
>> rename this to bpinfo or something more descriptive. "bi" is too short
>> to know what that is right away.
>
>Will rename bypass_netdev to bypass_dev. bypass indicates that it is
>an upper master dev.
>
>
>>
>>
>> > + struct net_device *xmit_dev;
>> Don't mix "dev" and "netdev" in one .c file. Just use "dev" for all.
>
>OK.
>
>
>>
>>
>>
>> > +
>> > + /* Try xmit via active netdev followed by backup netdev */
>> > + xmit_dev = rcu_dereference_bh(bi->active_netdev);
>> > + if (!xmit_dev || !bypass_xmit_ready(xmit_dev)) {
>> > + xmit_dev = rcu_dereference_bh(bi->backup_netdev);
>> > + if (!xmit_dev || !bypass_xmit_ready(xmit_dev))
>> > + return bypass_drop_xmit(skb, dev);
>> > + }
>> > +
>> > + skb->dev = xmit_dev;
>> > + skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
>> > +
>> > + return dev_queue_xmit(skb);
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_start_xmit);
>> > +
>> > +u16 bypass_select_queue(struct net_device *dev, struct sk_buff *skb,
>> > + void *accel_priv, select_queue_fallback_t fallback)
>> > +{
>> > + /* This helper function exists to help dev_pick_tx get the correct
>> > + * destination queue. Using a helper function skips a call to
>> > + * skb_tx_hash and will put the skbs in the queue we expect on their
>> > + * way down to the bonding driver.
>> > + */
>> > + u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
>> > +
>> > + /* Save the original txq to restore before passing to the driver */
>> > + qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
>> > +
>> > + if (unlikely(txq >= dev->real_num_tx_queues)) {
>> > + do {
>> > + txq -= dev->real_num_tx_queues;
>> > + } while (txq >= dev->real_num_tx_queues);
>> > + }
>> > +
>> > + return txq;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_select_queue);
>> > +
>> > +/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
>> > + * that some drivers can provide 32bit values only.
>> > + */
>> > +static void bypass_fold_stats(struct rtnl_link_stats64 *_res,
>> > + const struct rtnl_link_stats64 *_new,
>> > + const struct rtnl_link_stats64 *_old)
>> > +{
>> > + const u64 *new = (const u64 *)_new;
>> > + const u64 *old = (const u64 *)_old;
>> > + u64 *res = (u64 *)_res;
>> > + int i;
>> > +
>> > + for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
>> > + u64 nv = new[i];
>> > + u64 ov = old[i];
>> > + s64 delta = nv - ov;
>> > +
>> > + /* detects if this particular field is 32bit only */
>> > + if (((nv | ov) >> 32) == 0)
>> > + delta = (s64)(s32)((u32)nv - (u32)ov);
>> > +
>> > + /* filter anomalies, some drivers reset their stats
>> > + * at down/up events.
>> > + */
>> > + if (delta > 0)
>> > + res[i] += delta;
>> > + }
>> > +}
>> > +
>> > +void bypass_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
>> > +{
>> > + struct bypass_info *bi = netdev_priv(dev);
>> You can WARN_ON and return in case the dev is not bypass master, just
>> to catch buggy drivers. Same with other helpers.
>
>I can make this static and not export this helper as well as all
>bypass_netdev ops.
Ok.
>
>>
>>
>> > + const struct rtnl_link_stats64 *new;
>> > + struct rtnl_link_stats64 temp;
>> > + struct net_device *slave_netdev;
>> > +
>> > + spin_lock(&bi->stats_lock);
>> > + memcpy(stats, &bi->bypass_stats, sizeof(*stats));
>> > +
>> > + rcu_read_lock();
>> > +
>> > + slave_netdev = rcu_dereference(bi->active_netdev);
>> > + if (slave_netdev) {
>> > + new = dev_get_stats(slave_netdev, &temp);
>> > + bypass_fold_stats(stats, new, &bi->active_stats);
>> > + memcpy(&bi->active_stats, new, sizeof(*new));
>> > + }
>> > +
>> > + slave_netdev = rcu_dereference(bi->backup_netdev);
>> > + if (slave_netdev) {
>> > + new = dev_get_stats(slave_netdev, &temp);
>> > + bypass_fold_stats(stats, new, &bi->backup_stats);
>> > + memcpy(&bi->backup_stats, new, sizeof(*new));
>> > + }
>> > +
>> > + rcu_read_unlock();
>> > +
>> > + memcpy(&bi->bypass_stats, stats, sizeof(*stats));
>> > + spin_unlock(&bi->stats_lock);
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_get_stats);
>> > +
>> > +int bypass_change_mtu(struct net_device *dev, int new_mtu)
>> > +{
>> > + struct bypass_info *bi = netdev_priv(dev);
>> > + struct net_device *active_netdev, *backup_netdev;
>> > + int ret = 0;
>> Pointless initialization.
>>
>>
>> > +
>> > + active_netdev = rcu_dereference(bi->active_netdev);
>> > + if (active_netdev) {
>> > + ret = dev_set_mtu(active_netdev, new_mtu);
>> > + if (ret)
>> > + return ret;
>> > + }
>> > +
>> > + backup_netdev = rcu_dereference(bi->backup_netdev);
>> > + if (backup_netdev) {
>> > + ret = dev_set_mtu(backup_netdev, new_mtu);
>> > + if (ret) {
>> > + dev_set_mtu(active_netdev, dev->mtu);
>> > + return ret;
>> > + }
>> > + }
>> > +
>> > + dev->mtu = new_mtu;
>> > + return 0;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_change_mtu);
>> > +
>> > +void bypass_set_rx_mode(struct net_device *dev)
>> > +{
>> > + struct bypass_info *bi = netdev_priv(dev);
>> > + struct net_device *slave_netdev;
>> > +
>> > + rcu_read_lock();
>> > +
>> > + slave_netdev = rcu_dereference(bi->active_netdev);
>> > + if (slave_netdev) {
>> > + dev_uc_sync_multiple(slave_netdev, dev);
>> > + dev_mc_sync_multiple(slave_netdev, dev);
>> > + }
>> > +
>> > + slave_netdev = rcu_dereference(bi->backup_netdev);
>> > + if (slave_netdev) {
>> > + dev_uc_sync_multiple(slave_netdev, dev);
>> > + dev_mc_sync_multiple(slave_netdev, dev);
>> > + }
>> > +
>> > + rcu_read_unlock();
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_set_rx_mode);
>> > +
>> > +static const struct net_device_ops bypass_netdev_ops = {
>> > + .ndo_open = bypass_open,
>> > + .ndo_stop = bypass_close,
>> > + .ndo_start_xmit = bypass_start_xmit,
>> > + .ndo_select_queue = bypass_select_queue,
>> > + .ndo_get_stats64 = bypass_get_stats,
>> > + .ndo_change_mtu = bypass_change_mtu,
>> > + .ndo_set_rx_mode = bypass_set_rx_mode,
>> > + .ndo_validate_addr = eth_validate_addr,
>> > + .ndo_features_check = passthru_features_check,
>> > +};
>> > +
>> > +#define BYPASS_DRV_NAME "bypass"
>> > +#define BYPASS_DRV_VERSION "0.1"
>> > +
>> > +static void bypass_ethtool_get_drvinfo(struct net_device *dev,
>> > + struct ethtool_drvinfo *drvinfo)
>> > +{
>> > + strlcpy(drvinfo->driver, BYPASS_DRV_NAME, sizeof(drvinfo->driver));
>> > + strlcpy(drvinfo->version, BYPASS_DRV_VERSION, sizeof(drvinfo->version));
>> > +}
>> > +
>> > +int bypass_ethtool_get_link_ksettings(struct net_device *dev,
>> > + struct ethtool_link_ksettings *cmd)
>> > +{
>> > + struct bypass_info *bi = netdev_priv(dev);
>> > + struct net_device *slave_netdev;
>> > +
>> > + slave_netdev = rtnl_dereference(bi->active_netdev);
>> > + if (!slave_netdev || !bypass_xmit_ready(slave_netdev)) {
>> > + slave_netdev = rtnl_dereference(bi->backup_netdev);
>> > + if (!slave_netdev || !bypass_xmit_ready(slave_netdev)) {
>> > + cmd->base.duplex = DUPLEX_UNKNOWN;
>> > + cmd->base.port = PORT_OTHER;
>> > + cmd->base.speed = SPEED_UNKNOWN;
>> > +
>> > + return 0;
>> > + }
>> > + }
>> > +
>> > + return __ethtool_get_link_ksettings(slave_netdev, cmd);
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_ethtool_get_link_ksettings);
>> > +
>> > +static const struct ethtool_ops bypass_ethtool_ops = {
>> > + .get_drvinfo = bypass_ethtool_get_drvinfo,
>> > + .get_link = ethtool_op_get_link,
>> > + .get_link_ksettings = bypass_ethtool_get_link_ksettings,
>> > +};
>> > +
>> > +static void bypass_register_existing_slave(struct net_device *bypass_netdev)
>> > +{
>> > + struct net *net = dev_net(bypass_netdev);
>> > + struct net_device *dev;
>> > +
>> > + rtnl_lock();
>> > + for_each_netdev(net, dev) {
>> > + if (dev == bypass_netdev)
>> > + continue;
>> > + if (!bypass_validate_event_dev(dev))
>> > + continue;
>> > + if (ether_addr_equal(bypass_netdev->perm_addr, dev->perm_addr))
>> > + bypass_slave_register(dev);
>> > + }
>> > + rtnl_unlock();
>> > +}
>> > +
>> > +int bypass_master_register(struct net_device *dev, struct bypass_ops *ops,
>> > + struct bypass_master **pbypass_master)
>> > +{
>> > + struct bypass_master *bypass_master;
>> > +
>> > + bypass_master = kzalloc(sizeof(*bypass_master), GFP_KERNEL);
>> > + if (!bypass_master)
>> > + return -ENOMEM;
>> > +
>> > + rcu_assign_pointer(bypass_master->ops, ops);
>> > + dev_hold(dev);
>> > + dev->priv_flags |= IFF_BYPASS;
>> > + rcu_assign_pointer(bypass_master->bypass_netdev, dev);
>> > +
>> > + spin_lock(&bypass_lock);
>> > + list_add_tail(&bypass_master->list, &bypass_master_list);
>> > + spin_unlock(&bypass_lock);
>> > +
>> > + bypass_register_existing_slave(dev);
>> > +
>> > + *pbypass_master = bypass_master;
>> > + return 0;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_master_register);
>> > +
>> > +void bypass_master_unregister(struct bypass_master *bypass_master)
>> > +{
>> > + struct net_device *bypass_netdev;
>> > +
>> > + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
>> > +
>> > + bypass_netdev->priv_flags &= ~IFF_BYPASS;
>> > + dev_put(bypass_netdev);
>> > +
>> > + spin_lock(&bypass_lock);
>> > + list_del(&bypass_master->list);
>> > + spin_unlock(&bypass_lock);
>> > +
>> > + kfree(bypass_master);
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_master_unregister);
>> > +
>> > +int bypass_master_create(struct net_device *backup_netdev,
>> > + struct bypass_master **pbypass_master)
>> > +{
>> > + struct device *dev = backup_netdev->dev.parent;
>> > + struct net_device *bypass_netdev;
>> > + int err;
>> > +
>> > + /* Alloc at least 2 queues, for now we are going with 16 assuming
>> > + * that most devices being bonded won't have too many queues.
>> > + */
>> > + bypass_netdev = alloc_etherdev_mq(sizeof(struct bypass_info), 16);
>> > + if (!bypass_netdev) {
>> > + dev_err(dev, "Unable to allocate bypass_netdev!\n");
>> > + return -ENOMEM;
>> > + }
>> > +
>> > + dev_net_set(bypass_netdev, dev_net(backup_netdev));
>> > + SET_NETDEV_DEV(bypass_netdev, dev);
>> > +
>> > + bypass_netdev->netdev_ops = &bypass_netdev_ops;
>> > + bypass_netdev->ethtool_ops = &bypass_ethtool_ops;
>> > +
>> > + /* Initialize the device options */
>> > + bypass_netdev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
>> > + bypass_netdev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
>> > + IFF_TX_SKB_SHARING);
>> > +
>> > + /* don't acquire bypass netdev's netif_tx_lock when transmitting */
>> > + bypass_netdev->features |= NETIF_F_LLTX;
>> > +
>> > + /* Don't allow bypass devices to change network namespaces. */
>> > + bypass_netdev->features |= NETIF_F_NETNS_LOCAL;
>> > +
>> > + bypass_netdev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
>> > + NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
>> > + NETIF_F_HIGHDMA | NETIF_F_LRO;
>> > +
>> > + bypass_netdev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
>> > + bypass_netdev->features |= bypass_netdev->hw_features;
>> > +
>> > + memcpy(bypass_netdev->dev_addr, backup_netdev->dev_addr,
>> > + bypass_netdev->addr_len);
>> > +
>> > + bypass_netdev->min_mtu = backup_netdev->min_mtu;
>> > + bypass_netdev->max_mtu = backup_netdev->max_mtu;
>> > +
>> > + err = register_netdev(bypass_netdev);
>> > + if (err < 0) {
>> > + dev_err(dev, "Unable to register bypass_netdev!\n");
>> > + goto err_register_netdev;
>> > + }
>> > +
>> > + netif_carrier_off(bypass_netdev);
>> > +
>> > + err = bypass_master_register(bypass_netdev, NULL, pbypass_master);
>> > + if (err < 0)
>> just "if (err)" would do.
>
>OK
>
>>
>>
>> > + goto err_bypass;
>> > +
>> > + return 0;
>> > +
>> > +err_bypass:
>> > + unregister_netdev(bypass_netdev);
>> > +err_register_netdev:
>> > + free_netdev(bypass_netdev);
>> > +
>> > + return err;
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_master_create);
>> > +
>> > +void bypass_master_destroy(struct bypass_master *bypass_master)
>> > +{
>> > + struct net_device *bypass_netdev;
>> > + struct net_device *slave_netdev;
>> > + struct bypass_info *bi;
>> > +
>> > + if (!bypass_master)
>> > + return;
>> > +
>> > + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
>> > + bi = netdev_priv(bypass_netdev);
>> > +
>> > + netif_device_detach(bypass_netdev);
>> > +
>> > + rtnl_lock();
>> > +
>> > + slave_netdev = rtnl_dereference(bi->active_netdev);
>> > + if (slave_netdev)
>> > + bypass_slave_unregister(slave_netdev);
>> > +
>> > + slave_netdev = rtnl_dereference(bi->backup_netdev);
>> > + if (slave_netdev)
>> > + bypass_slave_unregister(slave_netdev);
>> > +
>> > + bypass_master_unregister(bypass_master);
>> > +
>> > + unregister_netdevice(bypass_netdev);
>> > +
>> > + rtnl_unlock();
>> > +
>> > + free_netdev(bypass_netdev);
>> > +}
>> > +EXPORT_SYMBOL_GPL(bypass_master_destroy);
>> > +
>> > +static __init int
>> > +bypass_init(void)
>> > +{
>> > + register_netdevice_notifier(&bypass_notifier);
>> > +
>> > + return 0;
>> > +}
>> > +module_init(bypass_init);
>> > +
>> > +static __exit
>> > +void bypass_exit(void)
>> > +{
>> > + unregister_netdevice_notifier(&bypass_notifier);
>> > +}
>> > +module_exit(bypass_exit);
>> > +
>> > +MODULE_DESCRIPTION("Bypass infrastructure/interface for Paravirtual drivers");
>> > +MODULE_LICENSE("GPL v2");
>> > --
>> > 2.14.3
>> >
>
^ permalink raw reply
* Re: [PATCH net-next 0/5] virtio-net: Add SCTP checksum offload support
From: Michael S. Tsirkin @ 2018-04-18 14:06 UTC (permalink / raw)
To: Vlad Yasevich
Cc: Marcelo Ricardo Leitner, nhorman, netdev, virtualization,
linux-sctp, Vladislav Yasevich
In-Reply-To: <6bc762f6-d6fb-5471-2893-a888cce199f9@redhat.com>
On Tue, Apr 17, 2018 at 04:35:18PM -0400, Vlad Yasevich wrote:
> On 04/02/2018 10:47 AM, Marcelo Ricardo Leitner wrote:
> > On Mon, Apr 02, 2018 at 09:40:01AM -0400, Vladislav Yasevich wrote:
> >> Now that we have SCTP offload capabilities in the kernel, we can add
> >> them to virtio as well. First step is SCTP checksum.
> >
> > Thanks.
> >
> >> As for GSO, the way sctp GSO is currently implemented buys us nothing
> >> in added support to virtio. To add true GSO, would require a lot of
> >> re-work inside of SCTP and would require extensions to the virtio
> >> net header to carry extra sctp data.
> >
> > Can you please elaborate more on this? Is this because SCTP GSO relies
> > on the gso skb format for knowing how to segment it instead of having
> > a list of sizes?
> >
>
> it's mainly because all the true segmentation, placing data into chunks,
> has already happened. All that GSO does is allow for higher bundling
> rate between VMs. If that is all SCTP GSO ever going to do, that fine,
> but the goal is to do real GSO eventually and potentially reduce the
> amount of memory copying we are doing.
> If we do that, any current attempt at GSO in virtio would have to be
> depricated and we'd need GSO2 or something like that.
Batching helps virtualization *a lot* though.
Are there actual plans for GSO2? Is it just for SCTP?
>
> This is why, after doing the GSO support, I decided not to include it.
>
> -vlad
> > Marcelo
> >
^ permalink raw reply
* Re: [PATCH] drm: Print unadorned pointers
From: Greg Kroah-Hartman @ 2018-04-18 15:52 UTC (permalink / raw)
To: Alexey Brodkin
Cc: amd-gfx, Heiko Stübner, Tom Lendacky, David Airlie,
Daniel Vetter, Joonas Lahtinen, dri-devel, Russell King,
Cihangir Akturk, Benjamin Gaignard, Ingo Molnar, linux-kernel,
David (ChunMing) Zhou, Thomas Hellstrom, Sinclair Yeh,
Vincent Abriou, Jerry (Fangzhi) Zuo, Krzysztof Kozlowski,
Maxime Ripard, Chen-Yu Tsai, Sandy Huang, Roger He,
Gustavo Padovan
In-Reply-To: <20180418092450.13798-1-abrodkin@synopsys.com>
On Wed, Apr 18, 2018 at 12:24:50PM +0300, Alexey Brodkin wrote:
> After commit ad67b74 ("printk: hash addresses printed with %p")
> pointers are being hashed when printed. However, this makes
> debug output completely useless. Switch to %px in order to see the
> unadorned kernel pointers.
>
> This was done with the following one-liner:
> find drivers/gpu/drm -type f -name "*.c" -exec sed -r -i '/DRM_DEBUG|KERN_DEBUG|pr_debug/ s/%p\b/%px/g' {} +
>
> Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: Tobin C. Harding <me@tobin.cc>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Benjamin Gaignard <benjamin.gaignard@linaro.org>
> Cc: Chen-Yu Tsai <wens@csie.org>
> Cc: Christian Gmeiner <christian.gmeiner@gmail.com>
> Cc: "Christian König" <christian.koenig@amd.com>
> Cc: Cihangir Akturk <cakturk@gmail.com>
> Cc: CK Hu <ck.hu@mediatek.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Dave Airlie <airlied@redhat.com>
> Cc: David Airlie <airlied@linux.ie>
> Cc: "David (ChunMing) Zhou" <David1.Zhou@amd.com>
> Cc: Gerd Hoffmann <kraxel@redhat.com>
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: Gustavo Padovan <gustavo@padovan.org>
> Cc: Harry Wentland <harry.wentland@amd.com>
> Cc: "Heiko Stübner" <heiko@sntech.de>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Jani Nikula <jani.nikula@linux.intel.com>
> Cc: "Jerry (Fangzhi) Zuo" <Jerry.Zuo@amd.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Krzysztof Kozlowski <krzk@kernel.org>
> Cc: "Leo (Sunpeng) Li" <sunpeng.li@amd.com>
> Cc: Lucas Stach <l.stach@pengutronix.de>
> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Cc: Matthias Brugger <matthias.bgg@gmail.com>
> Cc: Maxime Ripard <maxime.ripard@bootlin.com>
> Cc: "Michel Dänzer" <michel.daenzer@amd.com>
> Cc: Oded Gabbay <oded.gabbay@gmail.com>
> Cc: Philipp Zabel <p.zabel@pengutronix.de>
> Cc: Rob Clark <robdclark@gmail.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: Roger He <Hongbo.He@amd.com>
> Cc: Roman Li <Roman.Li@amd.com>
> Cc: Russell King <linux@armlinux.org.uk>
> Cc: Samuel Li <Samuel.Li@amd.com>
> Cc: Sandy Huang <hjc@rock-chips.com>
> Cc: Sean Paul <seanpaul@chromium.org>
> Cc: Shirish S <shirish.s@amd.com>
> Cc: Sinclair Yeh <syeh@vmware.com>
> Cc: Thomas Hellstrom <thellstrom@vmware.com>
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: Tony Cheng <Tony.Cheng@amd.com>
> Cc: Vincent Abriou <vincent.abriou@st.com>
> Cc: VMware Graphics <linux-graphics-maintainer@vmware.com>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-arm-msm@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: linux-mediatek@lists.infradead.org
> Cc: linux-rockchip@lists.infradead.org
> Cc: etnaviv@lists.freedesktop.org
> Cc: freedreno@lists.freedesktop.org
> Cc: amd-gfx@lists.freedesktop.org
> Cc: intel-gfx@lists.freedesktop.org
> Cc: virtualization@lists.linux-foundation.org
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 14 +++----
> drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 4 +-
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 +-
> drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 10 ++---
> drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 4 +-
> drivers/gpu/drm/amd/amdkfd/kfd_events.c | 4 +-
> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 +-
> drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 18 ++++-----
> drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 14 +++----
> .../amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 2 +-
> drivers/gpu/drm/armada/armada_gem.c | 12 +++---
> drivers/gpu/drm/drm_atomic.c | 44 +++++++++++-----------
> drivers/gpu/drm/drm_bufs.c | 8 ++--
> drivers/gpu/drm/drm_dp_mst_topology.c | 4 +-
> drivers/gpu/drm/drm_lease.c | 6 +--
> drivers/gpu/drm/drm_lock.c | 2 +-
> drivers/gpu/drm/drm_scatter.c | 4 +-
> drivers/gpu/drm/etnaviv/etnaviv_drv.c | 6 +--
> drivers/gpu/drm/i810/i810_dma.c | 2 +-
> drivers/gpu/drm/i915/i915_perf.c | 2 +-
> drivers/gpu/drm/i915/intel_display.c | 2 +-
> drivers/gpu/drm/i915/intel_guc_ct.c | 4 +-
> drivers/gpu/drm/i915/intel_guc_submission.c | 2 +-
> drivers/gpu/drm/i915/intel_uc_fw.c | 2 +-
> drivers/gpu/drm/mediatek/mtk_drm_gem.c | 2 +-
> drivers/gpu/drm/mga/mga_warp.c | 2 +-
> drivers/gpu/drm/msm/msm_drv.c | 4 +-
> drivers/gpu/drm/qxl/qxl_cmd.c | 4 +-
> drivers/gpu/drm/qxl/qxl_fb.c | 2 +-
> drivers/gpu/drm/qxl/qxl_ttm.c | 2 +-
> drivers/gpu/drm/radeon/radeon_display.c | 2 +-
> drivers/gpu/drm/radeon/radeon_dp_mst.c | 12 +++---
> drivers/gpu/drm/radeon/radeon_object.c | 2 +-
> drivers/gpu/drm/rockchip/rockchip_drm_fbdev.c | 2 +-
> drivers/gpu/drm/savage/savage_bci.c | 2 +-
> drivers/gpu/drm/sti/sti_gdp.c | 4 +-
> drivers/gpu/drm/sti/sti_mixer.c | 2 +-
> drivers/gpu/drm/sun4i/sun4i_crtc.c | 4 +-
> drivers/gpu/drm/ttm/ttm_page_alloc.c | 2 +-
> drivers/gpu/drm/udl/udl_fb.c | 2 +-
> drivers/gpu/drm/via/via_dma.c | 4 +-
> drivers/gpu/drm/via/via_irq.c | 2 +-
> drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c | 2 +-
> 45 files changed, 120 insertions(+), 120 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 1d6e1479da38..32e85fe83152 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -438,7 +438,7 @@ static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem,
> if (!bo_va_entry)
> return -ENOMEM;
>
> - pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,
> + pr_debug("\t add VA 0x%llx - 0x%llx to vm %px\n", va,
> va + bo_size, vm);
>
> /* Add BO to VM internal data structures*/
<snip>
As others have pointed out, please do not do this. Use %pK as needed.
And perhaps revisit some of the "need" for these pointers at all in the
first place...
thanks,
greg k-h
^ permalink raw reply
* Re: [RFC] virtio: Use DMA MAP API for devices without an IOMMU
From: Michael S. Tsirkin @ 2018-04-18 16:20 UTC (permalink / raw)
To: Anshuman Khandual
Cc: robh, Benjamin Herrenschmidt, linux-kernel, virtualization,
Christoph Hellwig, joe, linuxppc-dev, elfring, david
In-Reply-To: <002f153f-452d-f64b-4fc7-7f3383b39162@linux.vnet.ibm.com>
On Wed, Apr 18, 2018 at 08:47:10AM +0530, Anshuman Khandual wrote:
> On 04/15/2018 05:41 PM, Christoph Hellwig wrote:
> > On Fri, Apr 06, 2018 at 06:37:18PM +1000, Benjamin Herrenschmidt wrote:
> >>>> implemented as DMA API which the virtio core understands. There is no
> >>>> need for an IOMMU to be involved for the device representation in this
> >>>> case IMHO.
> >>>
> >>> This whole virtio translation issue is a mess. I think we need to
> >>> switch it to the dma API, and then quirk the legacy case to always
> >>> use the direct mapping inside the dma API.
> >>
> >> Fine with using a dma API always on the Linux side, but we do want to
> >> special case virtio still at the arch and qemu side to have a "direct
> >> mapping" mode. Not sure how (special flags on PCI devices) to avoid
> >> actually going through an emulated IOMMU on the qemu side, because that
> >> slows things down, esp. with vhost.
> >>
> >> IE, we can't I think just treat it the same as a physical device.
> >
> > We should have treated it like a physical device from the start, but
> > that device has unfortunately sailed.
> >
> > But yes, we'll need a per-device quirk that says 'don't attach an
> > iommu'.
>
> How about doing it per platform basis as suggested in this RFC through
> an arch specific callback. Because all the virtio devices in the given
> platform would require and exercise this option (to avail bounce buffer
> mechanism for secure guests as an example). So the flag basically is a
> platform specific one not a device specific one.
That's not the case. A single platform can have a mix of virtio and
non-virtio devices. Same applies even within virtio, e.g. the balloon
device always bypasses an iommu. Further, QEMU supports out of process
devices some of which might bypass the IOMMU.
--
MST
^ permalink raw reply
* Re: [PATCH] net: don't use kvzalloc for DMA memory
From: Mikulas Patocka @ 2018-04-18 16:44 UTC (permalink / raw)
To: Eric Dumazet
Cc: Michael S. Tsirkin, netdev, linux-kernel, virtualization,
Eric Dumazet, Joby Poriyath, Ben Hutchings, David S. Miller
In-Reply-To: <3e65977e-53cd-bf09-bc4b-0ce40e9091fe@gmail.com>
On Wed, 18 Apr 2018, Eric Dumazet wrote:
>
>
> On 04/18/2018 07:34 AM, Mikulas Patocka wrote:
> > The patch 74d332c13b21 changes alloc_netdev_mqs to use vzalloc if kzalloc
> > fails (later patches change it to kvzalloc).
> >
> > The problem with this is that if the vzalloc function is actually used,
> > virtio_net doesn't work (because it expects that the extra memory should
> > be accessible with DMA-API and memory allocated with vzalloc isn't).
> >
> > This patch changes it back to kzalloc and adds a warning if the allocated
> > size is too large (the allocation is unreliable in this case).
> >
> > Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> > Fixes: 74d332c13b21 ("net: extend net_device allocation to vmalloc()")
> >
> > ---
> > net/core/dev.c | 3 ++-
> > 1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > Index: linux-2.6/net/core/dev.c
> > ===================================================================
> > --- linux-2.6.orig/net/core/dev.c 2018-04-16 21:08:36.000000000 +0200
> > +++ linux-2.6/net/core/dev.c 2018-04-18 16:24:43.000000000 +0200
> > @@ -8366,7 +8366,8 @@ struct net_device *alloc_netdev_mqs(int
> > /* ensure 32-byte alignment of whole construct */
> > alloc_size += NETDEV_ALIGN - 1;
> >
> > - p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
> > + WARN_ON(alloc_size > PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
> > + p = kzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
> > if (!p)
> > return NULL;
> >
> >
>
> Since when a net_device needs to be in DMA zone ???
>
> I would rather fix virtio_net, this looks very suspect to me.
>
> Each virtio_net should probably allocate the exact amount of DMA-memory it wants,
> instead of expecting core networking stack to have a huge chunk of DMA-memory for everything.
The structure net_device is followed by arbitrary driver-specific data
(accessible with the function netdev_priv). And for virtio-net, these
driver-specific data must be in DMA memory.
Mikulas
^ permalink raw reply
* Re: [PATCH] net: don't use kvzalloc for DMA memory
From: Eric Dumazet @ 2018-04-18 16:51 UTC (permalink / raw)
To: Mikulas Patocka, Eric Dumazet
Cc: Michael S. Tsirkin, netdev, linux-kernel, virtualization,
Eric Dumazet, Joby Poriyath, Ben Hutchings, David S. Miller
In-Reply-To: <alpine.LRH.2.02.1804181218270.19136@file01.intranet.prod.int.rdu2.redhat.com>
On 04/18/2018 09:44 AM, Mikulas Patocka wrote:
>
>
> On Wed, 18 Apr 2018, Eric Dumazet wrote:
>
>>
>>
>> On 04/18/2018 07:34 AM, Mikulas Patocka wrote:
>>> The patch 74d332c13b21 changes alloc_netdev_mqs to use vzalloc if kzalloc
>>> fails (later patches change it to kvzalloc).
>>>
>>> The problem with this is that if the vzalloc function is actually used,
>>> virtio_net doesn't work (because it expects that the extra memory should
>>> be accessible with DMA-API and memory allocated with vzalloc isn't).
>>>
>>> This patch changes it back to kzalloc and adds a warning if the allocated
>>> size is too large (the allocation is unreliable in this case).
>>>
>>> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
>>> Fixes: 74d332c13b21 ("net: extend net_device allocation to vmalloc()")
>>>
>>> ---
>>> net/core/dev.c | 3 ++-
>>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>>
>>> Index: linux-2.6/net/core/dev.c
>>> ===================================================================
>>> --- linux-2.6.orig/net/core/dev.c 2018-04-16 21:08:36.000000000 +0200
>>> +++ linux-2.6/net/core/dev.c 2018-04-18 16:24:43.000000000 +0200
>>> @@ -8366,7 +8366,8 @@ struct net_device *alloc_netdev_mqs(int
>>> /* ensure 32-byte alignment of whole construct */
>>> alloc_size += NETDEV_ALIGN - 1;
>>>
>>> - p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
>>> + WARN_ON(alloc_size > PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
>>> + p = kzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
>>> if (!p)
>>> return NULL;
>>>
>>>
>>
>> Since when a net_device needs to be in DMA zone ???
>>
>> I would rather fix virtio_net, this looks very suspect to me.
>>
>> Each virtio_net should probably allocate the exact amount of DMA-memory it wants,
>> instead of expecting core networking stack to have a huge chunk of DMA-memory for everything.
>
> The structure net_device is followed by arbitrary driver-specific data
> (accessible with the function netdev_priv). And for virtio-net, these
> driver-specific data must be in DMA memory.
I get that, but how is the original xenvif problem will be solved ?
Your patch would add a bug in some other driver(s)
I suggest that virtio_net clearly identifies which part needs a specific allocation
and does its itself, instead of abusing the netdev_priv storage.
Ie use a pointer to a block of memory, allocated by virtio_net, for virtio_net.
^ permalink raw reply
* Re: [PATCH] net: don't use kvzalloc for DMA memory
From: Eric Dumazet @ 2018-04-18 16:51 UTC (permalink / raw)
To: Mikulas Patocka, Eric Dumazet
Cc: Michael S. Tsirkin, netdev, linux-kernel, virtualization,
Eric Dumazet, Joby Poriyath, Ben Hutchings, David S. Miller
In-Reply-To: <alpine.LRH.2.02.1804181218270.19136@file01.intranet.prod.int.rdu2.redhat.com>
On 04/18/2018 09:44 AM, Mikulas Patocka wrote:
>
>
> On Wed, 18 Apr 2018, Eric Dumazet wrote:
>
>>
>>
>> On 04/18/2018 07:34 AM, Mikulas Patocka wrote:
>>> The patch 74d332c13b21 changes alloc_netdev_mqs to use vzalloc if kzalloc
>>> fails (later patches change it to kvzalloc).
>>>
>>> The problem with this is that if the vzalloc function is actually used,
>>> virtio_net doesn't work (because it expects that the extra memory should
>>> be accessible with DMA-API and memory allocated with vzalloc isn't).
>>>
>>> This patch changes it back to kzalloc and adds a warning if the allocated
>>> size is too large (the allocation is unreliable in this case).
>>>
>>> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
>>> Fixes: 74d332c13b21 ("net: extend net_device allocation to vmalloc()")
>>>
>>> ---
>>> net/core/dev.c | 3 ++-
>>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>>
>>> Index: linux-2.6/net/core/dev.c
>>> ===================================================================
>>> --- linux-2.6.orig/net/core/dev.c 2018-04-16 21:08:36.000000000 +0200
>>> +++ linux-2.6/net/core/dev.c 2018-04-18 16:24:43.000000000 +0200
>>> @@ -8366,7 +8366,8 @@ struct net_device *alloc_netdev_mqs(int
>>> /* ensure 32-byte alignment of whole construct */
>>> alloc_size += NETDEV_ALIGN - 1;
>>>
>>> - p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
>>> + WARN_ON(alloc_size > PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
>>> + p = kzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
>>> if (!p)
>>> return NULL;
>>>
>>>
>>
>> Since when a net_device needs to be in DMA zone ???
>>
>> I would rather fix virtio_net, this looks very suspect to me.
>>
>> Each virtio_net should probably allocate the exact amount of DMA-memory it wants,
>> instead of expecting core networking stack to have a huge chunk of DMA-memory for everything.
>
> The structure net_device is followed by arbitrary driver-specific data
> (accessible with the function netdev_priv). And for virtio-net, these
> driver-specific data must be in DMA memory.
I get that, but how is the original xenvif problem will be solved ?
Your patch would add a bug in some other driver(s)
I suggest that virtio_net clearly identifies which part needs a specific allocation
and does its itself, instead of abusing the netdev_priv storage.
Ie use a pointer to a block of memory, allocated by virtio_net, for virtio_net.
^ permalink raw reply
* Re: [PATCH] net: don't use kvzalloc for DMA memory
From: David Miller @ 2018-04-18 17:46 UTC (permalink / raw)
To: mpatocka
Cc: eric.dumazet, mst, netdev, linux-kernel, virtualization, edumazet,
joby.poriyath, bhutchings
In-Reply-To: <alpine.LRH.2.02.1804181218270.19136@file01.intranet.prod.int.rdu2.redhat.com>
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Wed, 18 Apr 2018 12:44:25 -0400 (EDT)
> The structure net_device is followed by arbitrary driver-specific data
> (accessible with the function netdev_priv). And for virtio-net, these
> driver-specific data must be in DMA memory.
And we are saying that this assumption is wrong and needs to be
corrected.
^ permalink raw reply
* Re: [PATCH] net: don't use kvzalloc for DMA memory
From: David Miller @ 2018-04-18 17:47 UTC (permalink / raw)
To: eric.dumazet
Cc: mst, netdev, linux-kernel, virtualization, edumazet, mpatocka,
joby.poriyath, bhutchings
In-Reply-To: <5f4e1286-b79f-0b9f-9a30-47d7654f3889@gmail.com>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 18 Apr 2018 09:51:25 -0700
> I suggest that virtio_net clearly identifies which part needs a specific allocation
> and does its itself, instead of abusing the netdev_priv storage.
>
> Ie use a pointer to a block of memory, allocated by virtio_net, for virtio_net.
+1
^ permalink raw reply
* Re: [PATCH] net: don't use kvzalloc for DMA memory
From: Mikulas Patocka @ 2018-04-18 17:49 UTC (permalink / raw)
To: Eric Dumazet
Cc: Michael S. Tsirkin, netdev, linux-kernel, virtualization,
Eric Dumazet, Ben Hutchings, David S. Miller
In-Reply-To: <5f4e1286-b79f-0b9f-9a30-47d7654f3889@gmail.com>
On Wed, 18 Apr 2018, Eric Dumazet wrote:
>
>
> On 04/18/2018 09:44 AM, Mikulas Patocka wrote:
> >
> >
> > On Wed, 18 Apr 2018, Eric Dumazet wrote:
> >
> >>
> >>
> >> On 04/18/2018 07:34 AM, Mikulas Patocka wrote:
> >>> The patch 74d332c13b21 changes alloc_netdev_mqs to use vzalloc if kzalloc
> >>> fails (later patches change it to kvzalloc).
> >>>
> >>> The problem with this is that if the vzalloc function is actually used,
> >>> virtio_net doesn't work (because it expects that the extra memory should
> >>> be accessible with DMA-API and memory allocated with vzalloc isn't).
> >>>
> >>> This patch changes it back to kzalloc and adds a warning if the allocated
> >>> size is too large (the allocation is unreliable in this case).
> >>>
> >>> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> >>> Fixes: 74d332c13b21 ("net: extend net_device allocation to vmalloc()")
> >>>
> >>> ---
> >>> net/core/dev.c | 3 ++-
> >>> 1 file changed, 2 insertions(+), 1 deletion(-)
> >>>
> >>> Index: linux-2.6/net/core/dev.c
> >>> ===================================================================
> >>> --- linux-2.6.orig/net/core/dev.c 2018-04-16 21:08:36.000000000 +0200
> >>> +++ linux-2.6/net/core/dev.c 2018-04-18 16:24:43.000000000 +0200
> >>> @@ -8366,7 +8366,8 @@ struct net_device *alloc_netdev_mqs(int
> >>> /* ensure 32-byte alignment of whole construct */
> >>> alloc_size += NETDEV_ALIGN - 1;
> >>>
> >>> - p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
> >>> + WARN_ON(alloc_size > PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
> >>> + p = kzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
> >>> if (!p)
> >>> return NULL;
> >>>
> >>>
> >>
> >> Since when a net_device needs to be in DMA zone ???
> >>
> >> I would rather fix virtio_net, this looks very suspect to me.
> >>
> >> Each virtio_net should probably allocate the exact amount of DMA-memory it wants,
> >> instead of expecting core networking stack to have a huge chunk of DMA-memory for everything.
> >
> > The structure net_device is followed by arbitrary driver-specific data
> > (accessible with the function netdev_priv). And for virtio-net, these
> > driver-specific data must be in DMA memory.
>
> I get that, but how is the original xenvif problem will be solved ?
>
> Your patch would add a bug in some other driver(s)
>
> I suggest that virtio_net clearly identifies which part needs a specific allocation
> and does its itself, instead of abusing the netdev_priv storage.
>
> Ie use a pointer to a block of memory, allocated by virtio_net, for virtio_net.
There are drivers that need to do DMA to driver-specific area.
And there are drivers that need driver-specific area larger than kmalloc
limit.
These are conflicting requirements and one of those drivers must be
changed.
I suggest to change the drivers that need large driver-specific area.
That's why I added the WARN_ON, so that they can be identified.
Mikulas
^ permalink raw reply
* Re: [PATCH] net: don't use kvzalloc for DMA memory
From: Mikulas Patocka @ 2018-04-18 17:53 UTC (permalink / raw)
To: David Miller
Cc: eric.dumazet, mst, netdev, linux-kernel, virtualization, edumazet,
bhutchings
In-Reply-To: <20180418.134651.2225112489265654270.davem@davemloft.net>
On Wed, 18 Apr 2018, David Miller wrote:
> From: Mikulas Patocka <mpatocka@redhat.com>
> Date: Wed, 18 Apr 2018 12:44:25 -0400 (EDT)
>
> > The structure net_device is followed by arbitrary driver-specific data
> > (accessible with the function netdev_priv). And for virtio-net, these
> > driver-specific data must be in DMA memory.
>
> And we are saying that this assumption is wrong and needs to be
> corrected.
So, try to find all the networking drivers that to DMA to the private
area.
The problem here is that kvzalloc usually returns DMA-able area, but it
may return non-DMA area rarely, if the memory is too fragmented. So, we
are in a situation, where some networking drivers will randomly fail. Go
and find them.
Mikulas
^ permalink raw reply
* Re: [PATCH] net: don't use kvzalloc for DMA memory
From: Mikulas Patocka @ 2018-04-18 17:55 UTC (permalink / raw)
To: David Miller
Cc: eric.dumazet, mst, netdev, linux-kernel, virtualization, edumazet,
joby.poriyath, bhutchings
In-Reply-To: <20180418.134721.283381724211219025.davem@davemloft.net>
On Wed, 18 Apr 2018, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 18 Apr 2018 09:51:25 -0700
>
> > I suggest that virtio_net clearly identifies which part needs a specific allocation
> > and does its itself, instead of abusing the netdev_priv storage.
> >
> > Ie use a pointer to a block of memory, allocated by virtio_net, for virtio_net.
>
> +1
And what about the other networking drivers that may do the same thing?
Mikulas
^ permalink raw reply
* Re: [PATCH] net: don't use kvzalloc for DMA memory
From: Michael S. Tsirkin @ 2018-04-18 18:00 UTC (permalink / raw)
To: David Miller
Cc: eric.dumazet, netdev, linux-kernel, virtualization, edumazet,
mpatocka, joby.poriyath, bhutchings
In-Reply-To: <20180418.134721.283381724211219025.davem@davemloft.net>
On Wed, Apr 18, 2018 at 01:47:21PM -0400, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 18 Apr 2018 09:51:25 -0700
>
> > I suggest that virtio_net clearly identifies which part needs a specific allocation
> > and does its itself, instead of abusing the netdev_priv storage.
> >
> > Ie use a pointer to a block of memory, allocated by virtio_net, for virtio_net.
>
> +1
I can do this, but just FYI it's all of 16 bytes which is only mapped
for DMA temporarily - and not all of it - a byte here, a byte there.
The amount of hoops one has to jump through just to get 1 byte from
device nowdays is annoying.
--
MST
^ permalink raw reply
* Re: [RFC PATCH net-next v6 2/4] net: Introduce generic bypass module
From: Samudrala, Sridhar @ 2018-04-18 18:43 UTC (permalink / raw)
To: Jiri Pirko
Cc: alexander.h.duyck, virtio-dev, mst, kubakici, netdev,
virtualization, loseweigh, davem
In-Reply-To: <20180418092515.GB1989@nanopsycho>
On 4/18/2018 2:25 AM, Jiri Pirko wrote:
> Wed, Apr 11, 2018 at 09:13:52PM CEST, sridhar.samudrala@intel.com wrote:
>> On 4/11/2018 8:51 AM, Jiri Pirko wrote:
>>> Tue, Apr 10, 2018 at 08:59:48PM CEST, sridhar.samudrala@intel.com wrote:
>>>> This provides a generic interface for paravirtual drivers to listen
>>>> for netdev register/unregister/link change events from pci ethernet
>>>> devices with the same MAC and takeover their datapath. The notifier and
>>>> event handling code is based on the existing netvsc implementation.
>>>>
>>>> It exposes 2 sets of interfaces to the paravirtual drivers.
>>>> 1. existing netvsc driver that uses 2 netdev model. In this model, no
>>>> master netdev is created. The paravirtual driver registers each bypass
>>>> instance along with a set of ops to manage the slave events.
>>>> bypass_master_register()
>>>> bypass_master_unregister()
>>>> 2. new virtio_net based solution that uses 3 netdev model. In this model,
>>>> the bypass module provides interfaces to create/destroy additional master
>>>> netdev and all the slave events are managed internally.
>>>> bypass_master_create()
>>>> bypass_master_destroy()
>>>>
>>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>>> ---
>>>> include/linux/netdevice.h | 14 +
>>>> include/net/bypass.h | 96 ++++++
>>>> net/Kconfig | 18 +
>>>> net/core/Makefile | 1 +
>>>> net/core/bypass.c | 844 ++++++++++++++++++++++++++++++++++++++++++++++
>>>> 5 files changed, 973 insertions(+)
>>>> create mode 100644 include/net/bypass.h
>>>> create mode 100644 net/core/bypass.c
>>>>
>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>> index cf44503ea81a..587293728f70 100644
>>>> --- a/include/linux/netdevice.h
>>>> +++ b/include/linux/netdevice.h
>>>> @@ -1430,6 +1430,8 @@ enum netdev_priv_flags {
>>>> IFF_PHONY_HEADROOM = 1<<24,
>>>> IFF_MACSEC = 1<<25,
>>>> IFF_NO_RX_HANDLER = 1<<26,
>>>> + IFF_BYPASS = 1 << 27,
>>>> + IFF_BYPASS_SLAVE = 1 << 28,
>>> I wonder, why you don't follow the existing coding style... Also, please
>>> add these to into the comment above.
>> To avoid checkpatch warnings. If it is OK to ignore these warnings, I can switch back
>> to the existing coding style to be consistent.
> Please do.
>
>
>>>
>>>> };
>>>>
>>>> #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
>>>> @@ -1458,6 +1460,8 @@ enum netdev_priv_flags {
>>>> #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
>>>> #define IFF_MACSEC IFF_MACSEC
>>>> #define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
>>>> +#define IFF_BYPASS IFF_BYPASS
>>>> +#define IFF_BYPASS_SLAVE IFF_BYPASS_SLAVE
>>>>
>>>> /**
>>>> * struct net_device - The DEVICE structure.
>>>> @@ -4308,6 +4312,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev)
>>>> return dev->priv_flags & IFF_RXFH_CONFIGURED;
>>>> }
>>>>
>>>> +static inline bool netif_is_bypass_master(const struct net_device *dev)
>>>> +{
>>>> + return dev->priv_flags & IFF_BYPASS;
>>>> +}
>>>> +
>>>> +static inline bool netif_is_bypass_slave(const struct net_device *dev)
>>>> +{
>>>> + return dev->priv_flags & IFF_BYPASS_SLAVE;
>>>> +}
>>>> +
>>>> /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
>>>> static inline void netif_keep_dst(struct net_device *dev)
>>>> {
>>>> diff --git a/include/net/bypass.h b/include/net/bypass.h
>>>> new file mode 100644
>>>> index 000000000000..86b02cb894cf
>>>> --- /dev/null
>>>> +++ b/include/net/bypass.h
>>>> @@ -0,0 +1,96 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +/* Copyright (c) 2018, Intel Corporation. */
>>>> +
>>>> +#ifndef _NET_BYPASS_H
>>>> +#define _NET_BYPASS_H
>>>> +
>>>> +#include <linux/netdevice.h>
>>>> +
>>>> +struct bypass_ops {
>>>> + int (*slave_pre_register)(struct net_device *slave_netdev,
>>>> + struct net_device *bypass_netdev);
>>>> + int (*slave_join)(struct net_device *slave_netdev,
>>>> + struct net_device *bypass_netdev);
>>>> + int (*slave_pre_unregister)(struct net_device *slave_netdev,
>>>> + struct net_device *bypass_netdev);
>>>> + int (*slave_release)(struct net_device *slave_netdev,
>>>> + struct net_device *bypass_netdev);
>>>> + int (*slave_link_change)(struct net_device *slave_netdev,
>>>> + struct net_device *bypass_netdev);
>>>> + rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
>>>> +};
>>>> +
>>>> +struct bypass_master {
>>>> + struct list_head list;
>>>> + struct net_device __rcu *bypass_netdev;
>>>> + struct bypass_ops __rcu *ops;
>>>> +};
>>>> +
>>>> +/* bypass state */
>>>> +struct bypass_info {
>>>> + /* passthru netdev with same MAC */
>>>> + struct net_device __rcu *active_netdev;
>>> You still use "active"/"backup" names which is highly misleading as
>>> it has completely different meaning that in bond for example.
>>> I noted that in my previous review already. Please change it.
>> I guess the issue is with only the 'active' name. 'backup' should be fine as it also
>> matches with the BACKUP feature bit we are adding to virtio_net.
> I think that "backup" is also misleading. Both "active" and "backup"
> mean a *state* of slaves. This should be named differently.
>
>
>
>> With regards to alternate names for 'active', you suggested 'stolen', but i
>> am not too happy with it.
>> netvsc uses vf_netdev, are you OK with this? Or another option is 'passthru'
> No. The netdev could be any netdevice. It does not have to be a "VF".
> I think "stolen" is quite appropriate since it describes the modus
> operandi. The bypass master steals some netdevice according to some
> match.
>
> But I don't insist on "stolen". Just sounds right.
We are adding VIRTIO_NET_F_BACKUP as a new feature bit to enable this feature, So i think
'backup' name is consistent.
The intent is to restrict the 'active' netdev to be a VF. If there is a way to check that
a PCI device is a VF in the guest kernel, we could restrict 'active' netdev to be a VF.
Will look for any suggestions in the next day or two. If i don't get any, i will go
with 'stolen'
<snip>
> +
> +static struct net_device *bypass_master_get_bymac(u8 *mac,
> + struct bypass_ops **ops)
> +{
> + struct bypass_master *bypass_master;
> + struct net_device *bypass_netdev;
> +
> + spin_lock(&bypass_lock);
> + list_for_each_entry(bypass_master, &bypass_master_list, list) {
>>> As I wrote the last time, you don't need this list, spinlock.
>>> You can do just something like:
>>> for_each_net(net) {
>>> for_each_netdev(net, dev) {
>>> if (netif_is_bypass_master(dev)) {
>> This function returns the upper netdev as well as the ops associated
>> with that netdev.
>> bypass_master_list is a list of 'struct bypass_master' that associates
> Well, can't you have it in netdev priv?
We cannot do this for 2-netdev model as there is no bypass_netdev created.
>
>
>> 'bypass_netdev' with 'bypass_ops' and gets added via bypass_master_register().
>> We need 'ops' only to support the 2 netdev model of netvsc. ops will be
>> NULL for 3-netdev model.
> I see :(
>
>
>>
>>>
>>>
>>>
>>>> + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
>>>> + if (ether_addr_equal(bypass_netdev->perm_addr, mac)) {
>>>> + *ops = rcu_dereference(bypass_master->ops);
>>> I don't see how rcu_dereference is ok here.
>>> 1) I don't see rcu_read_lock taken
>>> 2) Looks like bypass_master->ops has the same value across the whole
>>> existence.
>> We hold rtnl_lock(), i think i need to change this to rtnl_dereference.
>> Yes. ops doesn't change.
> If it does not change, you can just access it directly.
>
>
>>>
>>>> + spin_unlock(&bypass_lock);
>>>> + return bypass_netdev;
>>>> + }
>>>> + }
>>>> + spin_unlock(&bypass_lock);
>>>> + return NULL;
>>>> +}
>>>> +
>>>> +static int bypass_slave_register(struct net_device *slave_netdev)
>>>> +{
>>>> + struct net_device *bypass_netdev;
>>>> + struct bypass_ops *bypass_ops;
>>>> + int ret, orig_mtu;
>>>> +
>>>> + ASSERT_RTNL();
>>>> +
>>>> + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>>>> + &bypass_ops);
>>> For master, could you use word "master" in the variables so it is clear?
>>> Also, "dev" is fine instead of "netdev".
>>> Something like "bpmaster_dev"
>> bypass_master is of type struct bypass_master, bypass_netdev is of type struct net_device.
> I was trying to point out, that "bypass_netdev" represents a "master"
> netdev, yet it does not say master. That is why I suggested
> "bpmaster_dev"
>
>
>> I can change all _netdev suffixes to _dev to make the names shorter.
> ok.
>
>
>>
>>>
>>>> + if (!bypass_netdev)
>>>> + goto done;
>>>> +
>>>> + ret = bypass_slave_pre_register(slave_netdev, bypass_netdev,
>>>> + bypass_ops);
>>>> + if (ret != 0)
>>> Just "if (ret)" will do. You have this on more places.
>> OK.
>>
>>
>>>
>>>> + goto done;
>>>> +
>>>> + ret = netdev_rx_handler_register(slave_netdev,
>>>> + bypass_ops ? bypass_ops->handle_frame :
>>>> + bypass_handle_frame, bypass_netdev);
>>>> + if (ret != 0) {
>>>> + netdev_err(slave_netdev, "can not register bypass rx handler (err = %d)\n",
>>>> + ret);
>>>> + goto done;
>>>> + }
>>>> +
>>>> + ret = netdev_upper_dev_link(slave_netdev, bypass_netdev, NULL);
>>>> + if (ret != 0) {
>>>> + netdev_err(slave_netdev, "can not set master device %s (err = %d)\n",
>>>> + bypass_netdev->name, ret);
>>>> + goto upper_link_failed;
>>>> + }
>>>> +
>>>> + slave_netdev->priv_flags |= IFF_BYPASS_SLAVE;
>>>> +
>>>> + if (netif_running(bypass_netdev)) {
>>>> + ret = dev_open(slave_netdev);
>>>> + if (ret && (ret != -EBUSY)) {
>>>> + netdev_err(bypass_netdev, "Opening slave %s failed ret:%d\n",
>>>> + slave_netdev->name, ret);
>>>> + goto err_interface_up;
>>>> + }
>>>> + }
>>>> +
>>>> + /* Align MTU of slave with master */
>>>> + orig_mtu = slave_netdev->mtu;
>>>> + ret = dev_set_mtu(slave_netdev, bypass_netdev->mtu);
>>>> + if (ret != 0) {
>>>> + netdev_err(bypass_netdev, "unable to change mtu of %s to %u register failed\n",
>>>> + slave_netdev->name, bypass_netdev->mtu);
>>>> + goto err_set_mtu;
>>>> + }
>>>> +
>>>> + ret = bypass_slave_join(slave_netdev, bypass_netdev, bypass_ops);
>>>> + if (ret != 0)
>>>> + goto err_join;
>>>> +
>>>> + call_netdevice_notifiers(NETDEV_JOIN, slave_netdev);
>>>> +
>>>> + netdev_info(bypass_netdev, "bypass slave:%s registered\n",
>>>> + slave_netdev->name);
>>>> +
>>>> + goto done;
>>>> +
>>>> +err_join:
>>>> + dev_set_mtu(slave_netdev, orig_mtu);
>>>> +err_set_mtu:
>>>> + dev_close(slave_netdev);
>>>> +err_interface_up:
>>>> + netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
>>>> + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
>>>> +upper_link_failed:
>>>> + netdev_rx_handler_unregister(slave_netdev);
>>>> +done:
>>>> + return NOTIFY_DONE;
>>>> +}
>>>> +
>>>> +static int bypass_slave_pre_unregister(struct net_device *slave_netdev,
>>>> + struct net_device *bypass_netdev,
>>>> + struct bypass_ops *bypass_ops)
>>>> +{
>>>> + struct net_device *backup_netdev, *active_netdev;
>>>> + struct bypass_info *bi;
>>>> +
>>>> + if (bypass_ops) {
>>>> + if (!bypass_ops->slave_pre_unregister)
>>>> + return -EINVAL;
>>>> +
>>>> + return bypass_ops->slave_pre_unregister(slave_netdev,
>>>> + bypass_netdev);
>>>> + }
>>>> +
>>>> + bi = netdev_priv(bypass_netdev);
>>>> + active_netdev = rtnl_dereference(bi->active_netdev);
>>>> + backup_netdev = rtnl_dereference(bi->backup_netdev);
>>>> +
>>>> + if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
>>>> + return -EINVAL;
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static int bypass_slave_release(struct net_device *slave_netdev,
>>>> + struct net_device *bypass_netdev,
>>>> + struct bypass_ops *bypass_ops)
>>>> +{
>>>> + struct net_device *backup_netdev, *active_netdev;
>>>> + struct bypass_info *bi;
>>>> +
>>>> + if (bypass_ops) {
>>>> + if (!bypass_ops->slave_release)
>>>> + return -EINVAL;
>>> I think it would be good to make the API to the driver more strict and
>>> have a separate set of ops for "active" and "backup" netdevices.
>>> That should stop people thinking about extending this to more slaves in
>>> the future.
>> We have checks in slave_pre_register() that allows only 1 'backup' and 1
>> 'active' slave.
> I'm very well aware of that. I just thought that explicit ops for the
> two slaves would make this more clear.
>
>
>>
>>>
>>>
>>>> +
>>>> + return bypass_ops->slave_release(slave_netdev, bypass_netdev);
>>>> + }
>>>> +
>>>> + bi = netdev_priv(bypass_netdev);
>>>> + active_netdev = rtnl_dereference(bi->active_netdev);
>>>> + backup_netdev = rtnl_dereference(bi->backup_netdev);
>>>> +
>>>> + if (slave_netdev == backup_netdev) {
>>>> + RCU_INIT_POINTER(bi->backup_netdev, NULL);
>>>> + } else {
>>>> + RCU_INIT_POINTER(bi->active_netdev, NULL);
>>>> + if (backup_netdev) {
>>>> + bypass_netdev->min_mtu = backup_netdev->min_mtu;
>>>> + bypass_netdev->max_mtu = backup_netdev->max_mtu;
>>>> + }
>>>> + }
>>>> +
>>>> + dev_put(slave_netdev);
>>>> +
>>>> + netdev_info(bypass_netdev, "bypass slave:%s released\n",
>>>> + slave_netdev->name);
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +int bypass_slave_unregister(struct net_device *slave_netdev)
>>>> +{
>>>> + struct net_device *bypass_netdev;
>>>> + struct bypass_ops *bypass_ops;
>>>> + int ret;
>>>> +
>>>> + if (!netif_is_bypass_slave(slave_netdev))
>>>> + goto done;
>>>> +
>>>> + ASSERT_RTNL();
>>>> +
>>>> + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>>>> + &bypass_ops);
>>>> + if (!bypass_netdev)
>>>> + goto done;
>>>> +
>>>> + ret = bypass_slave_pre_unregister(slave_netdev, bypass_netdev,
>>>> + bypass_ops);
>>>> + if (ret != 0)
>>>> + goto done;
>>>> +
>>>> + netdev_rx_handler_unregister(slave_netdev);
>>>> + netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
>>>> + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
>>>> +
>>>> + bypass_slave_release(slave_netdev, bypass_netdev, bypass_ops);
>>>> +
>>>> + netdev_info(bypass_netdev, "bypass slave:%s unregistered\n",
>>>> + slave_netdev->name);
>>>> +
>>>> +done:
>>>> + return NOTIFY_DONE;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(bypass_slave_unregister);
>>>> +
>>>> +static bool bypass_xmit_ready(struct net_device *dev)
>>>> +{
>>>> + return netif_running(dev) && netif_carrier_ok(dev);
>>>> +}
>>>> +
>>>> +static int bypass_slave_link_change(struct net_device *slave_netdev)
>>>> +{
>>>> + struct net_device *bypass_netdev, *active_netdev, *backup_netdev;
>>>> + struct bypass_ops *bypass_ops;
>>>> + struct bypass_info *bi;
>>>> +
>>>> + if (!netif_is_bypass_slave(slave_netdev))
>>>> + goto done;
>>>> +
>>>> + ASSERT_RTNL();
>>>> +
>>>> + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>>>> + &bypass_ops);
>>>> + if (!bypass_netdev)
>>>> + goto done;
>>>> +
>>>> + if (bypass_ops) {
>>>> + if (!bypass_ops->slave_link_change)
>>>> + goto done;
>>>> +
>>>> + return bypass_ops->slave_link_change(slave_netdev,
>>>> + bypass_netdev);
>>>> + }
>>>> +
>>>> + if (!netif_running(bypass_netdev))
>>>> + return 0;
>>>> +
>>>> + bi = netdev_priv(bypass_netdev);
>>>> +
>>>> + active_netdev = rtnl_dereference(bi->active_netdev);
>>>> + backup_netdev = rtnl_dereference(bi->backup_netdev);
>>>> +
>>>> + if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
>>>> + goto done;
>>> You don't need this check. "if (!netif_is_bypass_slave(slave_netdev))"
>>> above is enough.
>> I think we need this check to not allow events from a slave that is not
>> attached to this master but has the same MAC.
> Why do we need such events? Seems wrong to me.
We want to avoid events from a netdev that is mis-configured with the same MAC as
a bypass setup.
> Consider:
>
> bp1 bp2
> a1 b1 a2 b2
>
>
> a1 and a2 have the same mac and bp1 and bp2 have the same mac.
We should not have 2 bypass configs with the same MAC.
I need to add a check in the bypass_master_register() to prevent this.
The above check is to avoid cases where we have
bp1(a1, b1) with mac1
and a2 is mis-configured with mac1, we want to avoid using a2 link events to update bp1.
> Now bypass_master_get_bymac() will return always bp1 or bp2 - depending on
> the order of creation.
> Let's say it will return bp1. Then when we have event for a2, the
> bypass_ops->slave_link_change is called with (a2, bp1). That is wrong.
>
>
> You cannot use bypass_master_get_bymac() here.
>
>
>
>>>
>>>> +
>>>> + if ((active_netdev && bypass_xmit_ready(active_netdev)) ||
>>>> + (backup_netdev && bypass_xmit_ready(backup_netdev))) {
>>>> + netif_carrier_on(bypass_netdev);
>>>> + netif_tx_wake_all_queues(bypass_netdev);
>>>> + } else {
>>>> + netif_carrier_off(bypass_netdev);
>>>> + netif_tx_stop_all_queues(bypass_netdev);
>>>> + }
>>>> +
>>>> +done:
>>>> + return NOTIFY_DONE;
>>>> +}
>>>> +
>>>> +static bool bypass_validate_event_dev(struct net_device *dev)
>>>> +{
>>>> + /* Skip parent events */
>>>> + if (netif_is_bypass_master(dev))
>>>> + return false;
>>>> +
>>>> + /* Avoid non-Ethernet type devices */
>>>> + if (dev->type != ARPHRD_ETHER)
>>>> + return false;
>>>> +
>>>> + /* Avoid Vlan dev with same MAC registering as VF */
>>>> + if (is_vlan_dev(dev))
>>>> + return false;
>>>> +
>>>> + /* Avoid Bonding master dev with same MAC registering as slave dev */
>>>> + if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
>>> Yeah, this is certainly incorrect. One thing is, you should be using the
>>> helpers netif_is_bond_master().
>>> But what about the rest? macsec, macvlan, team, bridge, ovs and others?
>>>
>>> You need to do it not by blacklisting, but with whitelisting. You need
>>> to whitelist VF devices. My port flavours patchset might help with this.
>> May be i can use netdev_has_lower_dev() helper to make sure that the slave
> I don't see such function in the code.
It is netdev_has_any_lower_dev(). I need to export it.
>
>
>> device is not an upper dev.
>> Can you point to your port flavours patchset? Is it upstream?
> I sent rfc couple of weeks ago:
> [patch net-next RFC 00/12] devlink: introduce port flavours and common phys_port_name generation
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [RFC PATCH net-next v6 2/4] net: Introduce generic bypass module
From: Jiri Pirko @ 2018-04-18 19:13 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: alexander.h.duyck, virtio-dev, mst, kubakici, netdev,
virtualization, loseweigh, davem
In-Reply-To: <dd0d53f0-f5da-cb7d-f8f6-d0c8245eb3cf@intel.com>
Wed, Apr 18, 2018 at 08:43:15PM CEST, sridhar.samudrala@intel.com wrote:
>On 4/18/2018 2:25 AM, Jiri Pirko wrote:
>> Wed, Apr 11, 2018 at 09:13:52PM CEST, sridhar.samudrala@intel.com wrote:
>> > On 4/11/2018 8:51 AM, Jiri Pirko wrote:
>> > > Tue, Apr 10, 2018 at 08:59:48PM CEST, sridhar.samudrala@intel.com wrote:
>> > > > This provides a generic interface for paravirtual drivers to listen
>> > > > for netdev register/unregister/link change events from pci ethernet
>> > > > devices with the same MAC and takeover their datapath. The notifier and
>> > > > event handling code is based on the existing netvsc implementation.
>> > > >
>> > > > It exposes 2 sets of interfaces to the paravirtual drivers.
>> > > > 1. existing netvsc driver that uses 2 netdev model. In this model, no
>> > > > master netdev is created. The paravirtual driver registers each bypass
>> > > > instance along with a set of ops to manage the slave events.
>> > > > bypass_master_register()
>> > > > bypass_master_unregister()
>> > > > 2. new virtio_net based solution that uses 3 netdev model. In this model,
>> > > > the bypass module provides interfaces to create/destroy additional master
>> > > > netdev and all the slave events are managed internally.
>> > > > bypass_master_create()
>> > > > bypass_master_destroy()
>> > > >
>> > > > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> > > > ---
>> > > > include/linux/netdevice.h | 14 +
>> > > > include/net/bypass.h | 96 ++++++
>> > > > net/Kconfig | 18 +
>> > > > net/core/Makefile | 1 +
>> > > > net/core/bypass.c | 844 ++++++++++++++++++++++++++++++++++++++++++++++
>> > > > 5 files changed, 973 insertions(+)
>> > > > create mode 100644 include/net/bypass.h
>> > > > create mode 100644 net/core/bypass.c
>> > > >
>> > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> > > > index cf44503ea81a..587293728f70 100644
>> > > > --- a/include/linux/netdevice.h
>> > > > +++ b/include/linux/netdevice.h
>> > > > @@ -1430,6 +1430,8 @@ enum netdev_priv_flags {
>> > > > IFF_PHONY_HEADROOM = 1<<24,
>> > > > IFF_MACSEC = 1<<25,
>> > > > IFF_NO_RX_HANDLER = 1<<26,
>> > > > + IFF_BYPASS = 1 << 27,
>> > > > + IFF_BYPASS_SLAVE = 1 << 28,
>> > > I wonder, why you don't follow the existing coding style... Also, please
>> > > add these to into the comment above.
>> > To avoid checkpatch warnings. If it is OK to ignore these warnings, I can switch back
>> > to the existing coding style to be consistent.
>> Please do.
>>
>>
>> > >
>> > > > };
>> > > >
>> > > > #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
>> > > > @@ -1458,6 +1460,8 @@ enum netdev_priv_flags {
>> > > > #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
>> > > > #define IFF_MACSEC IFF_MACSEC
>> > > > #define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
>> > > > +#define IFF_BYPASS IFF_BYPASS
>> > > > +#define IFF_BYPASS_SLAVE IFF_BYPASS_SLAVE
>> > > >
>> > > > /**
>> > > > * struct net_device - The DEVICE structure.
>> > > > @@ -4308,6 +4312,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev)
>> > > > return dev->priv_flags & IFF_RXFH_CONFIGURED;
>> > > > }
>> > > >
>> > > > +static inline bool netif_is_bypass_master(const struct net_device *dev)
>> > > > +{
>> > > > + return dev->priv_flags & IFF_BYPASS;
>> > > > +}
>> > > > +
>> > > > +static inline bool netif_is_bypass_slave(const struct net_device *dev)
>> > > > +{
>> > > > + return dev->priv_flags & IFF_BYPASS_SLAVE;
>> > > > +}
>> > > > +
>> > > > /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
>> > > > static inline void netif_keep_dst(struct net_device *dev)
>> > > > {
>> > > > diff --git a/include/net/bypass.h b/include/net/bypass.h
>> > > > new file mode 100644
>> > > > index 000000000000..86b02cb894cf
>> > > > --- /dev/null
>> > > > +++ b/include/net/bypass.h
>> > > > @@ -0,0 +1,96 @@
>> > > > +// SPDX-License-Identifier: GPL-2.0
>> > > > +/* Copyright (c) 2018, Intel Corporation. */
>> > > > +
>> > > > +#ifndef _NET_BYPASS_H
>> > > > +#define _NET_BYPASS_H
>> > > > +
>> > > > +#include <linux/netdevice.h>
>> > > > +
>> > > > +struct bypass_ops {
>> > > > + int (*slave_pre_register)(struct net_device *slave_netdev,
>> > > > + struct net_device *bypass_netdev);
>> > > > + int (*slave_join)(struct net_device *slave_netdev,
>> > > > + struct net_device *bypass_netdev);
>> > > > + int (*slave_pre_unregister)(struct net_device *slave_netdev,
>> > > > + struct net_device *bypass_netdev);
>> > > > + int (*slave_release)(struct net_device *slave_netdev,
>> > > > + struct net_device *bypass_netdev);
>> > > > + int (*slave_link_change)(struct net_device *slave_netdev,
>> > > > + struct net_device *bypass_netdev);
>> > > > + rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
>> > > > +};
>> > > > +
>> > > > +struct bypass_master {
>> > > > + struct list_head list;
>> > > > + struct net_device __rcu *bypass_netdev;
>> > > > + struct bypass_ops __rcu *ops;
>> > > > +};
>> > > > +
>> > > > +/* bypass state */
>> > > > +struct bypass_info {
>> > > > + /* passthru netdev with same MAC */
>> > > > + struct net_device __rcu *active_netdev;
>> > > You still use "active"/"backup" names which is highly misleading as
>> > > it has completely different meaning that in bond for example.
>> > > I noted that in my previous review already. Please change it.
>> > I guess the issue is with only the 'active' name. 'backup' should be fine as it also
>> > matches with the BACKUP feature bit we are adding to virtio_net.
>> I think that "backup" is also misleading. Both "active" and "backup"
>> mean a *state* of slaves. This should be named differently.
>>
>>
>>
>> > With regards to alternate names for 'active', you suggested 'stolen', but i
>> > am not too happy with it.
>> > netvsc uses vf_netdev, are you OK with this? Or another option is 'passthru'
>> No. The netdev could be any netdevice. It does not have to be a "VF".
>> I think "stolen" is quite appropriate since it describes the modus
>> operandi. The bypass master steals some netdevice according to some
>> match.
>>
>> But I don't insist on "stolen". Just sounds right.
>
>We are adding VIRTIO_NET_F_BACKUP as a new feature bit to enable this feature, So i think
>'backup' name is consistent.
It perhaps makes sense from the view of virtio device. However, as I
described couple of times, for master/slave device the name "backup" is
highly misleading.
>
>The intent is to restrict the 'active' netdev to be a VF. If there is a way to check that
>a PCI device is a VF in the guest kernel, we could restrict 'active' netdev to be a VF.
>
>Will look for any suggestions in the next day or two. If i don't get any, i will go
>with 'stolen'
>
><snip>
>
>
>> +
>> +static struct net_device *bypass_master_get_bymac(u8 *mac,
>> + struct bypass_ops **ops)
>> +{
>> + struct bypass_master *bypass_master;
>> + struct net_device *bypass_netdev;
>> +
>> + spin_lock(&bypass_lock);
>> + list_for_each_entry(bypass_master, &bypass_master_list, list) {
>> > > As I wrote the last time, you don't need this list, spinlock.
>> > > You can do just something like:
>> > > for_each_net(net) {
>> > > for_each_netdev(net, dev) {
>> > > if (netif_is_bypass_master(dev)) {
>> > This function returns the upper netdev as well as the ops associated
>> > with that netdev.
>> > bypass_master_list is a list of 'struct bypass_master' that associates
>> Well, can't you have it in netdev priv?
>
>We cannot do this for 2-netdev model as there is no bypass_netdev created.
Howcome? You have no master? I don't understand..
>
>>
>>
>> > 'bypass_netdev' with 'bypass_ops' and gets added via bypass_master_register().
>> > We need 'ops' only to support the 2 netdev model of netvsc. ops will be
>> > NULL for 3-netdev model.
>> I see :(
>>
>>
>> >
>> > >
>> > >
>> > >
>> > > > + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
>> > > > + if (ether_addr_equal(bypass_netdev->perm_addr, mac)) {
>> > > > + *ops = rcu_dereference(bypass_master->ops);
>> > > I don't see how rcu_dereference is ok here.
>> > > 1) I don't see rcu_read_lock taken
>> > > 2) Looks like bypass_master->ops has the same value across the whole
>> > > existence.
>> > We hold rtnl_lock(), i think i need to change this to rtnl_dereference.
>> > Yes. ops doesn't change.
>> If it does not change, you can just access it directly.
>>
>>
>> > >
>> > > > + spin_unlock(&bypass_lock);
>> > > > + return bypass_netdev;
>> > > > + }
>> > > > + }
>> > > > + spin_unlock(&bypass_lock);
>> > > > + return NULL;
>> > > > +}
>> > > > +
>> > > > +static int bypass_slave_register(struct net_device *slave_netdev)
>> > > > +{
>> > > > + struct net_device *bypass_netdev;
>> > > > + struct bypass_ops *bypass_ops;
>> > > > + int ret, orig_mtu;
>> > > > +
>> > > > + ASSERT_RTNL();
>> > > > +
>> > > > + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> > > > + &bypass_ops);
>> > > For master, could you use word "master" in the variables so it is clear?
>> > > Also, "dev" is fine instead of "netdev".
>> > > Something like "bpmaster_dev"
>> > bypass_master is of type struct bypass_master, bypass_netdev is of type struct net_device.
>> I was trying to point out, that "bypass_netdev" represents a "master"
>> netdev, yet it does not say master. That is why I suggested
>> "bpmaster_dev"
>>
>>
>> > I can change all _netdev suffixes to _dev to make the names shorter.
>> ok.
>>
>>
>> >
>> > >
>> > > > + if (!bypass_netdev)
>> > > > + goto done;
>> > > > +
>> > > > + ret = bypass_slave_pre_register(slave_netdev, bypass_netdev,
>> > > > + bypass_ops);
>> > > > + if (ret != 0)
>> > > Just "if (ret)" will do. You have this on more places.
>> > OK.
>> >
>> >
>> > >
>> > > > + goto done;
>> > > > +
>> > > > + ret = netdev_rx_handler_register(slave_netdev,
>> > > > + bypass_ops ? bypass_ops->handle_frame :
>> > > > + bypass_handle_frame, bypass_netdev);
>> > > > + if (ret != 0) {
>> > > > + netdev_err(slave_netdev, "can not register bypass rx handler (err = %d)\n",
>> > > > + ret);
>> > > > + goto done;
>> > > > + }
>> > > > +
>> > > > + ret = netdev_upper_dev_link(slave_netdev, bypass_netdev, NULL);
>> > > > + if (ret != 0) {
>> > > > + netdev_err(slave_netdev, "can not set master device %s (err = %d)\n",
>> > > > + bypass_netdev->name, ret);
>> > > > + goto upper_link_failed;
>> > > > + }
>> > > > +
>> > > > + slave_netdev->priv_flags |= IFF_BYPASS_SLAVE;
>> > > > +
>> > > > + if (netif_running(bypass_netdev)) {
>> > > > + ret = dev_open(slave_netdev);
>> > > > + if (ret && (ret != -EBUSY)) {
>> > > > + netdev_err(bypass_netdev, "Opening slave %s failed ret:%d\n",
>> > > > + slave_netdev->name, ret);
>> > > > + goto err_interface_up;
>> > > > + }
>> > > > + }
>> > > > +
>> > > > + /* Align MTU of slave with master */
>> > > > + orig_mtu = slave_netdev->mtu;
>> > > > + ret = dev_set_mtu(slave_netdev, bypass_netdev->mtu);
>> > > > + if (ret != 0) {
>> > > > + netdev_err(bypass_netdev, "unable to change mtu of %s to %u register failed\n",
>> > > > + slave_netdev->name, bypass_netdev->mtu);
>> > > > + goto err_set_mtu;
>> > > > + }
>> > > > +
>> > > > + ret = bypass_slave_join(slave_netdev, bypass_netdev, bypass_ops);
>> > > > + if (ret != 0)
>> > > > + goto err_join;
>> > > > +
>> > > > + call_netdevice_notifiers(NETDEV_JOIN, slave_netdev);
>> > > > +
>> > > > + netdev_info(bypass_netdev, "bypass slave:%s registered\n",
>> > > > + slave_netdev->name);
>> > > > +
>> > > > + goto done;
>> > > > +
>> > > > +err_join:
>> > > > + dev_set_mtu(slave_netdev, orig_mtu);
>> > > > +err_set_mtu:
>> > > > + dev_close(slave_netdev);
>> > > > +err_interface_up:
>> > > > + netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
>> > > > + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
>> > > > +upper_link_failed:
>> > > > + netdev_rx_handler_unregister(slave_netdev);
>> > > > +done:
>> > > > + return NOTIFY_DONE;
>> > > > +}
>> > > > +
>> > > > +static int bypass_slave_pre_unregister(struct net_device *slave_netdev,
>> > > > + struct net_device *bypass_netdev,
>> > > > + struct bypass_ops *bypass_ops)
>> > > > +{
>> > > > + struct net_device *backup_netdev, *active_netdev;
>> > > > + struct bypass_info *bi;
>> > > > +
>> > > > + if (bypass_ops) {
>> > > > + if (!bypass_ops->slave_pre_unregister)
>> > > > + return -EINVAL;
>> > > > +
>> > > > + return bypass_ops->slave_pre_unregister(slave_netdev,
>> > > > + bypass_netdev);
>> > > > + }
>> > > > +
>> > > > + bi = netdev_priv(bypass_netdev);
>> > > > + active_netdev = rtnl_dereference(bi->active_netdev);
>> > > > + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> > > > +
>> > > > + if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
>> > > > + return -EINVAL;
>> > > > +
>> > > > + return 0;
>> > > > +}
>> > > > +
>> > > > +static int bypass_slave_release(struct net_device *slave_netdev,
>> > > > + struct net_device *bypass_netdev,
>> > > > + struct bypass_ops *bypass_ops)
>> > > > +{
>> > > > + struct net_device *backup_netdev, *active_netdev;
>> > > > + struct bypass_info *bi;
>> > > > +
>> > > > + if (bypass_ops) {
>> > > > + if (!bypass_ops->slave_release)
>> > > > + return -EINVAL;
>> > > I think it would be good to make the API to the driver more strict and
>> > > have a separate set of ops for "active" and "backup" netdevices.
>> > > That should stop people thinking about extending this to more slaves in
>> > > the future.
>> > We have checks in slave_pre_register() that allows only 1 'backup' and 1
>> > 'active' slave.
>> I'm very well aware of that. I just thought that explicit ops for the
>> two slaves would make this more clear.
>>
>>
>> >
>> > >
>> > >
>> > > > +
>> > > > + return bypass_ops->slave_release(slave_netdev, bypass_netdev);
>> > > > + }
>> > > > +
>> > > > + bi = netdev_priv(bypass_netdev);
>> > > > + active_netdev = rtnl_dereference(bi->active_netdev);
>> > > > + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> > > > +
>> > > > + if (slave_netdev == backup_netdev) {
>> > > > + RCU_INIT_POINTER(bi->backup_netdev, NULL);
>> > > > + } else {
>> > > > + RCU_INIT_POINTER(bi->active_netdev, NULL);
>> > > > + if (backup_netdev) {
>> > > > + bypass_netdev->min_mtu = backup_netdev->min_mtu;
>> > > > + bypass_netdev->max_mtu = backup_netdev->max_mtu;
>> > > > + }
>> > > > + }
>> > > > +
>> > > > + dev_put(slave_netdev);
>> > > > +
>> > > > + netdev_info(bypass_netdev, "bypass slave:%s released\n",
>> > > > + slave_netdev->name);
>> > > > +
>> > > > + return 0;
>> > > > +}
>> > > > +
>> > > > +int bypass_slave_unregister(struct net_device *slave_netdev)
>> > > > +{
>> > > > + struct net_device *bypass_netdev;
>> > > > + struct bypass_ops *bypass_ops;
>> > > > + int ret;
>> > > > +
>> > > > + if (!netif_is_bypass_slave(slave_netdev))
>> > > > + goto done;
>> > > > +
>> > > > + ASSERT_RTNL();
>> > > > +
>> > > > + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> > > > + &bypass_ops);
>> > > > + if (!bypass_netdev)
>> > > > + goto done;
>> > > > +
>> > > > + ret = bypass_slave_pre_unregister(slave_netdev, bypass_netdev,
>> > > > + bypass_ops);
>> > > > + if (ret != 0)
>> > > > + goto done;
>> > > > +
>> > > > + netdev_rx_handler_unregister(slave_netdev);
>> > > > + netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
>> > > > + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
>> > > > +
>> > > > + bypass_slave_release(slave_netdev, bypass_netdev, bypass_ops);
>> > > > +
>> > > > + netdev_info(bypass_netdev, "bypass slave:%s unregistered\n",
>> > > > + slave_netdev->name);
>> > > > +
>> > > > +done:
>> > > > + return NOTIFY_DONE;
>> > > > +}
>> > > > +EXPORT_SYMBOL_GPL(bypass_slave_unregister);
>> > > > +
>> > > > +static bool bypass_xmit_ready(struct net_device *dev)
>> > > > +{
>> > > > + return netif_running(dev) && netif_carrier_ok(dev);
>> > > > +}
>> > > > +
>> > > > +static int bypass_slave_link_change(struct net_device *slave_netdev)
>> > > > +{
>> > > > + struct net_device *bypass_netdev, *active_netdev, *backup_netdev;
>> > > > + struct bypass_ops *bypass_ops;
>> > > > + struct bypass_info *bi;
>> > > > +
>> > > > + if (!netif_is_bypass_slave(slave_netdev))
>> > > > + goto done;
>> > > > +
>> > > > + ASSERT_RTNL();
>> > > > +
>> > > > + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> > > > + &bypass_ops);
>> > > > + if (!bypass_netdev)
>> > > > + goto done;
>> > > > +
>> > > > + if (bypass_ops) {
>> > > > + if (!bypass_ops->slave_link_change)
>> > > > + goto done;
>> > > > +
>> > > > + return bypass_ops->slave_link_change(slave_netdev,
>> > > > + bypass_netdev);
>> > > > + }
>> > > > +
>> > > > + if (!netif_running(bypass_netdev))
>> > > > + return 0;
>> > > > +
>> > > > + bi = netdev_priv(bypass_netdev);
>> > > > +
>> > > > + active_netdev = rtnl_dereference(bi->active_netdev);
>> > > > + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> > > > +
>> > > > + if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
>> > > > + goto done;
>> > > You don't need this check. "if (!netif_is_bypass_slave(slave_netdev))"
>> > > above is enough.
>> > I think we need this check to not allow events from a slave that is not
>> > attached to this master but has the same MAC.
>> Why do we need such events? Seems wrong to me.
>
>We want to avoid events from a netdev that is mis-configured with the same MAC as
>a bypass setup.
>
>> Consider:
>>
>> bp1 bp2
>> a1 b1 a2 b2
>>
>>
>> a1 and a2 have the same mac and bp1 and bp2 have the same mac.
>
>We should not have 2 bypass configs with the same MAC.
>I need to add a check in the bypass_master_register() to prevent this.
Mac can change, you would have to check in change as well. Feels odd
thought.
>
>The above check is to avoid cases where we have
>bp1(a1, b1) with mac1
>and a2 is mis-configured with mac1, we want to avoid using a2 link events to update bp1.
>
>> Now bypass_master_get_bymac() will return always bp1 or bp2 - depending on
>> the order of creation.
>> Let's say it will return bp1. Then when we have event for a2, the
>> bypass_ops->slave_link_change is called with (a2, bp1). That is wrong.
>>
>>
>> You cannot use bypass_master_get_bymac() here.
>>
>>
>>
>> > >
>> > > > +
>> > > > + if ((active_netdev && bypass_xmit_ready(active_netdev)) ||
>> > > > + (backup_netdev && bypass_xmit_ready(backup_netdev))) {
>> > > > + netif_carrier_on(bypass_netdev);
>> > > > + netif_tx_wake_all_queues(bypass_netdev);
>> > > > + } else {
>> > > > + netif_carrier_off(bypass_netdev);
>> > > > + netif_tx_stop_all_queues(bypass_netdev);
>> > > > + }
>> > > > +
>> > > > +done:
>> > > > + return NOTIFY_DONE;
>> > > > +}
>> > > > +
>> > > > +static bool bypass_validate_event_dev(struct net_device *dev)
>> > > > +{
>> > > > + /* Skip parent events */
>> > > > + if (netif_is_bypass_master(dev))
>> > > > + return false;
>> > > > +
>> > > > + /* Avoid non-Ethernet type devices */
>> > > > + if (dev->type != ARPHRD_ETHER)
>> > > > + return false;
>> > > > +
>> > > > + /* Avoid Vlan dev with same MAC registering as VF */
>> > > > + if (is_vlan_dev(dev))
>> > > > + return false;
>> > > > +
>> > > > + /* Avoid Bonding master dev with same MAC registering as slave dev */
>> > > > + if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
>> > > Yeah, this is certainly incorrect. One thing is, you should be using the
>> > > helpers netif_is_bond_master().
>> > > But what about the rest? macsec, macvlan, team, bridge, ovs and others?
>> > >
>> > > You need to do it not by blacklisting, but with whitelisting. You need
>> > > to whitelist VF devices. My port flavours patchset might help with this.
>> > May be i can use netdev_has_lower_dev() helper to make sure that the slave
>> I don't see such function in the code.
>
>It is netdev_has_any_lower_dev(). I need to export it.
Come on, you cannot use that. That would allow bonding without slaves,
but the slaves could be added later on.
What exactly you are trying to achieve by this?
>
>>
>>
>> > device is not an upper dev.
>> > Can you point to your port flavours patchset? Is it upstream?
>> I sent rfc couple of weeks ago:
>> [patch net-next RFC 00/12] devlink: introduce port flavours and common phys_port_name generation
>
>
>
^ permalink raw reply
* Re: [RFC PATCH net-next v6 2/4] net: Introduce generic bypass module
From: Michael S. Tsirkin @ 2018-04-18 19:46 UTC (permalink / raw)
To: Jiri Pirko
Cc: alexander.h.duyck, virtio-dev, kubakici, Samudrala, Sridhar,
virtualization, loseweigh, netdev, davem
In-Reply-To: <20180418191315.GA1922@nanopsycho>
On Wed, Apr 18, 2018 at 09:13:15PM +0200, Jiri Pirko wrote:
> Wed, Apr 18, 2018 at 08:43:15PM CEST, sridhar.samudrala@intel.com wrote:
> >On 4/18/2018 2:25 AM, Jiri Pirko wrote:
> >> Wed, Apr 11, 2018 at 09:13:52PM CEST, sridhar.samudrala@intel.com wrote:
> >> > On 4/11/2018 8:51 AM, Jiri Pirko wrote:
> >> > > Tue, Apr 10, 2018 at 08:59:48PM CEST, sridhar.samudrala@intel.com wrote:
> >> > > > This provides a generic interface for paravirtual drivers to listen
> >> > > > for netdev register/unregister/link change events from pci ethernet
> >> > > > devices with the same MAC and takeover their datapath. The notifier and
> >> > > > event handling code is based on the existing netvsc implementation.
> >> > > >
> >> > > > It exposes 2 sets of interfaces to the paravirtual drivers.
> >> > > > 1. existing netvsc driver that uses 2 netdev model. In this model, no
> >> > > > master netdev is created. The paravirtual driver registers each bypass
> >> > > > instance along with a set of ops to manage the slave events.
> >> > > > bypass_master_register()
> >> > > > bypass_master_unregister()
> >> > > > 2. new virtio_net based solution that uses 3 netdev model. In this model,
> >> > > > the bypass module provides interfaces to create/destroy additional master
> >> > > > netdev and all the slave events are managed internally.
> >> > > > bypass_master_create()
> >> > > > bypass_master_destroy()
> >> > > >
> >> > > > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> >> > > > ---
> >> > > > include/linux/netdevice.h | 14 +
> >> > > > include/net/bypass.h | 96 ++++++
> >> > > > net/Kconfig | 18 +
> >> > > > net/core/Makefile | 1 +
> >> > > > net/core/bypass.c | 844 ++++++++++++++++++++++++++++++++++++++++++++++
> >> > > > 5 files changed, 973 insertions(+)
> >> > > > create mode 100644 include/net/bypass.h
> >> > > > create mode 100644 net/core/bypass.c
> >> > > >
> >> > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> >> > > > index cf44503ea81a..587293728f70 100644
> >> > > > --- a/include/linux/netdevice.h
> >> > > > +++ b/include/linux/netdevice.h
> >> > > > @@ -1430,6 +1430,8 @@ enum netdev_priv_flags {
> >> > > > IFF_PHONY_HEADROOM = 1<<24,
> >> > > > IFF_MACSEC = 1<<25,
> >> > > > IFF_NO_RX_HANDLER = 1<<26,
> >> > > > + IFF_BYPASS = 1 << 27,
> >> > > > + IFF_BYPASS_SLAVE = 1 << 28,
> >> > > I wonder, why you don't follow the existing coding style... Also, please
> >> > > add these to into the comment above.
> >> > To avoid checkpatch warnings. If it is OK to ignore these warnings, I can switch back
> >> > to the existing coding style to be consistent.
> >> Please do.
> >>
> >>
> >> > >
> >> > > > };
> >> > > >
> >> > > > #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
> >> > > > @@ -1458,6 +1460,8 @@ enum netdev_priv_flags {
> >> > > > #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
> >> > > > #define IFF_MACSEC IFF_MACSEC
> >> > > > #define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
> >> > > > +#define IFF_BYPASS IFF_BYPASS
> >> > > > +#define IFF_BYPASS_SLAVE IFF_BYPASS_SLAVE
> >> > > >
> >> > > > /**
> >> > > > * struct net_device - The DEVICE structure.
> >> > > > @@ -4308,6 +4312,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev)
> >> > > > return dev->priv_flags & IFF_RXFH_CONFIGURED;
> >> > > > }
> >> > > >
> >> > > > +static inline bool netif_is_bypass_master(const struct net_device *dev)
> >> > > > +{
> >> > > > + return dev->priv_flags & IFF_BYPASS;
> >> > > > +}
> >> > > > +
> >> > > > +static inline bool netif_is_bypass_slave(const struct net_device *dev)
> >> > > > +{
> >> > > > + return dev->priv_flags & IFF_BYPASS_SLAVE;
> >> > > > +}
> >> > > > +
> >> > > > /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
> >> > > > static inline void netif_keep_dst(struct net_device *dev)
> >> > > > {
> >> > > > diff --git a/include/net/bypass.h b/include/net/bypass.h
> >> > > > new file mode 100644
> >> > > > index 000000000000..86b02cb894cf
> >> > > > --- /dev/null
> >> > > > +++ b/include/net/bypass.h
> >> > > > @@ -0,0 +1,96 @@
> >> > > > +// SPDX-License-Identifier: GPL-2.0
> >> > > > +/* Copyright (c) 2018, Intel Corporation. */
> >> > > > +
> >> > > > +#ifndef _NET_BYPASS_H
> >> > > > +#define _NET_BYPASS_H
> >> > > > +
> >> > > > +#include <linux/netdevice.h>
> >> > > > +
> >> > > > +struct bypass_ops {
> >> > > > + int (*slave_pre_register)(struct net_device *slave_netdev,
> >> > > > + struct net_device *bypass_netdev);
> >> > > > + int (*slave_join)(struct net_device *slave_netdev,
> >> > > > + struct net_device *bypass_netdev);
> >> > > > + int (*slave_pre_unregister)(struct net_device *slave_netdev,
> >> > > > + struct net_device *bypass_netdev);
> >> > > > + int (*slave_release)(struct net_device *slave_netdev,
> >> > > > + struct net_device *bypass_netdev);
> >> > > > + int (*slave_link_change)(struct net_device *slave_netdev,
> >> > > > + struct net_device *bypass_netdev);
> >> > > > + rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
> >> > > > +};
> >> > > > +
> >> > > > +struct bypass_master {
> >> > > > + struct list_head list;
> >> > > > + struct net_device __rcu *bypass_netdev;
> >> > > > + struct bypass_ops __rcu *ops;
> >> > > > +};
> >> > > > +
> >> > > > +/* bypass state */
> >> > > > +struct bypass_info {
> >> > > > + /* passthru netdev with same MAC */
> >> > > > + struct net_device __rcu *active_netdev;
> >> > > You still use "active"/"backup" names which is highly misleading as
> >> > > it has completely different meaning that in bond for example.
> >> > > I noted that in my previous review already. Please change it.
> >> > I guess the issue is with only the 'active' name. 'backup' should be fine as it also
> >> > matches with the BACKUP feature bit we are adding to virtio_net.
> >> I think that "backup" is also misleading. Both "active" and "backup"
> >> mean a *state* of slaves. This should be named differently.
> >>
> >>
> >>
> >> > With regards to alternate names for 'active', you suggested 'stolen', but i
> >> > am not too happy with it.
> >> > netvsc uses vf_netdev, are you OK with this? Or another option is 'passthru'
> >> No. The netdev could be any netdevice. It does not have to be a "VF".
> >> I think "stolen" is quite appropriate since it describes the modus
> >> operandi. The bypass master steals some netdevice according to some
> >> match.
> >>
> >> But I don't insist on "stolen". Just sounds right.
> >
> >We are adding VIRTIO_NET_F_BACKUP as a new feature bit to enable this feature, So i think
> >'backup' name is consistent.
>
> It perhaps makes sense from the view of virtio device. However, as I
> described couple of times, for master/slave device the name "backup" is
> highly misleading.
virtio is the backup. You are supposed to use another
(typically passthrough) device, if that fails use virtio.
It does seem appropriate to me. If you like, we can
change that to "standby". Active I don't like either. "main"?
In fact would failover be better than bypass?
>
> >
> >The intent is to restrict the 'active' netdev to be a VF. If there is a way to check that
> >a PCI device is a VF in the guest kernel, we could restrict 'active' netdev to be a VF.
> >
> >Will look for any suggestions in the next day or two. If i don't get any, i will go
> >with 'stolen'
> >
> ><snip>
> >
> >
> >> +
> >> +static struct net_device *bypass_master_get_bymac(u8 *mac,
> >> + struct bypass_ops **ops)
> >> +{
> >> + struct bypass_master *bypass_master;
> >> + struct net_device *bypass_netdev;
> >> +
> >> + spin_lock(&bypass_lock);
> >> + list_for_each_entry(bypass_master, &bypass_master_list, list) {
> >> > > As I wrote the last time, you don't need this list, spinlock.
> >> > > You can do just something like:
> >> > > for_each_net(net) {
> >> > > for_each_netdev(net, dev) {
> >> > > if (netif_is_bypass_master(dev)) {
> >> > This function returns the upper netdev as well as the ops associated
> >> > with that netdev.
> >> > bypass_master_list is a list of 'struct bypass_master' that associates
> >> Well, can't you have it in netdev priv?
> >
> >We cannot do this for 2-netdev model as there is no bypass_netdev created.
>
> Howcome? You have no master? I don't understand..
>
>
>
> >
> >>
> >>
> >> > 'bypass_netdev' with 'bypass_ops' and gets added via bypass_master_register().
> >> > We need 'ops' only to support the 2 netdev model of netvsc. ops will be
> >> > NULL for 3-netdev model.
> >> I see :(
> >>
> >>
> >> >
> >> > >
> >> > >
> >> > >
> >> > > > + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
> >> > > > + if (ether_addr_equal(bypass_netdev->perm_addr, mac)) {
> >> > > > + *ops = rcu_dereference(bypass_master->ops);
> >> > > I don't see how rcu_dereference is ok here.
> >> > > 1) I don't see rcu_read_lock taken
> >> > > 2) Looks like bypass_master->ops has the same value across the whole
> >> > > existence.
> >> > We hold rtnl_lock(), i think i need to change this to rtnl_dereference.
> >> > Yes. ops doesn't change.
> >> If it does not change, you can just access it directly.
> >>
> >>
> >> > >
> >> > > > + spin_unlock(&bypass_lock);
> >> > > > + return bypass_netdev;
> >> > > > + }
> >> > > > + }
> >> > > > + spin_unlock(&bypass_lock);
> >> > > > + return NULL;
> >> > > > +}
> >> > > > +
> >> > > > +static int bypass_slave_register(struct net_device *slave_netdev)
> >> > > > +{
> >> > > > + struct net_device *bypass_netdev;
> >> > > > + struct bypass_ops *bypass_ops;
> >> > > > + int ret, orig_mtu;
> >> > > > +
> >> > > > + ASSERT_RTNL();
> >> > > > +
> >> > > > + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
> >> > > > + &bypass_ops);
> >> > > For master, could you use word "master" in the variables so it is clear?
> >> > > Also, "dev" is fine instead of "netdev".
> >> > > Something like "bpmaster_dev"
> >> > bypass_master is of type struct bypass_master, bypass_netdev is of type struct net_device.
> >> I was trying to point out, that "bypass_netdev" represents a "master"
> >> netdev, yet it does not say master. That is why I suggested
> >> "bpmaster_dev"
> >>
> >>
> >> > I can change all _netdev suffixes to _dev to make the names shorter.
> >> ok.
> >>
> >>
> >> >
> >> > >
> >> > > > + if (!bypass_netdev)
> >> > > > + goto done;
> >> > > > +
> >> > > > + ret = bypass_slave_pre_register(slave_netdev, bypass_netdev,
> >> > > > + bypass_ops);
> >> > > > + if (ret != 0)
> >> > > Just "if (ret)" will do. You have this on more places.
> >> > OK.
> >> >
> >> >
> >> > >
> >> > > > + goto done;
> >> > > > +
> >> > > > + ret = netdev_rx_handler_register(slave_netdev,
> >> > > > + bypass_ops ? bypass_ops->handle_frame :
> >> > > > + bypass_handle_frame, bypass_netdev);
> >> > > > + if (ret != 0) {
> >> > > > + netdev_err(slave_netdev, "can not register bypass rx handler (err = %d)\n",
> >> > > > + ret);
> >> > > > + goto done;
> >> > > > + }
> >> > > > +
> >> > > > + ret = netdev_upper_dev_link(slave_netdev, bypass_netdev, NULL);
> >> > > > + if (ret != 0) {
> >> > > > + netdev_err(slave_netdev, "can not set master device %s (err = %d)\n",
> >> > > > + bypass_netdev->name, ret);
> >> > > > + goto upper_link_failed;
> >> > > > + }
> >> > > > +
> >> > > > + slave_netdev->priv_flags |= IFF_BYPASS_SLAVE;
> >> > > > +
> >> > > > + if (netif_running(bypass_netdev)) {
> >> > > > + ret = dev_open(slave_netdev);
> >> > > > + if (ret && (ret != -EBUSY)) {
> >> > > > + netdev_err(bypass_netdev, "Opening slave %s failed ret:%d\n",
> >> > > > + slave_netdev->name, ret);
> >> > > > + goto err_interface_up;
> >> > > > + }
> >> > > > + }
> >> > > > +
> >> > > > + /* Align MTU of slave with master */
> >> > > > + orig_mtu = slave_netdev->mtu;
> >> > > > + ret = dev_set_mtu(slave_netdev, bypass_netdev->mtu);
> >> > > > + if (ret != 0) {
> >> > > > + netdev_err(bypass_netdev, "unable to change mtu of %s to %u register failed\n",
> >> > > > + slave_netdev->name, bypass_netdev->mtu);
> >> > > > + goto err_set_mtu;
> >> > > > + }
> >> > > > +
> >> > > > + ret = bypass_slave_join(slave_netdev, bypass_netdev, bypass_ops);
> >> > > > + if (ret != 0)
> >> > > > + goto err_join;
> >> > > > +
> >> > > > + call_netdevice_notifiers(NETDEV_JOIN, slave_netdev);
> >> > > > +
> >> > > > + netdev_info(bypass_netdev, "bypass slave:%s registered\n",
> >> > > > + slave_netdev->name);
> >> > > > +
> >> > > > + goto done;
> >> > > > +
> >> > > > +err_join:
> >> > > > + dev_set_mtu(slave_netdev, orig_mtu);
> >> > > > +err_set_mtu:
> >> > > > + dev_close(slave_netdev);
> >> > > > +err_interface_up:
> >> > > > + netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
> >> > > > + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
> >> > > > +upper_link_failed:
> >> > > > + netdev_rx_handler_unregister(slave_netdev);
> >> > > > +done:
> >> > > > + return NOTIFY_DONE;
> >> > > > +}
> >> > > > +
> >> > > > +static int bypass_slave_pre_unregister(struct net_device *slave_netdev,
> >> > > > + struct net_device *bypass_netdev,
> >> > > > + struct bypass_ops *bypass_ops)
> >> > > > +{
> >> > > > + struct net_device *backup_netdev, *active_netdev;
> >> > > > + struct bypass_info *bi;
> >> > > > +
> >> > > > + if (bypass_ops) {
> >> > > > + if (!bypass_ops->slave_pre_unregister)
> >> > > > + return -EINVAL;
> >> > > > +
> >> > > > + return bypass_ops->slave_pre_unregister(slave_netdev,
> >> > > > + bypass_netdev);
> >> > > > + }
> >> > > > +
> >> > > > + bi = netdev_priv(bypass_netdev);
> >> > > > + active_netdev = rtnl_dereference(bi->active_netdev);
> >> > > > + backup_netdev = rtnl_dereference(bi->backup_netdev);
> >> > > > +
> >> > > > + if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
> >> > > > + return -EINVAL;
> >> > > > +
> >> > > > + return 0;
> >> > > > +}
> >> > > > +
> >> > > > +static int bypass_slave_release(struct net_device *slave_netdev,
> >> > > > + struct net_device *bypass_netdev,
> >> > > > + struct bypass_ops *bypass_ops)
> >> > > > +{
> >> > > > + struct net_device *backup_netdev, *active_netdev;
> >> > > > + struct bypass_info *bi;
> >> > > > +
> >> > > > + if (bypass_ops) {
> >> > > > + if (!bypass_ops->slave_release)
> >> > > > + return -EINVAL;
> >> > > I think it would be good to make the API to the driver more strict and
> >> > > have a separate set of ops for "active" and "backup" netdevices.
> >> > > That should stop people thinking about extending this to more slaves in
> >> > > the future.
> >> > We have checks in slave_pre_register() that allows only 1 'backup' and 1
> >> > 'active' slave.
> >> I'm very well aware of that. I just thought that explicit ops for the
> >> two slaves would make this more clear.
> >>
> >>
> >> >
> >> > >
> >> > >
> >> > > > +
> >> > > > + return bypass_ops->slave_release(slave_netdev, bypass_netdev);
> >> > > > + }
> >> > > > +
> >> > > > + bi = netdev_priv(bypass_netdev);
> >> > > > + active_netdev = rtnl_dereference(bi->active_netdev);
> >> > > > + backup_netdev = rtnl_dereference(bi->backup_netdev);
> >> > > > +
> >> > > > + if (slave_netdev == backup_netdev) {
> >> > > > + RCU_INIT_POINTER(bi->backup_netdev, NULL);
> >> > > > + } else {
> >> > > > + RCU_INIT_POINTER(bi->active_netdev, NULL);
> >> > > > + if (backup_netdev) {
> >> > > > + bypass_netdev->min_mtu = backup_netdev->min_mtu;
> >> > > > + bypass_netdev->max_mtu = backup_netdev->max_mtu;
> >> > > > + }
> >> > > > + }
> >> > > > +
> >> > > > + dev_put(slave_netdev);
> >> > > > +
> >> > > > + netdev_info(bypass_netdev, "bypass slave:%s released\n",
> >> > > > + slave_netdev->name);
> >> > > > +
> >> > > > + return 0;
> >> > > > +}
> >> > > > +
> >> > > > +int bypass_slave_unregister(struct net_device *slave_netdev)
> >> > > > +{
> >> > > > + struct net_device *bypass_netdev;
> >> > > > + struct bypass_ops *bypass_ops;
> >> > > > + int ret;
> >> > > > +
> >> > > > + if (!netif_is_bypass_slave(slave_netdev))
> >> > > > + goto done;
> >> > > > +
> >> > > > + ASSERT_RTNL();
> >> > > > +
> >> > > > + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
> >> > > > + &bypass_ops);
> >> > > > + if (!bypass_netdev)
> >> > > > + goto done;
> >> > > > +
> >> > > > + ret = bypass_slave_pre_unregister(slave_netdev, bypass_netdev,
> >> > > > + bypass_ops);
> >> > > > + if (ret != 0)
> >> > > > + goto done;
> >> > > > +
> >> > > > + netdev_rx_handler_unregister(slave_netdev);
> >> > > > + netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
> >> > > > + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
> >> > > > +
> >> > > > + bypass_slave_release(slave_netdev, bypass_netdev, bypass_ops);
> >> > > > +
> >> > > > + netdev_info(bypass_netdev, "bypass slave:%s unregistered\n",
> >> > > > + slave_netdev->name);
> >> > > > +
> >> > > > +done:
> >> > > > + return NOTIFY_DONE;
> >> > > > +}
> >> > > > +EXPORT_SYMBOL_GPL(bypass_slave_unregister);
> >> > > > +
> >> > > > +static bool bypass_xmit_ready(struct net_device *dev)
> >> > > > +{
> >> > > > + return netif_running(dev) && netif_carrier_ok(dev);
> >> > > > +}
> >> > > > +
> >> > > > +static int bypass_slave_link_change(struct net_device *slave_netdev)
> >> > > > +{
> >> > > > + struct net_device *bypass_netdev, *active_netdev, *backup_netdev;
> >> > > > + struct bypass_ops *bypass_ops;
> >> > > > + struct bypass_info *bi;
> >> > > > +
> >> > > > + if (!netif_is_bypass_slave(slave_netdev))
> >> > > > + goto done;
> >> > > > +
> >> > > > + ASSERT_RTNL();
> >> > > > +
> >> > > > + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
> >> > > > + &bypass_ops);
> >> > > > + if (!bypass_netdev)
> >> > > > + goto done;
> >> > > > +
> >> > > > + if (bypass_ops) {
> >> > > > + if (!bypass_ops->slave_link_change)
> >> > > > + goto done;
> >> > > > +
> >> > > > + return bypass_ops->slave_link_change(slave_netdev,
> >> > > > + bypass_netdev);
> >> > > > + }
> >> > > > +
> >> > > > + if (!netif_running(bypass_netdev))
> >> > > > + return 0;
> >> > > > +
> >> > > > + bi = netdev_priv(bypass_netdev);
> >> > > > +
> >> > > > + active_netdev = rtnl_dereference(bi->active_netdev);
> >> > > > + backup_netdev = rtnl_dereference(bi->backup_netdev);
> >> > > > +
> >> > > > + if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
> >> > > > + goto done;
> >> > > You don't need this check. "if (!netif_is_bypass_slave(slave_netdev))"
> >> > > above is enough.
> >> > I think we need this check to not allow events from a slave that is not
> >> > attached to this master but has the same MAC.
> >> Why do we need such events? Seems wrong to me.
> >
> >We want to avoid events from a netdev that is mis-configured with the same MAC as
> >a bypass setup.
> >
> >> Consider:
> >>
> >> bp1 bp2
> >> a1 b1 a2 b2
> >>
> >>
> >> a1 and a2 have the same mac and bp1 and bp2 have the same mac.
> >
> >We should not have 2 bypass configs with the same MAC.
> >I need to add a check in the bypass_master_register() to prevent this.
>
> Mac can change, you would have to check in change as well. Feels odd
> thought.
>
>
> >
> >The above check is to avoid cases where we have
> >bp1(a1, b1) with mac1
> >and a2 is mis-configured with mac1, we want to avoid using a2 link events to update bp1.
> >
> >> Now bypass_master_get_bymac() will return always bp1 or bp2 - depending on
> >> the order of creation.
> >> Let's say it will return bp1. Then when we have event for a2, the
> >> bypass_ops->slave_link_change is called with (a2, bp1). That is wrong.
> >>
> >>
> >> You cannot use bypass_master_get_bymac() here.
> >>
> >>
> >>
> >> > >
> >> > > > +
> >> > > > + if ((active_netdev && bypass_xmit_ready(active_netdev)) ||
> >> > > > + (backup_netdev && bypass_xmit_ready(backup_netdev))) {
> >> > > > + netif_carrier_on(bypass_netdev);
> >> > > > + netif_tx_wake_all_queues(bypass_netdev);
> >> > > > + } else {
> >> > > > + netif_carrier_off(bypass_netdev);
> >> > > > + netif_tx_stop_all_queues(bypass_netdev);
> >> > > > + }
> >> > > > +
> >> > > > +done:
> >> > > > + return NOTIFY_DONE;
> >> > > > +}
> >> > > > +
> >> > > > +static bool bypass_validate_event_dev(struct net_device *dev)
> >> > > > +{
> >> > > > + /* Skip parent events */
> >> > > > + if (netif_is_bypass_master(dev))
> >> > > > + return false;
> >> > > > +
> >> > > > + /* Avoid non-Ethernet type devices */
> >> > > > + if (dev->type != ARPHRD_ETHER)
> >> > > > + return false;
> >> > > > +
> >> > > > + /* Avoid Vlan dev with same MAC registering as VF */
> >> > > > + if (is_vlan_dev(dev))
> >> > > > + return false;
> >> > > > +
> >> > > > + /* Avoid Bonding master dev with same MAC registering as slave dev */
> >> > > > + if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
> >> > > Yeah, this is certainly incorrect. One thing is, you should be using the
> >> > > helpers netif_is_bond_master().
> >> > > But what about the rest? macsec, macvlan, team, bridge, ovs and others?
> >> > >
> >> > > You need to do it not by blacklisting, but with whitelisting. You need
> >> > > to whitelist VF devices. My port flavours patchset might help with this.
> >> > May be i can use netdev_has_lower_dev() helper to make sure that the slave
> >> I don't see such function in the code.
> >
> >It is netdev_has_any_lower_dev(). I need to export it.
>
> Come on, you cannot use that. That would allow bonding without slaves,
> but the slaves could be added later on.
>
> What exactly you are trying to achieve by this?
>
>
> >
> >>
> >>
> >> > device is not an upper dev.
> >> > Can you point to your port flavours patchset? Is it upstream?
> >> I sent rfc couple of weeks ago:
> >> [patch net-next RFC 00/12] devlink: introduce port flavours and common phys_port_name generation
> >
> >
> >
^ permalink raw reply
* Re: [RFC PATCH net-next v6 2/4] net: Introduce generic bypass module
From: Jiri Pirko @ 2018-04-18 20:32 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: alexander.h.duyck, virtio-dev, kubakici, Samudrala, Sridhar,
virtualization, loseweigh, netdev, davem
In-Reply-To: <20180418222309-mutt-send-email-mst@kernel.org>
Wed, Apr 18, 2018 at 09:46:04PM CEST, mst@redhat.com wrote:
>On Wed, Apr 18, 2018 at 09:13:15PM +0200, Jiri Pirko wrote:
>> Wed, Apr 18, 2018 at 08:43:15PM CEST, sridhar.samudrala@intel.com wrote:
>> >On 4/18/2018 2:25 AM, Jiri Pirko wrote:
>> >> Wed, Apr 11, 2018 at 09:13:52PM CEST, sridhar.samudrala@intel.com wrote:
>> >> > On 4/11/2018 8:51 AM, Jiri Pirko wrote:
>> >> > > Tue, Apr 10, 2018 at 08:59:48PM CEST, sridhar.samudrala@intel.com wrote:
>> >> > > > This provides a generic interface for paravirtual drivers to listen
>> >> > > > for netdev register/unregister/link change events from pci ethernet
>> >> > > > devices with the same MAC and takeover their datapath. The notifier and
>> >> > > > event handling code is based on the existing netvsc implementation.
>> >> > > >
>> >> > > > It exposes 2 sets of interfaces to the paravirtual drivers.
>> >> > > > 1. existing netvsc driver that uses 2 netdev model. In this model, no
>> >> > > > master netdev is created. The paravirtual driver registers each bypass
>> >> > > > instance along with a set of ops to manage the slave events.
>> >> > > > bypass_master_register()
>> >> > > > bypass_master_unregister()
>> >> > > > 2. new virtio_net based solution that uses 3 netdev model. In this model,
>> >> > > > the bypass module provides interfaces to create/destroy additional master
>> >> > > > netdev and all the slave events are managed internally.
>> >> > > > bypass_master_create()
>> >> > > > bypass_master_destroy()
>> >> > > >
>> >> > > > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> >> > > > ---
>> >> > > > include/linux/netdevice.h | 14 +
>> >> > > > include/net/bypass.h | 96 ++++++
>> >> > > > net/Kconfig | 18 +
>> >> > > > net/core/Makefile | 1 +
>> >> > > > net/core/bypass.c | 844 ++++++++++++++++++++++++++++++++++++++++++++++
>> >> > > > 5 files changed, 973 insertions(+)
>> >> > > > create mode 100644 include/net/bypass.h
>> >> > > > create mode 100644 net/core/bypass.c
>> >> > > >
>> >> > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> >> > > > index cf44503ea81a..587293728f70 100644
>> >> > > > --- a/include/linux/netdevice.h
>> >> > > > +++ b/include/linux/netdevice.h
>> >> > > > @@ -1430,6 +1430,8 @@ enum netdev_priv_flags {
>> >> > > > IFF_PHONY_HEADROOM = 1<<24,
>> >> > > > IFF_MACSEC = 1<<25,
>> >> > > > IFF_NO_RX_HANDLER = 1<<26,
>> >> > > > + IFF_BYPASS = 1 << 27,
>> >> > > > + IFF_BYPASS_SLAVE = 1 << 28,
>> >> > > I wonder, why you don't follow the existing coding style... Also, please
>> >> > > add these to into the comment above.
>> >> > To avoid checkpatch warnings. If it is OK to ignore these warnings, I can switch back
>> >> > to the existing coding style to be consistent.
>> >> Please do.
>> >>
>> >>
>> >> > >
>> >> > > > };
>> >> > > >
>> >> > > > #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
>> >> > > > @@ -1458,6 +1460,8 @@ enum netdev_priv_flags {
>> >> > > > #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
>> >> > > > #define IFF_MACSEC IFF_MACSEC
>> >> > > > #define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
>> >> > > > +#define IFF_BYPASS IFF_BYPASS
>> >> > > > +#define IFF_BYPASS_SLAVE IFF_BYPASS_SLAVE
>> >> > > >
>> >> > > > /**
>> >> > > > * struct net_device - The DEVICE structure.
>> >> > > > @@ -4308,6 +4312,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev)
>> >> > > > return dev->priv_flags & IFF_RXFH_CONFIGURED;
>> >> > > > }
>> >> > > >
>> >> > > > +static inline bool netif_is_bypass_master(const struct net_device *dev)
>> >> > > > +{
>> >> > > > + return dev->priv_flags & IFF_BYPASS;
>> >> > > > +}
>> >> > > > +
>> >> > > > +static inline bool netif_is_bypass_slave(const struct net_device *dev)
>> >> > > > +{
>> >> > > > + return dev->priv_flags & IFF_BYPASS_SLAVE;
>> >> > > > +}
>> >> > > > +
>> >> > > > /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
>> >> > > > static inline void netif_keep_dst(struct net_device *dev)
>> >> > > > {
>> >> > > > diff --git a/include/net/bypass.h b/include/net/bypass.h
>> >> > > > new file mode 100644
>> >> > > > index 000000000000..86b02cb894cf
>> >> > > > --- /dev/null
>> >> > > > +++ b/include/net/bypass.h
>> >> > > > @@ -0,0 +1,96 @@
>> >> > > > +// SPDX-License-Identifier: GPL-2.0
>> >> > > > +/* Copyright (c) 2018, Intel Corporation. */
>> >> > > > +
>> >> > > > +#ifndef _NET_BYPASS_H
>> >> > > > +#define _NET_BYPASS_H
>> >> > > > +
>> >> > > > +#include <linux/netdevice.h>
>> >> > > > +
>> >> > > > +struct bypass_ops {
>> >> > > > + int (*slave_pre_register)(struct net_device *slave_netdev,
>> >> > > > + struct net_device *bypass_netdev);
>> >> > > > + int (*slave_join)(struct net_device *slave_netdev,
>> >> > > > + struct net_device *bypass_netdev);
>> >> > > > + int (*slave_pre_unregister)(struct net_device *slave_netdev,
>> >> > > > + struct net_device *bypass_netdev);
>> >> > > > + int (*slave_release)(struct net_device *slave_netdev,
>> >> > > > + struct net_device *bypass_netdev);
>> >> > > > + int (*slave_link_change)(struct net_device *slave_netdev,
>> >> > > > + struct net_device *bypass_netdev);
>> >> > > > + rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
>> >> > > > +};
>> >> > > > +
>> >> > > > +struct bypass_master {
>> >> > > > + struct list_head list;
>> >> > > > + struct net_device __rcu *bypass_netdev;
>> >> > > > + struct bypass_ops __rcu *ops;
>> >> > > > +};
>> >> > > > +
>> >> > > > +/* bypass state */
>> >> > > > +struct bypass_info {
>> >> > > > + /* passthru netdev with same MAC */
>> >> > > > + struct net_device __rcu *active_netdev;
>> >> > > You still use "active"/"backup" names which is highly misleading as
>> >> > > it has completely different meaning that in bond for example.
>> >> > > I noted that in my previous review already. Please change it.
>> >> > I guess the issue is with only the 'active' name. 'backup' should be fine as it also
>> >> > matches with the BACKUP feature bit we are adding to virtio_net.
>> >> I think that "backup" is also misleading. Both "active" and "backup"
>> >> mean a *state* of slaves. This should be named differently.
>> >>
>> >>
>> >>
>> >> > With regards to alternate names for 'active', you suggested 'stolen', but i
>> >> > am not too happy with it.
>> >> > netvsc uses vf_netdev, are you OK with this? Or another option is 'passthru'
>> >> No. The netdev could be any netdevice. It does not have to be a "VF".
>> >> I think "stolen" is quite appropriate since it describes the modus
>> >> operandi. The bypass master steals some netdevice according to some
>> >> match.
>> >>
>> >> But I don't insist on "stolen". Just sounds right.
>> >
>> >We are adding VIRTIO_NET_F_BACKUP as a new feature bit to enable this feature, So i think
>> >'backup' name is consistent.
>>
>> It perhaps makes sense from the view of virtio device. However, as I
>> described couple of times, for master/slave device the name "backup" is
>> highly misleading.
>
>virtio is the backup. You are supposed to use another
>(typically passthrough) device, if that fails use virtio.
>It does seem appropriate to me. If you like, we can
>change that to "standby". Active I don't like either. "main"?
Sounds much better, yes.
>
>In fact would failover be better than bypass?
Also, much better.
>
>
>>
>> >
>> >The intent is to restrict the 'active' netdev to be a VF. If there is a way to check that
>> >a PCI device is a VF in the guest kernel, we could restrict 'active' netdev to be a VF.
>> >
>> >Will look for any suggestions in the next day or two. If i don't get any, i will go
>> >with 'stolen'
>> >
>> ><snip>
>> >
>> >
>> >> +
>> >> +static struct net_device *bypass_master_get_bymac(u8 *mac,
>> >> + struct bypass_ops **ops)
>> >> +{
>> >> + struct bypass_master *bypass_master;
>> >> + struct net_device *bypass_netdev;
>> >> +
>> >> + spin_lock(&bypass_lock);
>> >> + list_for_each_entry(bypass_master, &bypass_master_list, list) {
>> >> > > As I wrote the last time, you don't need this list, spinlock.
>> >> > > You can do just something like:
>> >> > > for_each_net(net) {
>> >> > > for_each_netdev(net, dev) {
>> >> > > if (netif_is_bypass_master(dev)) {
>> >> > This function returns the upper netdev as well as the ops associated
>> >> > with that netdev.
>> >> > bypass_master_list is a list of 'struct bypass_master' that associates
>> >> Well, can't you have it in netdev priv?
>> >
>> >We cannot do this for 2-netdev model as there is no bypass_netdev created.
>>
>> Howcome? You have no master? I don't understand..
>>
>>
>>
>> >
>> >>
>> >>
>> >> > 'bypass_netdev' with 'bypass_ops' and gets added via bypass_master_register().
>> >> > We need 'ops' only to support the 2 netdev model of netvsc. ops will be
>> >> > NULL for 3-netdev model.
>> >> I see :(
>> >>
>> >>
>> >> >
>> >> > >
>> >> > >
>> >> > >
>> >> > > > + bypass_netdev = rcu_dereference(bypass_master->bypass_netdev);
>> >> > > > + if (ether_addr_equal(bypass_netdev->perm_addr, mac)) {
>> >> > > > + *ops = rcu_dereference(bypass_master->ops);
>> >> > > I don't see how rcu_dereference is ok here.
>> >> > > 1) I don't see rcu_read_lock taken
>> >> > > 2) Looks like bypass_master->ops has the same value across the whole
>> >> > > existence.
>> >> > We hold rtnl_lock(), i think i need to change this to rtnl_dereference.
>> >> > Yes. ops doesn't change.
>> >> If it does not change, you can just access it directly.
>> >>
>> >>
>> >> > >
>> >> > > > + spin_unlock(&bypass_lock);
>> >> > > > + return bypass_netdev;
>> >> > > > + }
>> >> > > > + }
>> >> > > > + spin_unlock(&bypass_lock);
>> >> > > > + return NULL;
>> >> > > > +}
>> >> > > > +
>> >> > > > +static int bypass_slave_register(struct net_device *slave_netdev)
>> >> > > > +{
>> >> > > > + struct net_device *bypass_netdev;
>> >> > > > + struct bypass_ops *bypass_ops;
>> >> > > > + int ret, orig_mtu;
>> >> > > > +
>> >> > > > + ASSERT_RTNL();
>> >> > > > +
>> >> > > > + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> >> > > > + &bypass_ops);
>> >> > > For master, could you use word "master" in the variables so it is clear?
>> >> > > Also, "dev" is fine instead of "netdev".
>> >> > > Something like "bpmaster_dev"
>> >> > bypass_master is of type struct bypass_master, bypass_netdev is of type struct net_device.
>> >> I was trying to point out, that "bypass_netdev" represents a "master"
>> >> netdev, yet it does not say master. That is why I suggested
>> >> "bpmaster_dev"
>> >>
>> >>
>> >> > I can change all _netdev suffixes to _dev to make the names shorter.
>> >> ok.
>> >>
>> >>
>> >> >
>> >> > >
>> >> > > > + if (!bypass_netdev)
>> >> > > > + goto done;
>> >> > > > +
>> >> > > > + ret = bypass_slave_pre_register(slave_netdev, bypass_netdev,
>> >> > > > + bypass_ops);
>> >> > > > + if (ret != 0)
>> >> > > Just "if (ret)" will do. You have this on more places.
>> >> > OK.
>> >> >
>> >> >
>> >> > >
>> >> > > > + goto done;
>> >> > > > +
>> >> > > > + ret = netdev_rx_handler_register(slave_netdev,
>> >> > > > + bypass_ops ? bypass_ops->handle_frame :
>> >> > > > + bypass_handle_frame, bypass_netdev);
>> >> > > > + if (ret != 0) {
>> >> > > > + netdev_err(slave_netdev, "can not register bypass rx handler (err = %d)\n",
>> >> > > > + ret);
>> >> > > > + goto done;
>> >> > > > + }
>> >> > > > +
>> >> > > > + ret = netdev_upper_dev_link(slave_netdev, bypass_netdev, NULL);
>> >> > > > + if (ret != 0) {
>> >> > > > + netdev_err(slave_netdev, "can not set master device %s (err = %d)\n",
>> >> > > > + bypass_netdev->name, ret);
>> >> > > > + goto upper_link_failed;
>> >> > > > + }
>> >> > > > +
>> >> > > > + slave_netdev->priv_flags |= IFF_BYPASS_SLAVE;
>> >> > > > +
>> >> > > > + if (netif_running(bypass_netdev)) {
>> >> > > > + ret = dev_open(slave_netdev);
>> >> > > > + if (ret && (ret != -EBUSY)) {
>> >> > > > + netdev_err(bypass_netdev, "Opening slave %s failed ret:%d\n",
>> >> > > > + slave_netdev->name, ret);
>> >> > > > + goto err_interface_up;
>> >> > > > + }
>> >> > > > + }
>> >> > > > +
>> >> > > > + /* Align MTU of slave with master */
>> >> > > > + orig_mtu = slave_netdev->mtu;
>> >> > > > + ret = dev_set_mtu(slave_netdev, bypass_netdev->mtu);
>> >> > > > + if (ret != 0) {
>> >> > > > + netdev_err(bypass_netdev, "unable to change mtu of %s to %u register failed\n",
>> >> > > > + slave_netdev->name, bypass_netdev->mtu);
>> >> > > > + goto err_set_mtu;
>> >> > > > + }
>> >> > > > +
>> >> > > > + ret = bypass_slave_join(slave_netdev, bypass_netdev, bypass_ops);
>> >> > > > + if (ret != 0)
>> >> > > > + goto err_join;
>> >> > > > +
>> >> > > > + call_netdevice_notifiers(NETDEV_JOIN, slave_netdev);
>> >> > > > +
>> >> > > > + netdev_info(bypass_netdev, "bypass slave:%s registered\n",
>> >> > > > + slave_netdev->name);
>> >> > > > +
>> >> > > > + goto done;
>> >> > > > +
>> >> > > > +err_join:
>> >> > > > + dev_set_mtu(slave_netdev, orig_mtu);
>> >> > > > +err_set_mtu:
>> >> > > > + dev_close(slave_netdev);
>> >> > > > +err_interface_up:
>> >> > > > + netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
>> >> > > > + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
>> >> > > > +upper_link_failed:
>> >> > > > + netdev_rx_handler_unregister(slave_netdev);
>> >> > > > +done:
>> >> > > > + return NOTIFY_DONE;
>> >> > > > +}
>> >> > > > +
>> >> > > > +static int bypass_slave_pre_unregister(struct net_device *slave_netdev,
>> >> > > > + struct net_device *bypass_netdev,
>> >> > > > + struct bypass_ops *bypass_ops)
>> >> > > > +{
>> >> > > > + struct net_device *backup_netdev, *active_netdev;
>> >> > > > + struct bypass_info *bi;
>> >> > > > +
>> >> > > > + if (bypass_ops) {
>> >> > > > + if (!bypass_ops->slave_pre_unregister)
>> >> > > > + return -EINVAL;
>> >> > > > +
>> >> > > > + return bypass_ops->slave_pre_unregister(slave_netdev,
>> >> > > > + bypass_netdev);
>> >> > > > + }
>> >> > > > +
>> >> > > > + bi = netdev_priv(bypass_netdev);
>> >> > > > + active_netdev = rtnl_dereference(bi->active_netdev);
>> >> > > > + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> >> > > > +
>> >> > > > + if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
>> >> > > > + return -EINVAL;
>> >> > > > +
>> >> > > > + return 0;
>> >> > > > +}
>> >> > > > +
>> >> > > > +static int bypass_slave_release(struct net_device *slave_netdev,
>> >> > > > + struct net_device *bypass_netdev,
>> >> > > > + struct bypass_ops *bypass_ops)
>> >> > > > +{
>> >> > > > + struct net_device *backup_netdev, *active_netdev;
>> >> > > > + struct bypass_info *bi;
>> >> > > > +
>> >> > > > + if (bypass_ops) {
>> >> > > > + if (!bypass_ops->slave_release)
>> >> > > > + return -EINVAL;
>> >> > > I think it would be good to make the API to the driver more strict and
>> >> > > have a separate set of ops for "active" and "backup" netdevices.
>> >> > > That should stop people thinking about extending this to more slaves in
>> >> > > the future.
>> >> > We have checks in slave_pre_register() that allows only 1 'backup' and 1
>> >> > 'active' slave.
>> >> I'm very well aware of that. I just thought that explicit ops for the
>> >> two slaves would make this more clear.
>> >>
>> >>
>> >> >
>> >> > >
>> >> > >
>> >> > > > +
>> >> > > > + return bypass_ops->slave_release(slave_netdev, bypass_netdev);
>> >> > > > + }
>> >> > > > +
>> >> > > > + bi = netdev_priv(bypass_netdev);
>> >> > > > + active_netdev = rtnl_dereference(bi->active_netdev);
>> >> > > > + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> >> > > > +
>> >> > > > + if (slave_netdev == backup_netdev) {
>> >> > > > + RCU_INIT_POINTER(bi->backup_netdev, NULL);
>> >> > > > + } else {
>> >> > > > + RCU_INIT_POINTER(bi->active_netdev, NULL);
>> >> > > > + if (backup_netdev) {
>> >> > > > + bypass_netdev->min_mtu = backup_netdev->min_mtu;
>> >> > > > + bypass_netdev->max_mtu = backup_netdev->max_mtu;
>> >> > > > + }
>> >> > > > + }
>> >> > > > +
>> >> > > > + dev_put(slave_netdev);
>> >> > > > +
>> >> > > > + netdev_info(bypass_netdev, "bypass slave:%s released\n",
>> >> > > > + slave_netdev->name);
>> >> > > > +
>> >> > > > + return 0;
>> >> > > > +}
>> >> > > > +
>> >> > > > +int bypass_slave_unregister(struct net_device *slave_netdev)
>> >> > > > +{
>> >> > > > + struct net_device *bypass_netdev;
>> >> > > > + struct bypass_ops *bypass_ops;
>> >> > > > + int ret;
>> >> > > > +
>> >> > > > + if (!netif_is_bypass_slave(slave_netdev))
>> >> > > > + goto done;
>> >> > > > +
>> >> > > > + ASSERT_RTNL();
>> >> > > > +
>> >> > > > + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> >> > > > + &bypass_ops);
>> >> > > > + if (!bypass_netdev)
>> >> > > > + goto done;
>> >> > > > +
>> >> > > > + ret = bypass_slave_pre_unregister(slave_netdev, bypass_netdev,
>> >> > > > + bypass_ops);
>> >> > > > + if (ret != 0)
>> >> > > > + goto done;
>> >> > > > +
>> >> > > > + netdev_rx_handler_unregister(slave_netdev);
>> >> > > > + netdev_upper_dev_unlink(slave_netdev, bypass_netdev);
>> >> > > > + slave_netdev->priv_flags &= ~IFF_BYPASS_SLAVE;
>> >> > > > +
>> >> > > > + bypass_slave_release(slave_netdev, bypass_netdev, bypass_ops);
>> >> > > > +
>> >> > > > + netdev_info(bypass_netdev, "bypass slave:%s unregistered\n",
>> >> > > > + slave_netdev->name);
>> >> > > > +
>> >> > > > +done:
>> >> > > > + return NOTIFY_DONE;
>> >> > > > +}
>> >> > > > +EXPORT_SYMBOL_GPL(bypass_slave_unregister);
>> >> > > > +
>> >> > > > +static bool bypass_xmit_ready(struct net_device *dev)
>> >> > > > +{
>> >> > > > + return netif_running(dev) && netif_carrier_ok(dev);
>> >> > > > +}
>> >> > > > +
>> >> > > > +static int bypass_slave_link_change(struct net_device *slave_netdev)
>> >> > > > +{
>> >> > > > + struct net_device *bypass_netdev, *active_netdev, *backup_netdev;
>> >> > > > + struct bypass_ops *bypass_ops;
>> >> > > > + struct bypass_info *bi;
>> >> > > > +
>> >> > > > + if (!netif_is_bypass_slave(slave_netdev))
>> >> > > > + goto done;
>> >> > > > +
>> >> > > > + ASSERT_RTNL();
>> >> > > > +
>> >> > > > + bypass_netdev = bypass_master_get_bymac(slave_netdev->perm_addr,
>> >> > > > + &bypass_ops);
>> >> > > > + if (!bypass_netdev)
>> >> > > > + goto done;
>> >> > > > +
>> >> > > > + if (bypass_ops) {
>> >> > > > + if (!bypass_ops->slave_link_change)
>> >> > > > + goto done;
>> >> > > > +
>> >> > > > + return bypass_ops->slave_link_change(slave_netdev,
>> >> > > > + bypass_netdev);
>> >> > > > + }
>> >> > > > +
>> >> > > > + if (!netif_running(bypass_netdev))
>> >> > > > + return 0;
>> >> > > > +
>> >> > > > + bi = netdev_priv(bypass_netdev);
>> >> > > > +
>> >> > > > + active_netdev = rtnl_dereference(bi->active_netdev);
>> >> > > > + backup_netdev = rtnl_dereference(bi->backup_netdev);
>> >> > > > +
>> >> > > > + if (slave_netdev != active_netdev && slave_netdev != backup_netdev)
>> >> > > > + goto done;
>> >> > > You don't need this check. "if (!netif_is_bypass_slave(slave_netdev))"
>> >> > > above is enough.
>> >> > I think we need this check to not allow events from a slave that is not
>> >> > attached to this master but has the same MAC.
>> >> Why do we need such events? Seems wrong to me.
>> >
>> >We want to avoid events from a netdev that is mis-configured with the same MAC as
>> >a bypass setup.
>> >
>> >> Consider:
>> >>
>> >> bp1 bp2
>> >> a1 b1 a2 b2
>> >>
>> >>
>> >> a1 and a2 have the same mac and bp1 and bp2 have the same mac.
>> >
>> >We should not have 2 bypass configs with the same MAC.
>> >I need to add a check in the bypass_master_register() to prevent this.
>>
>> Mac can change, you would have to check in change as well. Feels odd
>> thought.
>>
>>
>> >
>> >The above check is to avoid cases where we have
>> >bp1(a1, b1) with mac1
>> >and a2 is mis-configured with mac1, we want to avoid using a2 link events to update bp1.
>> >
>> >> Now bypass_master_get_bymac() will return always bp1 or bp2 - depending on
>> >> the order of creation.
>> >> Let's say it will return bp1. Then when we have event for a2, the
>> >> bypass_ops->slave_link_change is called with (a2, bp1). That is wrong.
>> >>
>> >>
>> >> You cannot use bypass_master_get_bymac() here.
>> >>
>> >>
>> >>
>> >> > >
>> >> > > > +
>> >> > > > + if ((active_netdev && bypass_xmit_ready(active_netdev)) ||
>> >> > > > + (backup_netdev && bypass_xmit_ready(backup_netdev))) {
>> >> > > > + netif_carrier_on(bypass_netdev);
>> >> > > > + netif_tx_wake_all_queues(bypass_netdev);
>> >> > > > + } else {
>> >> > > > + netif_carrier_off(bypass_netdev);
>> >> > > > + netif_tx_stop_all_queues(bypass_netdev);
>> >> > > > + }
>> >> > > > +
>> >> > > > +done:
>> >> > > > + return NOTIFY_DONE;
>> >> > > > +}
>> >> > > > +
>> >> > > > +static bool bypass_validate_event_dev(struct net_device *dev)
>> >> > > > +{
>> >> > > > + /* Skip parent events */
>> >> > > > + if (netif_is_bypass_master(dev))
>> >> > > > + return false;
>> >> > > > +
>> >> > > > + /* Avoid non-Ethernet type devices */
>> >> > > > + if (dev->type != ARPHRD_ETHER)
>> >> > > > + return false;
>> >> > > > +
>> >> > > > + /* Avoid Vlan dev with same MAC registering as VF */
>> >> > > > + if (is_vlan_dev(dev))
>> >> > > > + return false;
>> >> > > > +
>> >> > > > + /* Avoid Bonding master dev with same MAC registering as slave dev */
>> >> > > > + if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
>> >> > > Yeah, this is certainly incorrect. One thing is, you should be using the
>> >> > > helpers netif_is_bond_master().
>> >> > > But what about the rest? macsec, macvlan, team, bridge, ovs and others?
>> >> > >
>> >> > > You need to do it not by blacklisting, but with whitelisting. You need
>> >> > > to whitelist VF devices. My port flavours patchset might help with this.
>> >> > May be i can use netdev_has_lower_dev() helper to make sure that the slave
>> >> I don't see such function in the code.
>> >
>> >It is netdev_has_any_lower_dev(). I need to export it.
>>
>> Come on, you cannot use that. That would allow bonding without slaves,
>> but the slaves could be added later on.
>>
>> What exactly you are trying to achieve by this?
>>
>>
>> >
>> >>
>> >>
>> >> > device is not an upper dev.
>> >> > Can you point to your port flavours patchset? Is it upstream?
>> >> I sent rfc couple of weeks ago:
>> >> [patch net-next RFC 00/12] devlink: introduce port flavours and common phys_port_name generation
>> >
>> >
>> >
^ permalink raw reply
* Re: [RFC PATCH net-next v6 2/4] net: Introduce generic bypass module
From: Samudrala, Sridhar @ 2018-04-18 22:46 UTC (permalink / raw)
To: Jiri Pirko, Michael S. Tsirkin
Cc: alexander.h.duyck, virtio-dev, kubakici, netdev, virtualization,
loseweigh, davem
In-Reply-To: <20180418203206.GC1922@nanopsycho>
On 4/18/2018 1:32 PM, Jiri Pirko wrote:
>>>>>>> You still use "active"/"backup" names which is highly misleading as
>>>>>>> it has completely different meaning that in bond for example.
>>>>>>> I noted that in my previous review already. Please change it.
>>>>>> I guess the issue is with only the 'active' name. 'backup' should be fine as it also
>>>>>> matches with the BACKUP feature bit we are adding to virtio_net.
>>>>> I think that "backup" is also misleading. Both "active" and "backup"
>>>>> mean a *state* of slaves. This should be named differently.
>>>>>
>>>>>
>>>>>
>>>>>> With regards to alternate names for 'active', you suggested 'stolen', but i
>>>>>> am not too happy with it.
>>>>>> netvsc uses vf_netdev, are you OK with this? Or another option is 'passthru'
>>>>> No. The netdev could be any netdevice. It does not have to be a "VF".
>>>>> I think "stolen" is quite appropriate since it describes the modus
>>>>> operandi. The bypass master steals some netdevice according to some
>>>>> match.
>>>>>
>>>>> But I don't insist on "stolen". Just sounds right.
>>>> We are adding VIRTIO_NET_F_BACKUP as a new feature bit to enable this feature, So i think
>>>> 'backup' name is consistent.
>>> It perhaps makes sense from the view of virtio device. However, as I
>>> described couple of times, for master/slave device the name "backup" is
>>> highly misleading.
>> virtio is the backup. You are supposed to use another
>> (typically passthrough) device, if that fails use virtio.
>> It does seem appropriate to me. If you like, we can
>> change that to "standby". Active I don't like either. "main"?
> Sounds much better, yes.
OK. Will change backup to 'standby'.
'main' is fine, what about 'primary'?
>
>
>> In fact would failover be better than bypass?
> Also, much better.
So do we want to change all 'bypass' references to 'failover' including
the filenames.(net/core/failover.c and include/net/failover.h)
<snip>
>
>
>>
>>>> The intent is to restrict the 'active' netdev to be a VF. If there is a way to check that
>>>> a PCI device is a VF in the guest kernel, we could restrict 'active' netdev to be a VF.
>>>>
>>>> Will look for any suggestions in the next day or two. If i don't get any, i will go
>>>> with 'stolen'
>>>>
>>>> <snip>
>>>>
>>>>
>>>>> +
>>>>> +static struct net_device *bypass_master_get_bymac(u8 *mac,
>>>>> + struct bypass_ops **ops)
>>>>> +{
>>>>> + struct bypass_master *bypass_master;
>>>>> + struct net_device *bypass_netdev;
>>>>> +
>>>>> + spin_lock(&bypass_lock);
>>>>> + list_for_each_entry(bypass_master, &bypass_master_list, list) {
>>>>>>> As I wrote the last time, you don't need this list, spinlock.
>>>>>>> You can do just something like:
>>>>>>> for_each_net(net) {
>>>>>>> for_each_netdev(net, dev) {
>>>>>>> if (netif_is_bypass_master(dev)) {
>>>>>> This function returns the upper netdev as well as the ops associated
>>>>>> with that netdev.
>>>>>> bypass_master_list is a list of 'struct bypass_master' that associates
>>>>> Well, can't you have it in netdev priv?
>>>> We cannot do this for 2-netdev model as there is no bypass_netdev created.
>>> Howcome? You have no master? I don't understand..
For 2-netdev model, the master netdev is not a new one created by the bypass module.
It is created by netvsc internally and passed via bypass_master_register()
<snip>
>>>
>>>>>>>> +
>>>>>>>> + /* Avoid Bonding master dev with same MAC registering as slave dev */
>>>>>>>> + if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
>>>>>>> Yeah, this is certainly incorrect. One thing is, you should be using the
>>>>>>> helpers netif_is_bond_master().
>>>>>>> But what about the rest? macsec, macvlan, team, bridge, ovs and others?
>>>>>>>
>>>>>>> You need to do it not by blacklisting, but with whitelisting. You need
>>>>>>> to whitelist VF devices. My port flavours patchset might help with this.
>>>>>> May be i can use netdev_has_lower_dev() helper to make sure that the slave
>>>>> I don't see such function in the code.
>>>> It is netdev_has_any_lower_dev(). I need to export it.
>>> Come on, you cannot use that. That would allow bonding without slaves,
>>> but the slaves could be added later on.
>>>
>>> What exactly you are trying to achieve by this?
I think i can remove this check. In pre-register,
for backup device, i check that its parent matches bypass device &
for vf device, we make sure that it is a pci device.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [virtio-dev] Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
From: Samudrala, Sridhar @ 2018-04-18 23:33 UTC (permalink / raw)
To: Siwei Liu, David Miller
Cc: Alexander Duyck, virtio-dev, Jiri Pirko, Michael S. Tsirkin,
Jakub Kicinski, Netdev, virtualization, David Ahern, si-wei liu
In-Reply-To: <CADGSJ21nQ1BQctfZcThpmOcOUrpqLfG71jsh2trb1utVjNQH=Q@mail.gmail.com>
On 4/17/2018 5:26 PM, Siwei Liu wrote:
> I ran this with a few folks offline and gathered some good feedbacks
> that I'd like to share thus revive the discussion.
>
> First of all, as illustrated in the reply below, cloud service
> providers require transparent live migration. Specifically, the main
> target of our case is to support SR-IOV live migration via kernel
> upgrade while keeping the userspace of old distros unmodified. If it's
> because this use case is not appealing enough for the mainline to
> adopt, I will shut up and not continue discussing, although
> technically it's entirely possible (and there's precedent in other
> implementation) to do so to benefit any cloud service providers.
>
> If it's just the implementation of hiding netdev itself needs to be
> improved, such as implementing it as attribute flag or adding linkdump
> API, that's completely fine and we can look into that. However, the
> specific issue needs to be undestood beforehand is to make transparent
> SR-IOV to be able to take over the name (so inherit all the configs)
> from the lower netdev, which needs some games with uevents and name
> space reservation. So far I don't think it's been well discussed.
>
> One thing in particular I'd like to point out is that the 3-netdev
> model currently missed to address the core problem of live migration:
> migration of hardware specific feature/state, for e.g. ethtool configs
> and hardware offloading states. Only general network state (IP
> address, gateway, for eg.) associated with the bypass interface can be
> migrated. As a follow-up work, bypass driver can/should be enhanced to
> save and apply those hardware specific configs before or after
> migration as needed. The transparent 1-netdev model being proposed as
> part of this patch series will be able to solve that problem naturally
> by making all hardware specific configurations go through the central
> bypass driver, such that hardware configurations can be replayed when
> new VF or passthrough gets plugged back in. Although that
> corresponding function hasn't been implemented today, I'd like to
> refresh everyone's mind that is the core problem any live migration
> proposal should have addressed.
>
> If it would make things more clear to defer netdev hiding until all
> functionalities regarding centralizing and replay are implemented,
> we'd take advices like that and move on to implementing those features
> as follow-up patches. Once all needed features get done, we'd resume
> the work for hiding lower netdev at that point. Think it would be the
> best to make everyone understand the big picture in advance before
> going too far.
I think we should get the 3-netdev model integrated and add any additional
ndo_ops/ethool ops that we would like to support/migrate before looking into
hiding the lower netdevs.
>
> Thanks, comments welcome.
>
> -Siwei
>
>
> On Mon, Apr 9, 2018 at 11:48 PM, Siwei Liu <loseweigh@gmail.com> wrote:
>> On Sun, Apr 8, 2018 at 9:32 AM, David Miller <davem@davemloft.net> wrote:
>>> From: Siwei Liu <loseweigh@gmail.com>
>>> Date: Fri, 6 Apr 2018 19:32:05 -0700
>>>
>>>> And I assume everyone here understands the use case for live
>>>> migration (in the context of providing cloud service) is very
>>>> different, and we have to hide the netdevs. If not, I'm more than
>>>> happy to clarify.
>>> I think you still need to clarify.
>> OK. The short answer is cloud users really want *transparent* live migration.
>>
>> By being transparent it means they don't and shouldn't care about the
>> existence and the occurence of live migration, but they do if
>> userspace toolstack and libraries have to be updated or modified,
>> which means potential dependency brokeness of their applications. They
>> don't like any change to the userspace envinroment (existing apps
>> lift-and-shift, no recompilation, no re-packaging, no re-certification
>> needed), while no one barely cares about ABI or API compatibility in
>> the kernel level, as long as their applications don't break.
>>
>> I agree the current bypass solution for SR-IOV live migration requires
>> guest cooperation. Though it doesn't mean guest *userspace*
>> cooperation. As a matter of fact, techinically it shouldn't invovle
>> userspace at all to get SR-IOV migration working. It's the kernel that
>> does the real work. If I understand the goal of this in-kernel
>> approach correctly, it was meant to save userspace from modification
>> or corresponding toolstack support, as those additional 2 interfaces
>> is more a side product of this approach, rather than being neccessary
>> for users to be aware of. All what the user needs to deal with is one
>> single interface, and that's what they care about. It's more a trouble
>> than help when they see 2 extra interfaces are present. Management
>> tools in the old distros don't recoginze them and try to bring up
>> those extra interfaces for its own. Various odd warnings start to spew
>> out, and there's a lot of caveats for the users to get around...
>>
>> On the other hand, if we "teach" those cloud users to update the
>> userspace toolstack just for trading a feature they don't need, no one
>> is likely going to embrace the change. As such there's just no real
>> value of adopting this in-kernel bypass facility for any cloud service
>> provider. It does not look more appealing than just configure generic
>> bonding using its own set of daemons or scripts. But again, cloud
>> users don't welcome that facility. And basically it would get to
>> nearly the same set of problems if leaving userspace alone.
>>
>> IMHO we're not hiding the devices, think it the way we're adding a
>> feature transparent to user. Those auto-managed slaves are ones users
>> don't care about much. And user is still able to see and configure the
>> lower netdevs if they really desires to do so. But generally the
>> target user for this feature won't need to know that. Why they care
>> how many interfaces a VM virtually has rather than how many interfaces
>> are actually _useable_ to them??
>>
>> Thanks,
>> -Siwei
>>
>>
>>> netdevs are netdevs. If they have special attributes, mark them as
>>> such and the tools base their actions upon that.
>>>
>>> "Hiding", or changing classes, doesn't make any sense to me still.
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: virtio-dev-unsubscribe@lists.oasis-open.org
> For additional commands, e-mail: virtio-dev-help@lists.oasis-open.org
>
^ permalink raw reply
* Re: [RFC PATCH net-next v6 2/4] net: Introduce generic bypass module
From: Michael S. Tsirkin @ 2018-04-19 4:08 UTC (permalink / raw)
To: Jiri Pirko
Cc: alexander.h.duyck, virtio-dev, kubakici, Samudrala, Sridhar,
virtualization, loseweigh, netdev, davem
In-Reply-To: <20180418203206.GC1922@nanopsycho>
On Wed, Apr 18, 2018 at 10:32:06PM +0200, Jiri Pirko wrote:
> >> >> > With regards to alternate names for 'active', you suggested 'stolen', but i
> >> >> > am not too happy with it.
> >> >> > netvsc uses vf_netdev, are you OK with this? Or another option is 'passthru'
> >> >> No. The netdev could be any netdevice. It does not have to be a "VF".
> >> >> I think "stolen" is quite appropriate since it describes the modus
> >> >> operandi. The bypass master steals some netdevice according to some
> >> >> match.
> >> >>
> >> >> But I don't insist on "stolen". Just sounds right.
> >> >
> >> >We are adding VIRTIO_NET_F_BACKUP as a new feature bit to enable this feature, So i think
> >> >'backup' name is consistent.
> >>
> >> It perhaps makes sense from the view of virtio device. However, as I
> >> described couple of times, for master/slave device the name "backup" is
> >> highly misleading.
> >
> >virtio is the backup. You are supposed to use another
> >(typically passthrough) device, if that fails use virtio.
> >It does seem appropriate to me. If you like, we can
> >change that to "standby". Active I don't like either. "main"?
>
> Sounds much better, yes.
Excuse me, which of the versions are better in your eyes?
>
> >
> >In fact would failover be better than bypass?
>
> Also, much better.
>
^ permalink raw reply
* Re: [virtio-dev] Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
From: Michael S. Tsirkin @ 2018-04-19 4:41 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: Alexander Duyck, virtio-dev, Jiri Pirko, Jakub Kicinski, Netdev,
virtualization, Siwei Liu, David Ahern, si-wei liu, David Miller
In-Reply-To: <1f3af59f-fd64-cc0d-f9eb-668636c52db4@intel.com>
On Wed, Apr 18, 2018 at 04:33:34PM -0700, Samudrala, Sridhar wrote:
> On 4/17/2018 5:26 PM, Siwei Liu wrote:
> > I ran this with a few folks offline and gathered some good feedbacks
> > that I'd like to share thus revive the discussion.
> >
> > First of all, as illustrated in the reply below, cloud service
> > providers require transparent live migration. Specifically, the main
> > target of our case is to support SR-IOV live migration via kernel
> > upgrade while keeping the userspace of old distros unmodified. If it's
> > because this use case is not appealing enough for the mainline to
> > adopt, I will shut up and not continue discussing, although
> > technically it's entirely possible (and there's precedent in other
> > implementation) to do so to benefit any cloud service providers.
> >
> > If it's just the implementation of hiding netdev itself needs to be
> > improved, such as implementing it as attribute flag or adding linkdump
> > API, that's completely fine and we can look into that. However, the
> > specific issue needs to be undestood beforehand is to make transparent
> > SR-IOV to be able to take over the name (so inherit all the configs)
> > from the lower netdev, which needs some games with uevents and name
> > space reservation. So far I don't think it's been well discussed.
> >
> > One thing in particular I'd like to point out is that the 3-netdev
> > model currently missed to address the core problem of live migration:
> > migration of hardware specific feature/state, for e.g. ethtool configs
> > and hardware offloading states. Only general network state (IP
> > address, gateway, for eg.) associated with the bypass interface can be
> > migrated. As a follow-up work, bypass driver can/should be enhanced to
> > save and apply those hardware specific configs before or after
> > migration as needed. The transparent 1-netdev model being proposed as
> > part of this patch series will be able to solve that problem naturally
> > by making all hardware specific configurations go through the central
> > bypass driver, such that hardware configurations can be replayed when
> > new VF or passthrough gets plugged back in. Although that
> > corresponding function hasn't been implemented today, I'd like to
> > refresh everyone's mind that is the core problem any live migration
> > proposal should have addressed.
> >
> > If it would make things more clear to defer netdev hiding until all
> > functionalities regarding centralizing and replay are implemented,
> > we'd take advices like that and move on to implementing those features
> > as follow-up patches. Once all needed features get done, we'd resume
> > the work for hiding lower netdev at that point. Think it would be the
> > best to make everyone understand the big picture in advance before
> > going too far.
>
> I think we should get the 3-netdev model integrated and add any additional
> ndo_ops/ethool ops that we would like to support/migrate before looking into
> hiding the lower netdevs.
Once they are exposed, I don't think we'll be able to hide them -
they will be a kernel ABI.
Do you think everyone needs to hide the SRIOV device?
Or that only some users need this?
--
MST
^ permalink raw reply
* Re: [virtio-dev] Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
From: Samudrala, Sridhar @ 2018-04-19 5:00 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Alexander Duyck, virtio-dev, Jiri Pirko, Jakub Kicinski, Netdev,
virtualization, Siwei Liu, David Ahern, si-wei liu, David Miller
In-Reply-To: <20180419072003-mutt-send-email-mst@kernel.org>
On 4/18/2018 9:41 PM, Michael S. Tsirkin wrote:
> On Wed, Apr 18, 2018 at 04:33:34PM -0700, Samudrala, Sridhar wrote:
>> On 4/17/2018 5:26 PM, Siwei Liu wrote:
>>> I ran this with a few folks offline and gathered some good feedbacks
>>> that I'd like to share thus revive the discussion.
>>>
>>> First of all, as illustrated in the reply below, cloud service
>>> providers require transparent live migration. Specifically, the main
>>> target of our case is to support SR-IOV live migration via kernel
>>> upgrade while keeping the userspace of old distros unmodified. If it's
>>> because this use case is not appealing enough for the mainline to
>>> adopt, I will shut up and not continue discussing, although
>>> technically it's entirely possible (and there's precedent in other
>>> implementation) to do so to benefit any cloud service providers.
>>>
>>> If it's just the implementation of hiding netdev itself needs to be
>>> improved, such as implementing it as attribute flag or adding linkdump
>>> API, that's completely fine and we can look into that. However, the
>>> specific issue needs to be undestood beforehand is to make transparent
>>> SR-IOV to be able to take over the name (so inherit all the configs)
>>> from the lower netdev, which needs some games with uevents and name
>>> space reservation. So far I don't think it's been well discussed.
>>>
>>> One thing in particular I'd like to point out is that the 3-netdev
>>> model currently missed to address the core problem of live migration:
>>> migration of hardware specific feature/state, for e.g. ethtool configs
>>> and hardware offloading states. Only general network state (IP
>>> address, gateway, for eg.) associated with the bypass interface can be
>>> migrated. As a follow-up work, bypass driver can/should be enhanced to
>>> save and apply those hardware specific configs before or after
>>> migration as needed. The transparent 1-netdev model being proposed as
>>> part of this patch series will be able to solve that problem naturally
>>> by making all hardware specific configurations go through the central
>>> bypass driver, such that hardware configurations can be replayed when
>>> new VF or passthrough gets plugged back in. Although that
>>> corresponding function hasn't been implemented today, I'd like to
>>> refresh everyone's mind that is the core problem any live migration
>>> proposal should have addressed.
>>>
>>> If it would make things more clear to defer netdev hiding until all
>>> functionalities regarding centralizing and replay are implemented,
>>> we'd take advices like that and move on to implementing those features
>>> as follow-up patches. Once all needed features get done, we'd resume
>>> the work for hiding lower netdev at that point. Think it would be the
>>> best to make everyone understand the big picture in advance before
>>> going too far.
>> I think we should get the 3-netdev model integrated and add any additional
>> ndo_ops/ethool ops that we would like to support/migrate before looking into
>> hiding the lower netdevs.
> Once they are exposed, I don't think we'll be able to hide them -
> they will be a kernel ABI.
>
> Do you think everyone needs to hide the SRIOV device?
> Or that only some users need this?
Hyper-V is currently supporting live migration without hiding the SR-IOV device. So i don't
think it is a hard requirement. And also, as we don't yet have a consensus on how to hide
the lower netdevs, we could make it as another feature bit to hide lower netdevs once
we have an acceptable solution.
^ permalink raw reply
* [PATCH net] virtio_net: split out ctrl buffer
From: Michael S. Tsirkin @ 2018-04-19 5:01 UTC (permalink / raw)
To: linux-kernel
Cc: Eric Dumazet, netdev, virtualization, Mikulas Patocka,
David Miller
When sending control commands, virtio net sets up several buffers for
DMA. The buffers are all part of the net device which means it's
actually allocated by kvmalloc so in theory (on extreme memory pressure)
it's possible to get a vmalloc'ed buffer which on some platforms means
we can't DMA there.
Fix up by moving the DMA buffers out into a separate structure.
Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
Lightly tested.
Mikulas, does this address your problem?
drivers/net/virtio_net.c | 68 +++++++++++++++++++++++++++---------------------
1 file changed, 39 insertions(+), 29 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7b187ec..82f50e5 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -147,6 +147,17 @@ struct receive_queue {
struct xdp_rxq_info xdp_rxq;
};
+/* Control VQ buffers: protected by the rtnl lock */
+struct control_buf {
+ struct virtio_net_ctrl_hdr hdr;
+ virtio_net_ctrl_ack status;
+ struct virtio_net_ctrl_mq mq;
+ u8 promisc;
+ u8 allmulti;
+ u16 vid;
+ u64 offloads;
+};
+
struct virtnet_info {
struct virtio_device *vdev;
struct virtqueue *cvq;
@@ -192,14 +203,7 @@ struct virtnet_info {
struct hlist_node node;
struct hlist_node node_dead;
- /* Control VQ buffers: protected by the rtnl lock */
- struct virtio_net_ctrl_hdr ctrl_hdr;
- virtio_net_ctrl_ack ctrl_status;
- struct virtio_net_ctrl_mq ctrl_mq;
- u8 ctrl_promisc;
- u8 ctrl_allmulti;
- u16 ctrl_vid;
- u64 ctrl_offloads;
+ struct control_buf *ctrl;
/* Ethtool settings */
u8 duplex;
@@ -1454,25 +1458,25 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
/* Caller should know better */
BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
- vi->ctrl_status = ~0;
- vi->ctrl_hdr.class = class;
- vi->ctrl_hdr.cmd = cmd;
+ vi->ctrl->status = ~0;
+ vi->ctrl->hdr.class = class;
+ vi->ctrl->hdr.cmd = cmd;
/* Add header */
- sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr));
+ sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
sgs[out_num++] = &hdr;
if (out)
sgs[out_num++] = out;
/* Add return status. */
- sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status));
+ sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
sgs[out_num] = &stat;
BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
if (unlikely(!virtqueue_kick(vi->cvq)))
- return vi->ctrl_status == VIRTIO_NET_OK;
+ return vi->ctrl->status == VIRTIO_NET_OK;
/* Spin for a response, the kick causes an ioport write, trapping
* into the hypervisor, so the request should be handled immediately.
@@ -1481,7 +1485,7 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
!virtqueue_is_broken(vi->cvq))
cpu_relax();
- return vi->ctrl_status == VIRTIO_NET_OK;
+ return vi->ctrl->status == VIRTIO_NET_OK;
}
static int virtnet_set_mac_address(struct net_device *dev, void *p)
@@ -1593,8 +1597,8 @@ static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
return 0;
- vi->ctrl_mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
- sg_init_one(&sg, &vi->ctrl_mq, sizeof(vi->ctrl_mq));
+ vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
+ sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
@@ -1653,22 +1657,22 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
return;
- vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0);
- vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
+ vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
+ vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
- sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc));
+ sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
VIRTIO_NET_CTRL_RX_PROMISC, sg))
dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
- vi->ctrl_promisc ? "en" : "dis");
+ vi->ctrl->promisc ? "en" : "dis");
- sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti));
+ sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
- vi->ctrl_allmulti ? "en" : "dis");
+ vi->ctrl->allmulti ? "en" : "dis");
uc_count = netdev_uc_count(dev);
mc_count = netdev_mc_count(dev);
@@ -1714,8 +1718,8 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev,
struct virtnet_info *vi = netdev_priv(dev);
struct scatterlist sg;
- vi->ctrl_vid = vid;
- sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
+ vi->ctrl->vid = vid;
+ sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
VIRTIO_NET_CTRL_VLAN_ADD, &sg))
@@ -1729,8 +1733,8 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
struct virtnet_info *vi = netdev_priv(dev);
struct scatterlist sg;
- vi->ctrl_vid = vid;
- sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
+ vi->ctrl->vid = vid;
+ sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
VIRTIO_NET_CTRL_VLAN_DEL, &sg))
@@ -2126,9 +2130,9 @@ static int virtnet_restore_up(struct virtio_device *vdev)
static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
{
struct scatterlist sg;
- vi->ctrl_offloads = cpu_to_virtio64(vi->vdev, offloads);
+ vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);
- sg_init_one(&sg, &vi->ctrl_offloads, sizeof(vi->ctrl_offloads));
+ sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
@@ -2351,6 +2355,7 @@ static void virtnet_free_queues(struct virtnet_info *vi)
kfree(vi->rq);
kfree(vi->sq);
+ kfree(vi->err_ctrl);
}
static void _free_receive_bufs(struct virtnet_info *vi)
@@ -2543,6 +2548,9 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
{
int i;
+ vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
+ if (!vi->ctrl)
+ goto err_ctrl;
vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
if (!vi->sq)
goto err_sq;
@@ -2571,6 +2579,8 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
err_rq:
kfree(vi->sq);
err_sq:
+ kfree(vq->ctrl);
+err_ctrl:
return -ENOMEM;
}
--
MST
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox