Netdev List
 help / color / mirror / Atom feed
* Packet corrupts to guest system system from host kernel 4.16.x
From: Алексей Болдырев @ 2018-04-06  0:49 UTC (permalink / raw)
  To: qemu-discuss, netdev

Why, when using the 4.16 kernel on the host system, is there such a strange behavior of virtual machines? What is the problem? He rolled back to 4.9.c - the problem was gone.

tcpdump on router:
tcpdump: listening on vlan-00110013, link-type EN10MB (Ethernet), capture size 262144 bytes
23:59:08.331875 52:54:00:38:cf:94 > 52:54:00:4e:9c:5f, ethertype ARP (0x0806), length 38: [|ARP]
	0x0000:  0001 0800 0604 0001 5254 0038 cf94 c0a8  ........RT.8....
	0x0010:  b402 0000 0000 0000                      ........
23:59:08.454364 52:54:00:4e:9c:5f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Ethernet (len 6), IPv4 (len 4), Request who-has 192.168.180.2 tell 192.168.180.1, length 28
23:59:08.454754 52:54:00:38:cf:94 > 52:54:00:4e:9c:5f, ethertype ARP (0x0806), length 38: [|ARP]
	0x0000:  0001 0800 0604 0002 5254 0038 cf94 c0a8  ........RT.8....
	0x0010:  b402 5254 004e 9c5f                      ..RT.N._
23:59:09.462383 52:54:00:4e:9c:5f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Ethernet (len 6), IPv4 (len 4), Request who-has 192.168.180.2 tell 192.168.180.1, length 28
23:59:09.463607 52:54:00:38:cf:94 > 52:54:00:4e:9c:5f, ethertype ARP (0x0806), length 38: [|ARP]
	0x0000:  0001 0800 0604 0002 5254 0038 cf94 c0a8  ........RT.8....
	0x0010:  b402 5254 004e 9c5f                      ..RT.N._
23:59:10.486352 52:54:00:4e:9c:5f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Ethernet (len 6), IPv4 (len 4), Request who-has 192.168.180.2 tell 192.168.180.1, length 28
23:59:10.487303 52:54:00:38:cf:94 > 52:54:00:4e:9c:5f, ethertype ARP (0x0806), length 38: [|ARP]
	0x0000:  0001 0800 0604 0002 5254 0038 cf94 c0a8  ........RT.8....
	0x0010:  b402 5254 004e 9c5f                      ..RT.N._
23:59:11.051570 52:54:00:38:cf:94 > 52:54:00:4e:9c:5f, ethertype IPv4 (0x0800), length 77: truncated-ip - 4 bytes missing! (tos 0x0, ttl 64, id 54539, offset 0, flags [DF], proto UDP (17), length 67)
    192.168.180.2.59917 > 198.18.120.1.53: 1196+[|domain]
23:59:11.051585 52:54:00:38:cf:94 > 52:54:00:4e:9c:5f, ethertype IPv4 (0x0800), length 77: truncated-ip - 4 bytes missing! (tos 0x0, ttl 64, id 54540, offset 0, flags [DF], proto UDP (17), length 67)
    192.168.180.2.59917 > 198.18.120.1.53: 5849+[|domain]
23:59:11.527109 52:54:00:4e:9c:5f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Ethernet (len 6), IPv4 (len 4), Request who-has 192.168.180.2 tell 192.168.180.1, length 28
23:59:11.527707 52:54:00:38:cf:94 > 52:54:00:4e:9c:5f, ethertype ARP (0x0806), length 38: [|ARP]
	0x0000:  0001 0800 0604 0002 5254 0038 cf94 c0a8  ........RT.8....
	0x0010:  b402 5254 004e 9c5f                      ..RT.N._
^C
11 packets captured
11 packets received by filter
0 packets dropped by kernel

^ permalink raw reply

* [PATCH net] net/sched: fix NULL dereference in the error path of tcf_bpf_init()
From: Davide Caratti @ 2018-04-05 23:19 UTC (permalink / raw)
  To: David S. Miller; +Cc: Jamal Hadi Salim, Cong Wang, netdev, Lucas Bates

when tcf_bpf_init_from_ops() fails (e.g. because of program having invalid
number of instructions), tcf_bpf_cfg_cleanup() calls bpf_prog_put(NULL) or
bpf_prog_destroy(NULL). Unless CONFIG_BPF_SYSCALL is unset, this causes
the following error:

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000020
 PGD 800000007345a067 P4D 800000007345a067 PUD 340e1067 PMD 0
 Oops: 0000 [#1] SMP PTI
 Modules linked in: act_bpf(E) ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 mbcache jbd2 crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_codec_generic pcbc snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd glue_helper cryptd joydev snd_timer snd virtio_balloon pcspkr soundcore i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm virtio_blk drm virtio_net virtio_console i2c_core crc32c_intel serio_raw virtio_pci ata_piix libata virtio_ring floppy virtio dm_mirror dm_region_hash dm_log dm_mod [last unloaded: act_bpf]
 CPU: 3 PID: 5654 Comm: tc Tainted: G            E    4.16.0.bpf_test+ #408
 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
 RIP: 0010:__bpf_prog_put+0xc/0xc0
 RSP: 0018:ffff9594003ef728 EFLAGS: 00010202
 RAX: 0000000000000000 RBX: ffff9594003ef758 RCX: 0000000000000024
 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000
 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000044
 R10: 0000000000000220 R11: ffff8a7ab9f17131 R12: 0000000000000000
 R13: ffff8a7ab7c3c8e0 R14: 0000000000000001 R15: ffff8a7ab88f1054
 FS:  00007fcb2f17c740(0000) GS:ffff8a7abfd80000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000020 CR3: 000000007c888006 CR4: 00000000001606e0
 Call Trace:
  tcf_bpf_cfg_cleanup+0x2f/0x40 [act_bpf]
  tcf_bpf_cleanup+0x4c/0x70 [act_bpf]
  __tcf_idr_release+0x79/0x140
  tcf_bpf_init+0x125/0x330 [act_bpf]
  tcf_action_init_1+0x2cc/0x430
  ? get_page_from_freelist+0x3f0/0x11b0
  tcf_action_init+0xd3/0x1b0
  tc_ctl_action+0x18b/0x240
  rtnetlink_rcv_msg+0x29c/0x310
  ? _cond_resched+0x15/0x30
  ? __kmalloc_node_track_caller+0x1b9/0x270
  ? rtnl_calcit.isra.29+0x100/0x100
  netlink_rcv_skb+0xd2/0x110
  netlink_unicast+0x17c/0x230
  netlink_sendmsg+0x2cd/0x3c0
  sock_sendmsg+0x30/0x40
  ___sys_sendmsg+0x27a/0x290
  ? mem_cgroup_commit_charge+0x80/0x130
  ? page_add_new_anon_rmap+0x73/0xc0
  ? do_anonymous_page+0x2a2/0x560
  ? __handle_mm_fault+0xc75/0xe20
  __sys_sendmsg+0x58/0xa0
  do_syscall_64+0x6e/0x1a0
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
 RIP: 0033:0x7fcb2e58eba0
 RSP: 002b:00007ffc93c496c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
 RAX: ffffffffffffffda RBX: 00007ffc93c497f0 RCX: 00007fcb2e58eba0
 RDX: 0000000000000000 RSI: 00007ffc93c49740 RDI: 0000000000000003
 RBP: 000000005ac6a646 R08: 0000000000000002 R09: 0000000000000000
 R10: 00007ffc93c49120 R11: 0000000000000246 R12: 0000000000000000
 R13: 00007ffc93c49804 R14: 0000000000000001 R15: 000000000066afa0
 Code: 5f 00 48 8b 43 20 48 c7 c7 70 2f 7c b8 c7 40 10 00 00 00 00 5b e9 a5 8b 61 00 0f 1f 44 00 00 0f 1f 44 00 00 41 54 55 48 89 fd 53 <48> 8b 47 20 f0 ff 08 74 05 5b 5d 41 5c c3 41 89 f4 0f 1f 44 00
 RIP: __bpf_prog_put+0xc/0xc0 RSP: ffff9594003ef728
 CR2: 0000000000000020

Fix it in tcf_bpf_cfg_cleanup(), ensuring that bpf_prog_{put,destroy}(f)
is called only when f is not NULL.

Fixes: bbc09e7842a5 ("net/sched: fix idr leak on the error path of tcf_bpf_init()")
Reported-by: Lucas Bates <lucasb@mojatatu.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 net/sched/act_bpf.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 9092531d45d8..18089c02e557 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -248,10 +248,14 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
 
 static void tcf_bpf_cfg_cleanup(const struct tcf_bpf_cfg *cfg)
 {
-	if (cfg->is_ebpf)
-		bpf_prog_put(cfg->filter);
-	else
-		bpf_prog_destroy(cfg->filter);
+	struct bpf_prog *filter = cfg->filter;
+
+	if (filter) {
+		if (cfg->is_ebpf)
+			bpf_prog_put(filter);
+		else
+			bpf_prog_destroy(filter);
+	}
 
 	kfree(cfg->bpf_ops);
 	kfree(cfg->bpf_name);
-- 
2.14.3

^ permalink raw reply related

* Re: Enable and configure storm prevention in a network device
From: Florian Fainelli @ 2018-04-05 22:35 UTC (permalink / raw)
  To: David Miller, m-karicheri2; +Cc: netdev, andrew
In-Reply-To: <20180405.162031.2009953983418308744.davem@davemloft.net>

On 04/05/2018 01:20 PM, David Miller wrote:
> From: Murali Karicheri <m-karicheri2@ti.com>
> Date: Thu, 5 Apr 2018 16:14:49 -0400
> 
>> Is there a standard way to implement and configure storm prevention
>> in a Linux network device?
> 
> What kind of "storm", an interrupt storm?
> 

I would assume Murali is referring to L2 broadcast storms which is
common in switches. There is not an API for that AFAICT and I am not
sure what a proper API would look like.
-- 
Florian

^ permalink raw reply

* Re: [PATCH net] net: dsa: Discard frames from unused ports
From: Florian Fainelli @ 2018-04-05 22:20 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: David Miller, netdev, Vivien Didelot
In-Reply-To: <20180405021757.GA1838@lunn.ch>

On 04/04/2018 07:17 PM, Andrew Lunn wrote:
> On Wed, Apr 04, 2018 at 05:49:10PM -0700, Florian Fainelli wrote:
>> On 04/04/2018 04:56 PM, Andrew Lunn wrote:
>>> The Marvell switches under some conditions will pass a frame to the
>>> host with the port being the CPU port. Such frames are invalid, and
>>> should be dropped. Not dropping them can result in a crash when
>>> incrementing the receive statistics for an invalid port.
>>>
>>> Reported-by: Chris Healy <cphealy@gmail.com>
>>> Fixes: 5f6b4e14cada ("net: dsa: User per-cpu 64-bit statistics")
>>
>> Are you sure this is the commit that introduced the problem?
> 
> Hi Florian
> 
> Well, the problem is it crashes when trying to update the
> statistics. The CPU port is not allocated a p->stats64, only slave
> ports get those. So before this patch, there was no crash and the
> frame would be delivered to the master interface. This in itself is
> probably not correct, but also not fatal. Talking to Chris, it seems
> this behaviour has existing for a long while. I needed to use lldpd to
> trigger the issue, because i assume the Marvell switch sees these as
> special frames and forwards them to the CPU. The other thing is, the
> code got refactored recently. So this fix will not rebase to too many
> earlier versions. It needs a fix per tagging protocol for before the
> common dsa_master_find_slave() was added.

Yes what you are explaining makes sense, but does not that mean we would
just be accessing a garbage memory location before as well? It seems to
me like this problem dates back from long before that particular
changeset, and this changeset just made it more obvious. Would that be
accurate?

> 
>>> -	return ds->ports[port].slave;
>>> +	slave_port = &ds->ports[port];
>>> +
>>> +	if (slave_port->type != DSA_PORT_TYPE_USER)
>>
>> Can we optimize this with an unlikely()?
> 
> Yes, that would make sense.
> 
>      Andrew
> 


-- 
Florian

^ permalink raw reply

* Re: Best userspace programming API for XDP features query to kernel?
From: Jakub Kicinski via iovisor-dev @ 2018-04-05 21:47 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: oisf-devel-ZwoEplunGu2j570ONfqVQLVmwVP6tfMwSoIsB4E12gc,
	Alexei Starovoitov, Victor Julien, netdev-u79uwXL29TY76Z2rM5mHXA,
	iovisor-dev-9jONkmmOlFHEE9lA1F8Ukti2O/JbrIOy@public.gmane.org,
	Peter Manev, Jiri Benc, Saeed Mahameed, Eric Leblond,
	Daniel Borkmann
In-Reply-To: <20180405225133.18a09883-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

On Thu, 5 Apr 2018 22:51:33 +0200, Jesper Dangaard Brouer wrote:
> > What about nfp in terms of XDP
> > offload capabilities, should they be included as well or is probing to load
> > the program and see if it loads/JITs as we do today just fine (e.g. you'd
> > otherwise end up with extra flags on a per BPF helper basis)?  
> 
> No, flags per BPF helper basis. As I've described above, helper belong
> to the BPF core, not the driver.  Here I want to know what the specific
> driver support.

I think Daniel meant for nfp offload.  The offload restrictions are
quite involved, are we going to be able to express those?

This is a bit simpler but reminds me of the TC flower capability
discussion.  Expressing features and capabilities gets messy quickly.

I have a gut feeling that a good starting point would be defining and
building a test suite or a set of probing tests to check things work at
system level (incl. redirects to different ports etc.)  I think having
a concrete set of litmus tests that confirm the meaning of a given
feature/capability would go a long way in making people more comfortable
with accepting any form of BPF driver capability.  And serious BPF
projects already do probing so it's just centralizing this in the
kernel.

That's my two cents.

^ permalink raw reply

* [PATCH net] net/ipv6: Increment OUTxxx counters after netfilter hook
From: Jeff Barnhill @ 2018-04-05 21:29 UTC (permalink / raw)
  To: netdev; +Cc: dsahern, Jeff Barnhill

At the end of ip6_forward(), IPSTATS_MIB_OUTFORWDATAGRAMS and
IPSTATS_MIB_OUTOCTETS are incremented immediately before the NF_HOOK call
for NFPROTO_IPV6 / NF_INET_FORWARD.  As a result, these counters get
incremented regardless of whether or not the netfilter hook allows the
packet to continue being processed.  This change increments the counters
in ip6_forward_finish() so that it will not happen if the netfilter hook
chooses to terminate the packet, which is similar to how IPv4 works.

Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com>
---
 net/ipv6/ip6_output.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b8ee50e94af3..2e891d2c30ef 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -375,6 +375,11 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 				     struct sk_buff *skb)
 {
+	struct dst_entry *dst = skb_dst(skb);
+
+	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
+	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
+
 	return dst_output(net, sk, skb);
 }
 
@@ -569,8 +574,6 @@ int ip6_forward(struct sk_buff *skb)
 
 	hdr->hop_limit--;
 
-	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
-	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 		       net, NULL, skb, skb->dev, dst->dev,
 		       ip6_forward_finish);
-- 
2.14.1

^ permalink raw reply related

* Re: [patch net] devlink: convert occ_get op to separate registration
From: Jiri Pirko @ 2018-04-05 21:17 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, davem, idosch, jakub.kicinski, mlxsw
In-Reply-To: <0880af20-f0a2-bc19-6b1d-0e42a7937184@gmail.com>

Thu, Apr 05, 2018 at 10:55:58PM CEST, dsahern@gmail.com wrote:
>On 4/5/18 2:13 PM, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>> 
>> This resolves race during initialization where the resources with
>> ops are registered before driver and the structures used by occ_get
>> op is initialized. So keep occ_get callbacks registered only when
>> all structs are initialized.
>> 
>> The example flows, as it is in mlxsw:
>> 1) driver load/asic probe:
>>    mlxsw_core
>>       -> mlxsw_sp_resources_register
>>         -> mlxsw_sp_kvdl_resources_register
>>           -> devlink_resource_register IDX
>>    mlxsw_spectrum
>>       -> mlxsw_sp_kvdl_init
>>         -> mlxsw_sp_kvdl_parts_init
>>           -> mlxsw_sp_kvdl_part_init
>>             -> devlink_resource_size_get IDX (to get the current setup
>>                                               size from devlink)
>>         -> devlink_resource_occ_get_register IDX (register current
>>                                                   occupancy getter)
>> 2) reload triggered by devlink command:
>>   -> mlxsw_devlink_core_bus_device_reload
>>     -> mlxsw_sp_fini
>>       -> mlxsw_sp_kvdl_fini
>> 	-> devlink_resource_occ_get_unregister IDX
>>     (struct mlxsw_sp *mlxsw_sp is freed at this point, call to occ get
>>      which is using mlxsw_sp would cause use-after free)
>>     -> mlxsw_sp_init
>>       -> mlxsw_sp_kvdl_init
>>         -> mlxsw_sp_kvdl_parts_init
>>           -> mlxsw_sp_kvdl_part_init
>>             -> devlink_resource_size_get IDX (to get the current setup
>>                                               size from devlink)
>>         -> devlink_resource_occ_get_register IDX (register current
>>                                                   occupancy getter)
>> 
>> Fixes: d9f9b9a4d05f ("devlink: Add support for resource abstraction")
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>
>
>Why won't something like the attached work as opposed to playing
>register / unregister games?

Because it is quite driver-specific. For example in netdevsim, with
current implementation you don't have to unreg as the priv is there the
whole time.

Also, I think it is more correct to register getter with the priv
directly. Not to depend on getting with via devlink struct somehow.

I'm not saying that my solution is super nice. But what you suggest
seems much more uglier to me.



>diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
>index 93ea56620a24..dcded613faa6 100644
>--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
>+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
>@@ -113,6 +113,7 @@ struct mlxsw_core {
> 	struct mlxsw_thermal *thermal;
> 	struct mlxsw_core_port *ports;
> 	unsigned int max_ports;
>+	bool reload_in_progress;
> 	bool reload_fail;
> 	unsigned long driver_priv[0];
> 	/* driver_priv has to be always the last item */
>@@ -154,6 +155,12 @@ void *mlxsw_core_driver_priv(struct mlxsw_core *mlxsw_core)
> }
> EXPORT_SYMBOL(mlxsw_core_driver_priv);
> 
>+bool mlxsw_core_reload_in_progress(struct mlxsw_core *mlxsw_core)
>+{
>+	return mlxsw_core->mlxsw_core_driver_priv;
>+}
>+EXPORT_SYMBOL(mlxsw_core_reload_in_progress);
>+
> struct mlxsw_rx_listener_item {
> 	struct list_head list;
> 	struct mlxsw_rx_listener rxl;
>@@ -972,12 +979,16 @@ static int mlxsw_devlink_core_bus_device_reload(struct devlink *devlink)
> 	if (!mlxsw_bus->reset)
> 		return -EOPNOTSUPP;
> 
>+	mlxsw_core->reload_in_progress = true;
>+
> 	mlxsw_core_bus_device_unregister(mlxsw_core, true);
> 	mlxsw_bus->reset(mlxsw_core->bus_priv);
> 	err = mlxsw_core_bus_device_register(mlxsw_core->bus_info,
> 					     mlxsw_core->bus,
> 					     mlxsw_core->bus_priv, true,
> 					     devlink);
>+	mlxsw_core->reload_in_progress = false;
>+
> 	if (err)
> 		mlxsw_core->reload_fail = true;
> 	return err;
>diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
>index 092d39399f3c..ffa1db154757 100644
>--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
>+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
>@@ -60,6 +60,7 @@ struct mlxsw_bus_info;
> unsigned int mlxsw_core_max_ports(const struct mlxsw_core *mlxsw_core);
> 
> void *mlxsw_core_driver_priv(struct mlxsw_core *mlxsw_core);
>+bool mlxsw_core_reload_in_progress(struct mlxsw_core *mlxsw_core);
> 
> int mlxsw_core_driver_register(struct mlxsw_driver *mlxsw_driver);
> void mlxsw_core_driver_unregister(struct mlxsw_driver *mlxsw_driver);
>diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
>index 53fffd09d133..09b89af37d8a 100644
>--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
>+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
>@@ -3808,8 +3808,12 @@ static const struct mlxsw_config_profile mlxsw_sp_config_profile = {
> static u64 mlxsw_sp_resource_kvd_linear_occ_get(struct devlink *devlink)
> {
> 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>-	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
>+	struct mlxsw_sp *mlxsw_sp;
>+
>+	if (mlxsw_core_reload_in_progress(mlxsw_core))
>+		return 0;
> 
>+	mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
> 	return mlxsw_sp_kvdl_occ_get(mlxsw_sp);
> }
> 
>diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
>index 8796db44dcc3..dd66285bafb5 100644
>--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
>+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
>@@ -329,9 +329,13 @@ u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp)
> static u64 mlxsw_sp_kvdl_single_occ_get(struct devlink *devlink)
> {
> 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>-	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
> 	struct mlxsw_sp_kvdl_part *part;
>+	struct mlxsw_sp *mlxsw_sp;
> 
>+	if (mlxsw_core_reload_in_progress(mlxsw_core))
>+		return 0;
>+
>+	mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
> 	part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_SINGLE];
> 	return mlxsw_sp_kvdl_part_occ(part);
> }
>@@ -339,8 +343,13 @@ static u64 mlxsw_sp_kvdl_single_occ_get(struct devlink *devlink)
> static u64 mlxsw_sp_kvdl_chunks_occ_get(struct devlink *devlink)
> {
> 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>-	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
> 	struct mlxsw_sp_kvdl_part *part;
>+	struct mlxsw_sp *mlxsw_sp;
>+
>+	if (mlxsw_core_reload_in_progress(mlxsw_core))
>+		return 0;
>+
>+	mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
> 
> 	part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_CHUNKS];
> 	return mlxsw_sp_kvdl_part_occ(part);
>@@ -349,9 +358,13 @@ static u64 mlxsw_sp_kvdl_chunks_occ_get(struct devlink *devlink)
> static u64 mlxsw_sp_kvdl_large_chunks_occ_get(struct devlink *devlink)
> {
> 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>-	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
> 	struct mlxsw_sp_kvdl_part *part;
>+	struct mlxsw_sp *mlxsw_sp;
>+
>+	if (mlxsw_core_reload_in_progress(mlxsw_core))
>+		return 0;
> 
>+	mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
> 	part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_LARGE_CHUNKS];
> 	return mlxsw_sp_kvdl_part_occ(part);
> }

^ permalink raw reply

* [RFC PATCH net-next v5 4/4] netvsc: refactor notifier/event handling code to use the bypass framework
From: Sridhar Samudrala @ 2018-04-05 21:08 UTC (permalink / raw)
  To: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, sridhar.samudrala,
	jasowang, loseweigh, jiri
In-Reply-To: <1522962503-3963-1-git-send-email-sridhar.samudrala@intel.com>

Use the registration/notification framework supported by the generic
bypass infrastructure.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 drivers/net/hyperv/Kconfig      |   1 +
 drivers/net/hyperv/netvsc_drv.c | 219 ++++++++++++----------------------------
 2 files changed, 63 insertions(+), 157 deletions(-)

diff --git a/drivers/net/hyperv/Kconfig b/drivers/net/hyperv/Kconfig
index 936968d23559..cc3a721baa18 100644
--- a/drivers/net/hyperv/Kconfig
+++ b/drivers/net/hyperv/Kconfig
@@ -1,5 +1,6 @@
 config HYPERV_NET
 	tristate "Microsoft Hyper-V virtual network driver"
 	depends on HYPERV
+	depends on MAY_USE_BYPASS
 	help
 	  Select this option to enable the Hyper-V virtual network driver.
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index ecc84954c511..b53a2be99bd2 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -43,6 +43,7 @@
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
+#include <net/bypass.h>
 
 #include "hyperv_net.h"
 
@@ -1763,46 +1764,6 @@ static void netvsc_link_change(struct work_struct *w)
 	rtnl_unlock();
 }
 
-static struct net_device *get_netvsc_bymac(const u8 *mac)
-{
-	struct net_device *dev;
-
-	ASSERT_RTNL();
-
-	for_each_netdev(&init_net, dev) {
-		if (dev->netdev_ops != &device_ops)
-			continue;	/* not a netvsc device */
-
-		if (ether_addr_equal(mac, dev->perm_addr))
-			return dev;
-	}
-
-	return NULL;
-}
-
-static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
-{
-	struct net_device *dev;
-
-	ASSERT_RTNL();
-
-	for_each_netdev(&init_net, dev) {
-		struct net_device_context *net_device_ctx;
-
-		if (dev->netdev_ops != &device_ops)
-			continue;	/* not a netvsc device */
-
-		net_device_ctx = netdev_priv(dev);
-		if (!rtnl_dereference(net_device_ctx->nvdev))
-			continue;	/* device is removed */
-
-		if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev)
-			return dev;	/* a match */
-	}
-
-	return NULL;
-}
-
 /* Called when VF is injecting data into network stack.
  * Change the associated network device from VF to netvsc.
  * note: already called with rcu_read_lock
@@ -1825,43 +1786,19 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
 	return RX_HANDLER_ANOTHER;
 }
 
-static int netvsc_vf_join(struct net_device *vf_netdev,
-			  struct net_device *ndev)
+static int netvsc_vf_join(struct net_device *ndev,
+			  struct net_device *vf_netdev)
 {
 	struct net_device_context *ndev_ctx = netdev_priv(ndev);
-	int ret;
-
-	ret = netdev_rx_handler_register(vf_netdev,
-					 netvsc_vf_handle_frame, ndev);
-	if (ret != 0) {
-		netdev_err(vf_netdev,
-			   "can not register netvsc VF receive handler (err = %d)\n",
-			   ret);
-		goto rx_handler_failed;
-	}
-
-	ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
-	if (ret != 0) {
-		netdev_err(vf_netdev,
-			   "can not set master device %s (err = %d)\n",
-			   ndev->name, ret);
-		goto upper_link_failed;
-	}
-
-	/* set slave flag before open to prevent IPv6 addrconf */
-	vf_netdev->flags |= IFF_SLAVE;
 
 	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
 
-	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
-
 	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
-	return 0;
 
-upper_link_failed:
-	netdev_rx_handler_unregister(vf_netdev);
-rx_handler_failed:
-	return ret;
+	dev_hold(vf_netdev);
+	rcu_assign_pointer(ndev_ctx->vf_netdev, vf_netdev);
+
+	return 0;
 }
 
 static void __netvsc_vf_setup(struct net_device *ndev,
@@ -1914,85 +1851,84 @@ static void netvsc_vf_setup(struct work_struct *w)
 	rtnl_unlock();
 }
 
-static int netvsc_register_vf(struct net_device *vf_netdev)
+static int netvsc_register_vf(struct net_device *ndev,
+			      struct net_device *vf_netdev)
 {
-	struct net_device *ndev;
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device *netvsc_dev;
 
-	if (vf_netdev->addr_len != ETH_ALEN)
-		return NOTIFY_DONE;
-
-	/*
-	 * We will use the MAC address to locate the synthetic interface to
-	 * associate with the VF interface. If we don't find a matching
-	 * synthetic interface, move on.
-	 */
-	ndev = get_netvsc_bymac(vf_netdev->perm_addr);
-	if (!ndev)
-		return NOTIFY_DONE;
-
 	net_device_ctx = netdev_priv(ndev);
 	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
 	if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
-		return NOTIFY_DONE;
-
-	if (netvsc_vf_join(vf_netdev, ndev) != 0)
-		return NOTIFY_DONE;
+		return -EEXIST;
 
 	netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
 
-	dev_hold(vf_netdev);
-	rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
-	return NOTIFY_OK;
+	return 0;
 }
 
 /* VF up/down change detected, schedule to change data path */
-static int netvsc_vf_changed(struct net_device *vf_netdev)
+static int netvsc_vf_changed(struct net_device *ndev,
+			     struct net_device *vf_netdev)
 {
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device *netvsc_dev;
-	struct net_device *ndev;
 	bool vf_is_up = netif_running(vf_netdev);
 
-	ndev = get_netvsc_byref(vf_netdev);
-	if (!ndev)
-		return NOTIFY_DONE;
-
 	net_device_ctx = netdev_priv(ndev);
 	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
 	if (!netvsc_dev)
-		return NOTIFY_DONE;
+		return -EINVAL;
 
 	netvsc_switch_datapath(ndev, vf_is_up);
 	netdev_info(ndev, "Data path switched %s VF: %s\n",
 		    vf_is_up ? "to" : "from", vf_netdev->name);
 
-	return NOTIFY_OK;
+	return 0;
 }
 
-static int netvsc_unregister_vf(struct net_device *vf_netdev)
+static int netvsc_release_vf(struct net_device *ndev,
+			     struct net_device *vf_netdev)
 {
-	struct net_device *ndev;
 	struct net_device_context *net_device_ctx;
 
-	ndev = get_netvsc_byref(vf_netdev);
-	if (!ndev)
-		return NOTIFY_DONE;
-
 	net_device_ctx = netdev_priv(ndev);
+	if (vf_netdev != rtnl_dereference(net_device_ctx->vf_netdev))
+		return -EINVAL;
+
 	cancel_delayed_work_sync(&net_device_ctx->vf_takeover);
 
+	return 0;
+}
+
+static int netvsc_unregister_vf(struct net_device *ndev,
+				struct net_device *vf_netdev)
+{
+	struct net_device_context *net_device_ctx;
+
+	net_device_ctx = netdev_priv(ndev);
+	if (vf_netdev != rtnl_dereference(net_device_ctx->vf_netdev))
+		return -EINVAL;
+
 	netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
 
-	netdev_rx_handler_unregister(vf_netdev);
-	netdev_upper_dev_unlink(vf_netdev, ndev);
 	RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
 	dev_put(vf_netdev);
 
-	return NOTIFY_OK;
+	return 0;
 }
 
+static const struct bypass_ops netvsc_bypass_ops = {
+	.register_child		= netvsc_register_vf,
+	.join_child		= netvsc_vf_join,
+	.unregister_child	= netvsc_unregister_vf,
+	.release_child		= netvsc_release_vf,
+	.update_link		= netvsc_vf_changed,
+	.handle_frame		= netvsc_vf_handle_frame,
+};
+
+static struct bypass *netvsc_bypass;
+
 static int netvsc_probe(struct hv_device *dev,
 			const struct hv_vmbus_device_id *dev_id)
 {
@@ -2082,8 +2018,14 @@ static int netvsc_probe(struct hv_device *dev,
 		goto register_failed;
 	}
 
+	ret = bypass_register_instance(netvsc_bypass, net);
+	if (ret != 0)
+		goto err_bypass;
+
 	return ret;
 
+err_bypass:
+	unregister_netdev(net);
 register_failed:
 	rndis_filter_device_remove(dev, nvdev);
 rndis_failed:
@@ -2124,13 +2066,15 @@ static int netvsc_remove(struct hv_device *dev)
 	rtnl_lock();
 	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
 	if (vf_netdev)
-		netvsc_unregister_vf(vf_netdev);
+		bypass_unregister_child(vf_netdev);
 
 	if (nvdev)
 		rndis_filter_device_remove(dev, nvdev);
 
 	unregister_netdevice(net);
 
+	bypass_unregister_instance(netvsc_bypass, net);
+
 	rtnl_unlock();
 	rcu_read_unlock();
 
@@ -2157,61 +2101,21 @@ static struct  hv_driver netvsc_drv = {
 	.remove = netvsc_remove,
 };
 
-/*
- * On Hyper-V, every VF interface is matched with a corresponding
- * synthetic interface. The synthetic interface is presented first
- * to the guest. When the corresponding VF instance is registered,
- * we will take care of switching the data path.
- */
-static int netvsc_netdev_event(struct notifier_block *this,
-			       unsigned long event, void *ptr)
-{
-	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
-
-	/* Skip our own events */
-	if (event_dev->netdev_ops == &device_ops)
-		return NOTIFY_DONE;
-
-	/* Avoid non-Ethernet type devices */
-	if (event_dev->type != ARPHRD_ETHER)
-		return NOTIFY_DONE;
-
-	/* Avoid Vlan dev with same MAC registering as VF */
-	if (is_vlan_dev(event_dev))
-		return NOTIFY_DONE;
-
-	/* Avoid Bonding master dev with same MAC registering as VF */
-	if ((event_dev->priv_flags & IFF_BONDING) &&
-	    (event_dev->flags & IFF_MASTER))
-		return NOTIFY_DONE;
-
-	switch (event) {
-	case NETDEV_REGISTER:
-		return netvsc_register_vf(event_dev);
-	case NETDEV_UNREGISTER:
-		return netvsc_unregister_vf(event_dev);
-	case NETDEV_UP:
-	case NETDEV_DOWN:
-		return netvsc_vf_changed(event_dev);
-	default:
-		return NOTIFY_DONE;
-	}
-}
-
-static struct notifier_block netvsc_netdev_notifier = {
-	.notifier_call = netvsc_netdev_event,
-};
-
 static void __exit netvsc_drv_exit(void)
 {
-	unregister_netdevice_notifier(&netvsc_netdev_notifier);
 	vmbus_driver_unregister(&netvsc_drv);
+	bypass_unregister_driver(netvsc_bypass);
 }
 
 static int __init netvsc_drv_init(void)
 {
 	int ret;
 
+	netvsc_bypass = bypass_register_driver(&netvsc_bypass_ops,
+					       &device_ops);
+	if (!netvsc_bypass)
+		return -ENOMEM;
+
 	if (ring_size < RING_SIZE_MIN) {
 		ring_size = RING_SIZE_MIN;
 		pr_info("Increased ring_size to %u (min allowed)\n",
@@ -2221,10 +2125,11 @@ static int __init netvsc_drv_init(void)
 	netvsc_ring_reciprocal = reciprocal_value(netvsc_ring_bytes);
 
 	ret = vmbus_driver_register(&netvsc_drv);
-	if (ret)
+	if (ret) {
+		bypass_unregister_driver(netvsc_bypass);
 		return ret;
+	}
 
-	register_netdevice_notifier(&netvsc_netdev_notifier);
 	return 0;
 }
 
-- 
2.14.3

^ permalink raw reply related

* [RFC PATCH net-next v5 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Sridhar Samudrala @ 2018-04-05 21:08 UTC (permalink / raw)
  To: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, sridhar.samudrala,
	jasowang, loseweigh, jiri
In-Reply-To: <1522962503-3963-1-git-send-email-sridhar.samudrala@intel.com>

This patch enables virtio_net to switch over to a VF datapath when a VF
netdev is present with the same MAC address. It allows live migration
of a VM with a direct attached VF without the need to setup a bond/team
between a VF and virtio net device in the guest.

The hypervisor needs to enable only one datapath at any time so that
packets don't get looped back to the VM over the other datapath. When a VF
is plugged, the virtio datapath link state can be marked as down. The
hypervisor needs to unplug the VF device from the guest on the source host
and reset the MAC filter of the VF to initiate failover of datapath to
virtio before starting the migration. After the migration is completed,
the destination hypervisor sets the MAC filter on the VF and plugs it back
to the guest to switch over to VF datapath.

When BACKUP feature is enabled, an additional netdev(bypass netdev) is
created that acts as a master device and tracks the state of the 2 lower
netdevs. The original virtio_net netdev is marked as 'backup' netdev and a
passthru device with the same MAC is registered as 'active' netdev.

This patch is based on the discussion initiated by Jesse on this thread.
https://marc.info/?l=linux-virtualization&m=151189725224231&w=2

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 drivers/net/Kconfig      |   1 +
 drivers/net/virtio_net.c | 612 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 612 insertions(+), 1 deletion(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 891846655000..9e2cf61fd1c1 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -331,6 +331,7 @@ config VETH
 config VIRTIO_NET
 	tristate "Virtio network driver"
 	depends on VIRTIO
+	depends on MAY_USE_BYPASS
 	---help---
 	  This is the virtual network driver for virtio.  It can be used with
 	  QEMU based VMMs (like KVM or Xen).  Say Y or M.
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index befb5944f3fd..86b2f8f2947d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -30,8 +30,11 @@
 #include <linux/cpu.h>
 #include <linux/average.h>
 #include <linux/filter.h>
+#include <linux/netdevice.h>
+#include <linux/pci.h>
 #include <net/route.h>
 #include <net/xdp.h>
+#include <net/bypass.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -206,6 +209,9 @@ struct virtnet_info {
 	u32 speed;
 
 	unsigned long guest_offloads;
+
+	/* upper netdev created when BACKUP feature enabled */
+	struct net_device __rcu *bypass_netdev;
 };
 
 struct padded_vnet_hdr {
@@ -2275,6 +2281,22 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
+static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
+				      size_t len)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	int ret;
+
+	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
+		return -EOPNOTSUPP;
+
+	ret = snprintf(buf, len, "_bkup");
+	if (ret >= len)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
 static const struct net_device_ops virtnet_netdev = {
 	.ndo_open            = virtnet_open,
 	.ndo_stop   	     = virtnet_close,
@@ -2292,6 +2314,7 @@ static const struct net_device_ops virtnet_netdev = {
 	.ndo_xdp_xmit		= virtnet_xdp_xmit,
 	.ndo_xdp_flush		= virtnet_xdp_flush,
 	.ndo_features_check	= passthru_features_check,
+	.ndo_get_phys_port_name	= virtnet_get_phys_port_name,
 };
 
 static void virtnet_config_changed_work(struct work_struct *work)
@@ -2689,6 +2712,576 @@ static int virtnet_validate(struct virtio_device *vdev)
 	return 0;
 }
 
+/* START of functions supporting VIRTIO_NET_F_BACKUP feature.
+ * When BACKUP feature is enabled, an additional netdev(bypass netdev)
+ * is created that acts as a master device and tracks the state of the
+ * 2 lower netdevs. The original virtio_net netdev is registered as
+ * 'backup' netdev and a passthru device with the same MAC is registered
+ * as 'active' netdev.
+ */
+
+/* bypass state maintained when BACKUP feature is enabled */
+struct virtnet_bypass_info {
+	/* passthru netdev with same MAC */
+	struct net_device __rcu *active_netdev;
+
+	/* virtio_net netdev */
+	struct net_device __rcu *backup_netdev;
+
+	/* active netdev stats */
+	struct rtnl_link_stats64 active_stats;
+
+	/* backup netdev stats */
+	struct rtnl_link_stats64 backup_stats;
+
+	/* aggregated stats */
+	struct rtnl_link_stats64 bypass_stats;
+
+	/* spinlock while updating stats */
+	spinlock_t stats_lock;
+};
+
+static int virtnet_bypass_open(struct net_device *dev)
+{
+	struct virtnet_bypass_info *vbi = netdev_priv(dev);
+	struct net_device *active_netdev, *backup_netdev;
+	int err;
+
+	netif_carrier_off(dev);
+	netif_tx_wake_all_queues(dev);
+
+	active_netdev = rtnl_dereference(vbi->active_netdev);
+	if (active_netdev) {
+		err = dev_open(active_netdev);
+		if (err)
+			goto err_active_open;
+	}
+
+	backup_netdev = rtnl_dereference(vbi->backup_netdev);
+	if (backup_netdev) {
+		err = dev_open(backup_netdev);
+		if (err)
+			goto err_backup_open;
+	}
+
+	return 0;
+
+err_backup_open:
+	dev_close(active_netdev);
+err_active_open:
+	netif_tx_disable(dev);
+	return err;
+}
+
+static int virtnet_bypass_close(struct net_device *dev)
+{
+	struct virtnet_bypass_info *vi = netdev_priv(dev);
+	struct net_device *child_netdev;
+
+	netif_tx_disable(dev);
+
+	child_netdev = rtnl_dereference(vi->active_netdev);
+	if (child_netdev)
+		dev_close(child_netdev);
+
+	child_netdev = rtnl_dereference(vi->backup_netdev);
+	if (child_netdev)
+		dev_close(child_netdev);
+
+	return 0;
+}
+
+static netdev_tx_t virtnet_bypass_drop_xmit(struct sk_buff *skb,
+					    struct net_device *dev)
+{
+	atomic_long_inc(&dev->tx_dropped);
+	dev_kfree_skb_any(skb);
+	return NETDEV_TX_OK;
+}
+
+static bool virtnet_bypass_xmit_ready(struct net_device *dev)
+{
+	return netif_running(dev) && netif_carrier_ok(dev);
+}
+
+static netdev_tx_t virtnet_bypass_start_xmit(struct sk_buff *skb,
+					     struct net_device *dev)
+{
+	struct virtnet_bypass_info *vbi = netdev_priv(dev);
+	struct net_device *xmit_dev;
+
+	/* Try xmit via active netdev followed by backup netdev */
+	xmit_dev = rcu_dereference_bh(vbi->active_netdev);
+	if (!xmit_dev || !virtnet_bypass_xmit_ready(xmit_dev)) {
+		xmit_dev = rcu_dereference_bh(vbi->backup_netdev);
+		if (!xmit_dev || !virtnet_bypass_xmit_ready(xmit_dev))
+			return virtnet_bypass_drop_xmit(skb, dev);
+	}
+
+	skb->dev = xmit_dev;
+	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+
+	return dev_queue_xmit(skb);
+}
+
+static u16 virtnet_bypass_select_queue(struct net_device *dev,
+				       struct sk_buff *skb, void *accel_priv,
+				       select_queue_fallback_t fallback)
+{
+	/* This helper function exists to help dev_pick_tx get the correct
+	 * destination queue.  Using a helper function skips a call to
+	 * skb_tx_hash and will put the skbs in the queue we expect on their
+	 * way down to the bonding driver.
+	 */
+	u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
+
+	/* Save the original txq to restore before passing to the driver */
+	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+
+	if (unlikely(txq >= dev->real_num_tx_queues)) {
+		do {
+			txq -= dev->real_num_tx_queues;
+		} while (txq >= dev->real_num_tx_queues);
+	}
+
+	return txq;
+}
+
+/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
+ * that some drivers can provide 32bit values only.
+ */
+static void virtnet_bypass_fold_stats(struct rtnl_link_stats64 *_res,
+				      const struct rtnl_link_stats64 *_new,
+				      const struct rtnl_link_stats64 *_old)
+{
+	const u64 *new = (const u64 *)_new;
+	const u64 *old = (const u64 *)_old;
+	u64 *res = (u64 *)_res;
+	int i;
+
+	for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
+		u64 nv = new[i];
+		u64 ov = old[i];
+		s64 delta = nv - ov;
+
+		/* detects if this particular field is 32bit only */
+		if (((nv | ov) >> 32) == 0)
+			delta = (s64)(s32)((u32)nv - (u32)ov);
+
+		/* filter anomalies, some drivers reset their stats
+		 * at down/up events.
+		 */
+		if (delta > 0)
+			res[i] += delta;
+	}
+}
+
+static void virtnet_bypass_get_stats(struct net_device *dev,
+				     struct rtnl_link_stats64 *stats)
+{
+	struct virtnet_bypass_info *vbi = netdev_priv(dev);
+	const struct rtnl_link_stats64 *new;
+	struct rtnl_link_stats64 temp;
+	struct net_device *child_netdev;
+
+	spin_lock(&vbi->stats_lock);
+	memcpy(stats, &vbi->bypass_stats, sizeof(*stats));
+
+	rcu_read_lock();
+
+	child_netdev = rcu_dereference(vbi->active_netdev);
+	if (child_netdev) {
+		new = dev_get_stats(child_netdev, &temp);
+		virtnet_bypass_fold_stats(stats, new, &vbi->active_stats);
+		memcpy(&vbi->active_stats, new, sizeof(*new));
+	}
+
+	child_netdev = rcu_dereference(vbi->backup_netdev);
+	if (child_netdev) {
+		new = dev_get_stats(child_netdev, &temp);
+		virtnet_bypass_fold_stats(stats, new, &vbi->backup_stats);
+		memcpy(&vbi->backup_stats, new, sizeof(*new));
+	}
+
+	rcu_read_unlock();
+
+	memcpy(&vbi->bypass_stats, stats, sizeof(*stats));
+	spin_unlock(&vbi->stats_lock);
+}
+
+static int virtnet_bypass_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct virtnet_bypass_info *vbi = netdev_priv(dev);
+	struct net_device *active_netdev, *backup_netdev;
+	int ret = 0;
+
+	active_netdev = rcu_dereference(vbi->active_netdev);
+	if (active_netdev) {
+		ret = dev_set_mtu(active_netdev, new_mtu);
+		if (ret)
+			return ret;
+	}
+
+	backup_netdev = rcu_dereference(vbi->backup_netdev);
+	if (backup_netdev) {
+		ret = dev_set_mtu(backup_netdev, new_mtu);
+		if (ret) {
+			dev_set_mtu(active_netdev, dev->mtu);
+			return ret;
+		}
+	}
+
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+static void virtnet_bypass_set_rx_mode(struct net_device *dev)
+{
+	struct virtnet_bypass_info *vbi = netdev_priv(dev);
+	struct net_device *child_netdev;
+
+	rcu_read_lock();
+
+	child_netdev = rcu_dereference(vbi->active_netdev);
+	if (child_netdev) {
+		dev_uc_sync_multiple(child_netdev, dev);
+		dev_mc_sync_multiple(child_netdev, dev);
+	}
+
+	child_netdev = rcu_dereference(vbi->backup_netdev);
+	if (child_netdev) {
+		dev_uc_sync_multiple(child_netdev, dev);
+		dev_mc_sync_multiple(child_netdev, dev);
+	}
+
+	rcu_read_unlock();
+}
+
+static const struct net_device_ops virtnet_bypass_netdev_ops = {
+	.ndo_open		= virtnet_bypass_open,
+	.ndo_stop		= virtnet_bypass_close,
+	.ndo_start_xmit		= virtnet_bypass_start_xmit,
+	.ndo_select_queue	= virtnet_bypass_select_queue,
+	.ndo_get_stats64	= virtnet_bypass_get_stats,
+	.ndo_change_mtu		= virtnet_bypass_change_mtu,
+	.ndo_set_rx_mode	= virtnet_bypass_set_rx_mode,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_features_check	= passthru_features_check,
+};
+
+static int
+virtnet_bypass_ethtool_get_link_ksettings(struct net_device *dev,
+					  struct ethtool_link_ksettings *cmd)
+{
+	struct virtnet_bypass_info *vbi = netdev_priv(dev);
+	struct net_device *child_netdev;
+
+	child_netdev = rtnl_dereference(vbi->active_netdev);
+	if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
+		child_netdev = rtnl_dereference(vbi->backup_netdev);
+		if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
+			cmd->base.duplex = DUPLEX_UNKNOWN;
+			cmd->base.port = PORT_OTHER;
+			cmd->base.speed = SPEED_UNKNOWN;
+
+			return 0;
+		}
+	}
+
+	return __ethtool_get_link_ksettings(child_netdev, cmd);
+}
+
+#define BYPASS_DRV_NAME "virtnet_bypass"
+#define BYPASS_DRV_VERSION "0.1"
+
+static void virtnet_bypass_ethtool_get_drvinfo(struct net_device *dev,
+					       struct ethtool_drvinfo *drvinfo)
+{
+	strlcpy(drvinfo->driver, BYPASS_DRV_NAME, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, BYPASS_DRV_VERSION, sizeof(drvinfo->version));
+}
+
+static const struct ethtool_ops virtnet_bypass_ethtool_ops = {
+	.get_drvinfo            = virtnet_bypass_ethtool_get_drvinfo,
+	.get_link               = ethtool_op_get_link,
+	.get_link_ksettings     = virtnet_bypass_ethtool_get_link_ksettings,
+};
+
+static int virtnet_bypass_join_child(struct net_device *bypass_netdev,
+				     struct net_device *child_netdev)
+{
+	struct virtnet_bypass_info *vbi;
+	bool backup;
+
+	vbi = netdev_priv(bypass_netdev);
+	backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
+	if (backup ? rtnl_dereference(vbi->backup_netdev) :
+			rtnl_dereference(vbi->active_netdev)) {
+		netdev_info(bypass_netdev,
+			    "%s attempting to join bypass dev when %s already present\n",
+			    child_netdev->name, backup ? "backup" : "active");
+		return -EEXIST;
+	}
+
+	dev_hold(child_netdev);
+
+	if (backup) {
+		rcu_assign_pointer(vbi->backup_netdev, child_netdev);
+		dev_get_stats(vbi->backup_netdev, &vbi->backup_stats);
+	} else {
+		rcu_assign_pointer(vbi->active_netdev, child_netdev);
+		dev_get_stats(vbi->active_netdev, &vbi->active_stats);
+		bypass_netdev->min_mtu = child_netdev->min_mtu;
+		bypass_netdev->max_mtu = child_netdev->max_mtu;
+	}
+
+	netdev_info(bypass_netdev, "child:%s joined\n", child_netdev->name);
+
+	return 0;
+}
+
+static int virtnet_bypass_register_child(struct net_device *bypass_netdev,
+					 struct net_device *child_netdev)
+{
+	struct virtnet_bypass_info *vbi;
+	bool backup;
+
+	vbi = netdev_priv(bypass_netdev);
+	backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
+	if (backup ? rtnl_dereference(vbi->backup_netdev) :
+			rtnl_dereference(vbi->active_netdev)) {
+		netdev_info(bypass_netdev,
+			    "%s attempting to register bypass dev when %s already present\n",
+			    child_netdev->name, backup ? "backup" : "active");
+		return -EEXIST;
+	}
+
+	/* Avoid non pci devices as active netdev */
+	if (!backup && (!child_netdev->dev.parent ||
+			!dev_is_pci(child_netdev->dev.parent)))
+		return -EINVAL;
+
+	netdev_info(bypass_netdev, "child:%s registered\n", child_netdev->name);
+
+	return 0;
+}
+
+static int virtnet_bypass_release_child(struct net_device *bypass_netdev,
+					struct net_device *child_netdev)
+{
+	struct net_device *backup_netdev, *active_netdev;
+	struct virtnet_bypass_info *vbi;
+
+	vbi = netdev_priv(bypass_netdev);
+	active_netdev = rtnl_dereference(vbi->active_netdev);
+	backup_netdev = rtnl_dereference(vbi->backup_netdev);
+
+	if (child_netdev != active_netdev && child_netdev != backup_netdev)
+		return -EINVAL;
+
+	netdev_info(bypass_netdev, "child:%s released\n", child_netdev->name);
+
+	return 0;
+}
+
+static int virtnet_bypass_unregister_child(struct net_device *bypass_netdev,
+					   struct net_device *child_netdev)
+{
+	struct net_device *backup_netdev, *active_netdev;
+	struct virtnet_bypass_info *vbi;
+
+	vbi = netdev_priv(bypass_netdev);
+	active_netdev = rtnl_dereference(vbi->active_netdev);
+	backup_netdev = rtnl_dereference(vbi->backup_netdev);
+
+	if (child_netdev != active_netdev && child_netdev != backup_netdev)
+		return -EINVAL;
+
+	if (child_netdev == backup_netdev) {
+		RCU_INIT_POINTER(vbi->backup_netdev, NULL);
+	} else {
+		RCU_INIT_POINTER(vbi->active_netdev, NULL);
+		if (backup_netdev) {
+			bypass_netdev->min_mtu = backup_netdev->min_mtu;
+			bypass_netdev->max_mtu = backup_netdev->max_mtu;
+		}
+	}
+
+	dev_put(child_netdev);
+
+	netdev_info(bypass_netdev, "child:%s unregistered\n",
+		    child_netdev->name);
+
+	return 0;
+}
+
+static int virtnet_bypass_update_link(struct net_device *bypass_netdev,
+				      struct net_device *child_netdev)
+{
+	struct net_device *active_netdev, *backup_netdev;
+	struct virtnet_bypass_info *vbi;
+
+	if (!netif_running(bypass_netdev))
+		return 0;
+
+	vbi = netdev_priv(bypass_netdev);
+
+	active_netdev = rtnl_dereference(vbi->active_netdev);
+	backup_netdev = rtnl_dereference(vbi->backup_netdev);
+
+	if (child_netdev != active_netdev && child_netdev != backup_netdev)
+		return -EINVAL;
+
+	if ((active_netdev && virtnet_bypass_xmit_ready(active_netdev)) ||
+	    (backup_netdev && virtnet_bypass_xmit_ready(backup_netdev))) {
+		netif_carrier_on(bypass_netdev);
+		netif_tx_wake_all_queues(bypass_netdev);
+	} else {
+		netif_carrier_off(bypass_netdev);
+		netif_tx_stop_all_queues(bypass_netdev);
+	}
+
+	return 0;
+}
+
+/* Called when child dev is injecting data into network stack.
+ * Change the associated network device from lower dev to virtio.
+ * note: already called with rcu_read_lock
+ */
+static rx_handler_result_t virtnet_bypass_handle_frame(struct sk_buff **pskb)
+{
+	struct sk_buff *skb = *pskb;
+	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
+
+	skb->dev = ndev;
+
+	return RX_HANDLER_ANOTHER;
+}
+
+static const struct bypass_ops virtnet_bypass_ops = {
+	.register_child		= virtnet_bypass_register_child,
+	.join_child		= virtnet_bypass_join_child,
+	.unregister_child	= virtnet_bypass_unregister_child,
+	.release_child		= virtnet_bypass_release_child,
+	.update_link		= virtnet_bypass_update_link,
+	.handle_frame		= virtnet_bypass_handle_frame,
+};
+
+static struct bypass *virtnet_bypass;
+
+static int virtnet_bypass_create(struct virtnet_info *vi)
+{
+	struct net_device *backup_netdev = vi->dev;
+	struct device *dev = &vi->vdev->dev;
+	struct net_device *bypass_netdev;
+	int res;
+
+	/* Alloc at least 2 queues, for now we are going with 16 assuming
+	 * that most devices being bonded won't have too many queues.
+	 */
+	bypass_netdev = alloc_etherdev_mq(sizeof(struct virtnet_bypass_info),
+					  16);
+	if (!bypass_netdev) {
+		dev_err(dev, "Unable to allocate bypass_netdev!\n");
+		return -ENOMEM;
+	}
+
+	dev_net_set(bypass_netdev, dev_net(backup_netdev));
+	SET_NETDEV_DEV(bypass_netdev, dev);
+
+	bypass_netdev->netdev_ops = &virtnet_bypass_netdev_ops;
+	bypass_netdev->ethtool_ops = &virtnet_bypass_ethtool_ops;
+
+	/* Initialize the device options */
+	bypass_netdev->flags |= IFF_MASTER;
+	bypass_netdev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
+	bypass_netdev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
+				       IFF_TX_SKB_SHARING);
+
+	/* don't acquire bypass netdev's netif_tx_lock when transmitting */
+	bypass_netdev->features |= NETIF_F_LLTX;
+
+	/* Don't allow bypass devices to change network namespaces. */
+	bypass_netdev->features |= NETIF_F_NETNS_LOCAL;
+
+	bypass_netdev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
+				     NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
+				     NETIF_F_HIGHDMA | NETIF_F_LRO;
+
+	bypass_netdev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
+	bypass_netdev->features |= bypass_netdev->hw_features;
+
+	/* For now treat bypass netdev as VLAN challenged since we
+	 * cannot assume VLAN functionality with a VF
+	 */
+	bypass_netdev->features |= NETIF_F_VLAN_CHALLENGED;
+
+	memcpy(bypass_netdev->dev_addr, backup_netdev->dev_addr,
+	       bypass_netdev->addr_len);
+
+	bypass_netdev->min_mtu = backup_netdev->min_mtu;
+	bypass_netdev->max_mtu = backup_netdev->max_mtu;
+
+	res = register_netdev(bypass_netdev);
+	if (res < 0) {
+		dev_err(dev, "Unable to register bypass_netdev!\n");
+		goto err_register_netdev;
+	}
+
+	netif_carrier_off(bypass_netdev);
+
+	res = bypass_register_instance(virtnet_bypass, bypass_netdev);
+	if (res < 0)
+		goto err_bypass;
+
+	rcu_assign_pointer(vi->bypass_netdev, bypass_netdev);
+
+	return 0;
+
+err_bypass:
+	unregister_netdev(bypass_netdev);
+err_register_netdev:
+	free_netdev(bypass_netdev);
+
+	return res;
+}
+
+static void virtnet_bypass_destroy(struct virtnet_info *vi)
+{
+	struct net_device *bypass_netdev;
+	struct virtnet_bypass_info *vbi;
+	struct net_device *child_netdev;
+
+	bypass_netdev = rcu_dereference(vi->bypass_netdev);
+	/* no device found, nothing to free */
+	if (!bypass_netdev)
+		return;
+
+	vbi = netdev_priv(bypass_netdev);
+
+	netif_device_detach(bypass_netdev);
+
+	rtnl_lock();
+
+	child_netdev = rtnl_dereference(vbi->active_netdev);
+	if (child_netdev)
+		bypass_unregister_child(child_netdev);
+
+	child_netdev = rtnl_dereference(vbi->backup_netdev);
+	if (child_netdev)
+		bypass_unregister_child(child_netdev);
+
+	unregister_netdevice(bypass_netdev);
+
+	bypass_unregister_instance(virtnet_bypass, bypass_netdev);
+
+	rtnl_unlock();
+
+	free_netdev(bypass_netdev);
+}
+
+/* END of functions supporting VIRTIO_NET_F_BACKUP feature. */
+
 static int virtnet_probe(struct virtio_device *vdev)
 {
 	int i, err = -ENOMEM;
@@ -2839,10 +3432,15 @@ static int virtnet_probe(struct virtio_device *vdev)
 
 	virtnet_init_settings(dev);
 
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_BACKUP)) {
+		if (virtnet_bypass_create(vi) != 0)
+			goto free_vqs;
+	}
+
 	err = register_netdev(dev);
 	if (err) {
 		pr_debug("virtio_net: registering device failed\n");
-		goto free_vqs;
+		goto free_bypass;
 	}
 
 	virtio_device_ready(vdev);
@@ -2879,6 +3477,8 @@ static int virtnet_probe(struct virtio_device *vdev)
 	vi->vdev->config->reset(vdev);
 
 	unregister_netdev(dev);
+free_bypass:
+	virtnet_bypass_destroy(vi);
 free_vqs:
 	cancel_delayed_work_sync(&vi->refill);
 	free_receive_page_frags(vi);
@@ -2913,6 +3513,8 @@ static void virtnet_remove(struct virtio_device *vdev)
 
 	unregister_netdev(vi->dev);
 
+	virtnet_bypass_destroy(vi);
+
 	remove_vq_common(vi);
 
 	free_netdev(vi->dev);
@@ -2996,6 +3598,11 @@ static __init int virtio_net_driver_init(void)
 {
 	int ret;
 
+	virtnet_bypass = bypass_register_driver(&virtnet_bypass_ops,
+						&virtnet_bypass_netdev_ops);
+	if (!virtnet_bypass)
+		return -ENOMEM;
+
 	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
 				      virtnet_cpu_online,
 				      virtnet_cpu_down_prep);
@@ -3010,12 +3617,14 @@ static __init int virtio_net_driver_init(void)
         ret = register_virtio_driver(&virtio_net_driver);
 	if (ret)
 		goto err_virtio;
+
 	return 0;
 err_virtio:
 	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
 err_dead:
 	cpuhp_remove_multi_state(virtionet_online);
 out:
+	bypass_unregister_driver(virtnet_bypass);
 	return ret;
 }
 module_init(virtio_net_driver_init);
@@ -3025,6 +3634,7 @@ static __exit void virtio_net_driver_exit(void)
 	unregister_virtio_driver(&virtio_net_driver);
 	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
 	cpuhp_remove_multi_state(virtionet_online);
+	bypass_unregister_driver(virtnet_bypass);
 }
 module_exit(virtio_net_driver_exit);
 
-- 
2.14.3

^ permalink raw reply related

* [RFC PATCH net-next v5 1/4] virtio_net: Introduce VIRTIO_NET_F_BACKUP feature bit
From: Sridhar Samudrala @ 2018-04-05 21:08 UTC (permalink / raw)
  To: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, sridhar.samudrala,
	jasowang, loseweigh, jiri
In-Reply-To: <1522962503-3963-1-git-send-email-sridhar.samudrala@intel.com>

This feature bit can be used by hypervisor to indicate virtio_net device to
act as a backup for another device with the same MAC address.

VIRTIO_NET_F_BACKUP is defined as bit 62 as it is a device feature bit.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 drivers/net/virtio_net.c        | 2 +-
 include/uapi/linux/virtio_net.h | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7b187ec7411e..befb5944f3fd 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2962,7 +2962,7 @@ static struct virtio_device_id id_table[] = {
 	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
 	VIRTIO_NET_F_CTRL_MAC_ADDR, \
 	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
-	VIRTIO_NET_F_SPEED_DUPLEX
+	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_BACKUP
 
 static unsigned int features[] = {
 	VIRTNET_FEATURES,
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 5de6ed37695b..c7c35fd1a5ed 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -57,6 +57,9 @@
 					 * Steering */
 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23	/* Set MAC address */
 
+#define VIRTIO_NET_F_BACKUP	  62	/* Act as backup for another device
+					 * with the same MAC.
+					 */
 #define VIRTIO_NET_F_SPEED_DUPLEX 63	/* Device set linkspeed and duplex */
 
 #ifndef VIRTIO_NET_NO_LEGACY
-- 
2.14.3

^ permalink raw reply related

* [RFC PATCH net-next v5 2/4] net: Introduce generic bypass module
From: Sridhar Samudrala @ 2018-04-05 21:08 UTC (permalink / raw)
  To: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, sridhar.samudrala,
	jasowang, loseweigh, jiri
In-Reply-To: <1522962503-3963-1-git-send-email-sridhar.samudrala@intel.com>

This provides a generic interface for paravirtual drivers to listen
for netdev register/unregister/link change events from pci ethernet
devices with the same MAC and takeover their datapath. The notifier and
event handling code is based on the existing netvsc implementation. A
paravirtual driver can use this module by registering a set of ops and
each instance of the device when it is probed.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 include/net/bypass.h |  80 ++++++++++
 net/Kconfig          |  18 +++
 net/core/Makefile    |   1 +
 net/core/bypass.c    | 406 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 505 insertions(+)
 create mode 100644 include/net/bypass.h
 create mode 100644 net/core/bypass.c

diff --git a/include/net/bypass.h b/include/net/bypass.h
new file mode 100644
index 000000000000..e2dd122f951a
--- /dev/null
+++ b/include/net/bypass.h
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+#ifndef _NET_BYPASS_H
+#define _NET_BYPASS_H
+
+#include <linux/netdevice.h>
+
+struct bypass_ops {
+	int (*register_child)(struct net_device *bypass_netdev,
+			      struct net_device *child_netdev);
+	int (*join_child)(struct net_device *bypass_netdev,
+			  struct net_device *child_netdev);
+	int (*unregister_child)(struct net_device *bypass_netdev,
+				struct net_device *child_netdev);
+	int (*release_child)(struct net_device *bypass_netdev,
+			     struct net_device *child_netdev);
+	int (*update_link)(struct net_device *bypass_netdev,
+			   struct net_device *child_netdev);
+	rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
+};
+
+struct bypass_instance {
+	struct list_head list;
+	struct net_device __rcu *bypass_netdev;
+	struct bypass *bypass;
+};
+
+struct bypass {
+	struct list_head list;
+	const struct bypass_ops *ops;
+	const struct net_device_ops *netdev_ops;
+	struct list_head instance_list;
+	struct mutex lock;
+};
+
+#if IS_ENABLED(CONFIG_NET_BYPASS)
+
+struct bypass *bypass_register_driver(const struct bypass_ops *ops,
+				      const struct net_device_ops *netdev_ops);
+void bypass_unregister_driver(struct bypass *bypass);
+
+int bypass_register_instance(struct bypass *bypass, struct net_device *dev);
+int bypass_unregister_instance(struct bypass *bypass, struct net_device	*dev);
+
+int bypass_unregister_child(struct net_device *child_netdev);
+
+#else
+
+static inline
+struct bypass *bypass_register_driver(const struct bypass_ops *ops,
+				      const struct net_device_ops *netdev_ops)
+{
+	return NULL;
+}
+
+static inline void bypass_unregister_driver(struct bypass *bypass)
+{
+}
+
+static inline int bypass_register_instance(struct bypass *bypass,
+					   struct net_device *dev)
+{
+	return 0;
+}
+
+static inline int bypass_unregister_instance(struct bypass *bypass,
+					     struct net_device *dev)
+{
+	return 0;
+}
+
+static inline int bypass_unregister_child(struct net_device *child_netdev)
+{
+	return 0;
+}
+
+#endif
+
+#endif /* _NET_BYPASS_H */
diff --git a/net/Kconfig b/net/Kconfig
index 0428f12c25c2..994445f4a96a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -423,6 +423,24 @@ config MAY_USE_DEVLINK
 	  on MAY_USE_DEVLINK to ensure they do not cause link errors when
 	  devlink is a loadable module and the driver using it is built-in.
 
+config NET_BYPASS
+	tristate "Bypass interface"
+	---help---
+	  This provides a generic interface for paravirtual drivers to listen
+	  for netdev register/unregister/link change events from pci ethernet
+	  devices with the same MAC and takeover their datapath. This also
+	  enables live migration of a VM with direct attached VF by failing
+	  over to the paravirtual datapath when the VF is unplugged.
+
+config MAY_USE_BYPASS
+	tristate
+	default m if NET_BYPASS=m
+	default y if NET_BYPASS=y || NET_BYPASS=n
+	help
+	  Drivers using the bypass infrastructure should have a dependency
+	  on MAY_USE_BYPASS to ensure they do not cause link errors when
+	  bypass is a loadable module and the driver using it is built-in.
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF JIT compiler plus which flavour.
diff --git a/net/core/Makefile b/net/core/Makefile
index 6dbbba8c57ae..a9727ed1c8fc 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -30,3 +30,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
 obj-$(CONFIG_GRO_CELLS) += gro_cells.o
+obj-$(CONFIG_NET_BYPASS) += bypass.o
diff --git a/net/core/bypass.c b/net/core/bypass.c
new file mode 100644
index 000000000000..7bde962ec3d4
--- /dev/null
+++ b/net/core/bypass.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* A common module to handle registrations and notifications for paravirtual
+ * drivers to enable accelerated datapath and support VF live migration.
+ *
+ * The notifier and event handling code is based on netvsc driver.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/netpoll.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <net/sch_generic.h>
+#include <uapi/linux/if_arp.h>
+#include <net/bypass.h>
+
+static LIST_HEAD(bypass_list);
+
+static DEFINE_MUTEX(bypass_mutex);
+
+struct bypass_instance *bypass_instance_alloc(struct net_device *dev)
+{
+	struct bypass_instance *bypass_instance;
+
+	bypass_instance = kzalloc(sizeof(*bypass_instance), GFP_KERNEL);
+	if (!bypass_instance)
+		return NULL;
+
+	dev_hold(dev);
+	rcu_assign_pointer(bypass_instance->bypass_netdev, dev);
+
+	return bypass_instance;
+}
+
+void bypass_instance_free(struct bypass_instance *bypass_instance)
+{
+	struct net_device *bypass_netdev;
+
+	bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
+
+	dev_put(bypass_netdev);
+	kfree(bypass_instance);
+}
+
+static struct bypass_instance *bypass_get_instance_bymac(u8 *mac)
+{
+	struct bypass_instance *bypass_instance;
+	struct net_device *bypass_netdev;
+	struct bypass *bypass;
+
+	list_for_each_entry(bypass, &bypass_list, list) {
+		mutex_lock(&bypass->lock);
+		list_for_each_entry(bypass_instance, &bypass->instance_list,
+				    list) {
+			bypass_netdev =
+				rcu_dereference(bypass_instance->bypass_netdev);
+			if (ether_addr_equal(bypass_netdev->perm_addr, mac)) {
+				mutex_unlock(&bypass->lock);
+				goto out;
+			}
+		}
+		mutex_unlock(&bypass->lock);
+	}
+
+	bypass_instance = NULL;
+out:
+	return bypass_instance;
+}
+
+static int bypass_register_child(struct net_device *child_netdev)
+{
+	struct bypass_instance *bypass_instance;
+	struct bypass *bypass;
+	struct net_device *bypass_netdev;
+	int ret, orig_mtu;
+
+	ASSERT_RTNL();
+
+	mutex_lock(&bypass_mutex);
+	bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
+	if (!bypass_instance) {
+		mutex_unlock(&bypass_mutex);
+		goto done;
+	}
+
+	bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
+	bypass = bypass_instance->bypass;
+	mutex_unlock(&bypass_mutex);
+
+	if (!bypass->ops->register_child)
+		goto done;
+
+	ret = bypass->ops->register_child(bypass_netdev, child_netdev);
+	if (ret != 0)
+		goto done;
+
+	ret = netdev_rx_handler_register(child_netdev,
+					 bypass->ops->handle_frame,
+					 bypass_netdev);
+	if (ret != 0) {
+		netdev_err(child_netdev,
+			   "can not register bypass rx handler (err = %d)\n",
+			   ret);
+		goto rx_handler_failed;
+	}
+
+	ret = netdev_upper_dev_link(child_netdev, bypass_netdev, NULL);
+	if (ret != 0) {
+		netdev_err(child_netdev,
+			   "can not set master device %s (err = %d)\n",
+			   bypass_netdev->name, ret);
+		goto upper_link_failed;
+	}
+
+	child_netdev->flags |= IFF_SLAVE;
+
+	if (netif_running(bypass_netdev)) {
+		ret = dev_open(child_netdev);
+		if (ret && (ret != -EBUSY)) {
+			netdev_err(bypass_netdev,
+				   "Opening child %s failed ret:%d\n",
+				   child_netdev->name, ret);
+			goto err_interface_up;
+		}
+	}
+
+	/* Align MTU of child with master */
+	orig_mtu = child_netdev->mtu;
+	ret = dev_set_mtu(child_netdev, bypass_netdev->mtu);
+	if (ret != 0) {
+		netdev_err(bypass_netdev,
+			   "unable to change mtu of %s to %u register failed\n",
+			   child_netdev->name, bypass_netdev->mtu);
+		goto err_set_mtu;
+	}
+
+	ret = bypass->ops->join_child(bypass_netdev, child_netdev);
+	if (ret != 0)
+		goto err_join;
+
+	call_netdevice_notifiers(NETDEV_JOIN, child_netdev);
+
+	goto done;
+
+err_join:
+	dev_set_mtu(child_netdev, orig_mtu);
+err_set_mtu:
+	dev_close(child_netdev);
+err_interface_up:
+	netdev_upper_dev_unlink(child_netdev, bypass_netdev);
+	child_netdev->flags &= ~IFF_SLAVE;
+upper_link_failed:
+	netdev_rx_handler_unregister(child_netdev);
+rx_handler_failed:
+	bypass->ops->unregister_child(bypass_netdev, child_netdev);
+
+done:
+	return NOTIFY_DONE;
+}
+
+int bypass_unregister_child(struct net_device *child_netdev)
+{
+	struct bypass_instance *bypass_instance;
+	struct net_device *bypass_netdev;
+	struct bypass *bypass;
+	int ret;
+
+	ASSERT_RTNL();
+
+	mutex_lock(&bypass_mutex);
+	bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
+	if (!bypass_instance) {
+		mutex_unlock(&bypass_mutex);
+		goto done;
+	}
+
+	bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
+	bypass = bypass_instance->bypass;
+	mutex_unlock(&bypass_mutex);
+
+	ret = bypass->ops->release_child(bypass_netdev, child_netdev);
+	if (ret != 0)
+		goto done;
+
+	netdev_rx_handler_unregister(child_netdev);
+	netdev_upper_dev_unlink(child_netdev, bypass_netdev);
+	child_netdev->flags &= ~IFF_SLAVE;
+
+	if (!bypass->ops->unregister_child)
+		goto done;
+
+	bypass->ops->unregister_child(bypass_netdev, child_netdev);
+
+done:
+	return NOTIFY_DONE;
+}
+EXPORT_SYMBOL(bypass_unregister_child);
+
+static int bypass_update_link(struct net_device *child_netdev)
+{
+	struct bypass_instance *bypass_instance;
+	struct net_device *bypass_netdev;
+	struct bypass *bypass;
+
+	ASSERT_RTNL();
+
+	mutex_lock(&bypass_mutex);
+	bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
+	if (!bypass_instance) {
+		mutex_unlock(&bypass_mutex);
+		goto done;
+	}
+
+	bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
+	bypass = bypass_instance->bypass;
+	mutex_unlock(&bypass_mutex);
+
+	if (!bypass->ops->update_link)
+		goto done;
+
+	bypass->ops->update_link(bypass_netdev, child_netdev);
+
+done:
+	return NOTIFY_DONE;
+}
+
+static bool bypass_validate_child_dev(struct net_device *dev)
+{
+	/* Avoid non-Ethernet type devices */
+	if (dev->type != ARPHRD_ETHER)
+		return false;
+
+	/* Avoid Vlan dev with same MAC registering as VF */
+	if (is_vlan_dev(dev))
+		return false;
+
+	/* Avoid Bonding master dev with same MAC registering as child dev */
+	if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
+		return false;
+
+	return true;
+}
+
+static int
+bypass_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+	struct bypass *bypass;
+
+	/* Skip Parent events */
+	mutex_lock(&bypass_mutex);
+	list_for_each_entry(bypass, &bypass_list, list) {
+		if (event_dev->netdev_ops == bypass->netdev_ops) {
+			mutex_unlock(&bypass_mutex);
+			return NOTIFY_DONE;
+		}
+	}
+	mutex_unlock(&bypass_mutex);
+
+	if (!bypass_validate_child_dev(event_dev))
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		return bypass_register_child(event_dev);
+	case NETDEV_UNREGISTER:
+		return bypass_unregister_child(event_dev);
+	case NETDEV_UP:
+	case NETDEV_DOWN:
+	case NETDEV_CHANGE:
+		return bypass_update_link(event_dev);
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block bypass_notifier = {
+	.notifier_call = bypass_event,
+};
+
+static void bypass_register_existing_child(struct net_device *bypass_netdev)
+{
+	struct net *net = dev_net(bypass_netdev);
+	struct net_device *dev;
+
+	rtnl_lock();
+	for_each_netdev(net, dev) {
+		if (dev == bypass_netdev)
+			continue;
+		if (!bypass_validate_child_dev(dev))
+			continue;
+		if (ether_addr_equal(bypass_netdev->perm_addr, dev->perm_addr))
+			bypass_register_child(dev);
+	}
+	rtnl_unlock();
+}
+
+int bypass_register_instance(struct bypass *bypass, struct net_device *dev)
+{
+	struct bypass_instance *bypass_instance;
+	struct net_device *bypass_netdev;
+	int ret = 0;
+
+	mutex_lock(&bypass->lock);
+	list_for_each_entry(bypass_instance, &bypass->instance_list, list) {
+		bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
+		if (bypass_netdev == dev) {
+			ret = -EEXIST;
+			goto done;
+		}
+	}
+
+	bypass_instance = bypass_instance_alloc(dev);
+	if (!bypass_instance) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	bypass_instance->bypass = bypass;
+	list_add_tail(&bypass_instance->list, &bypass->instance_list);
+
+done:
+	mutex_unlock(&bypass->lock);
+	bypass_register_existing_child(dev);
+	return ret;
+}
+EXPORT_SYMBOL(bypass_register_instance);
+
+int bypass_unregister_instance(struct bypass *bypass, struct net_device *dev)
+{
+	struct bypass_instance *bypass_instance;
+	struct net_device *bypass_netdev;
+	int ret = 0;
+
+	mutex_lock(&bypass->lock);
+	list_for_each_entry(bypass_instance, &bypass->instance_list, list) {
+		bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
+		if (bypass_netdev == dev) {
+			list_del(&bypass_instance->list);
+			bypass_instance_free(bypass_instance);
+			goto done;
+		}
+	}
+
+	ret = -ENOENT;
+done:
+	mutex_unlock(&bypass->lock);
+	return ret;
+}
+EXPORT_SYMBOL(bypass_unregister_instance);
+
+struct bypass *bypass_register_driver(const struct bypass_ops *ops,
+				      const struct net_device_ops *netdev_ops)
+{
+	struct bypass *bypass;
+
+	bypass = kzalloc(sizeof(*bypass), GFP_KERNEL);
+	if (!bypass)
+		return NULL;
+
+	bypass->ops = ops;
+	bypass->netdev_ops = netdev_ops;
+	INIT_LIST_HEAD(&bypass->instance_list);
+
+	mutex_lock(&bypass_mutex);
+	list_add_tail(&bypass->list, &bypass_list);
+	mutex_unlock(&bypass_mutex);
+
+	return bypass;
+}
+EXPORT_SYMBOL_GPL(bypass_register_driver);
+
+void bypass_unregister_driver(struct bypass *bypass)
+{
+	mutex_lock(&bypass_mutex);
+	list_del(&bypass->list);
+	mutex_unlock(&bypass_mutex);
+
+	kfree(bypass);
+}
+EXPORT_SYMBOL_GPL(bypass_unregister_driver);
+
+static __init int
+bypass_init(void)
+{
+	register_netdevice_notifier(&bypass_notifier);
+
+	return 0;
+}
+module_init(bypass_init);
+
+static __exit
+void bypass_exit(void)
+{
+	unregister_netdevice_notifier(&bypass_notifier);
+}
+module_exit(bypass_exit);
+
+MODULE_DESCRIPTION("Bypass infrastructure/interface for Paravirtual drivers");
+MODULE_LICENSE("GPL v2");
-- 
2.14.3

^ permalink raw reply related

* [RFC PATCH net-next v5 0/4] Enable virtio_net to act as a backup for a passthru device
From: Sridhar Samudrala @ 2018-04-05 21:08 UTC (permalink / raw)
  To: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, sridhar.samudrala,
	jasowang, loseweigh, jiri

The main motivation for this patch is to enable cloud service providers
to provide an accelerated datapath to virtio-net enabled VMs in a 
transparent manner with no/minimal guest userspace changes. This also
enables hypervisor controlled live migration to be supported with VMs that
have direct attached SR-IOV VF devices.

Patch 1 introduces a new feature bit VIRTIO_NET_F_BACKUP that can be
used by hypervisor to indicate that virtio_net interface should act as
a backup for another device with the same MAC address.

Patch 2 introduces a bypass module that provides a generic interface for 
paravirtual drivers to listen for netdev register/unregister/link change
events from pci ethernet devices with the same MAC and takeover their
datapath. The notifier and event handling code is based on the existing
netvsc implementation. A paravirtual driver can use this module by
registering a set of ops and each instance of the device when it is probed.

Patch 3 extends virtio_net to use alternate datapath when available and
registered. When BACKUP feature is enabled, virtio_net driver creates
an additional 'bypass' netdev that acts as a master device and controls
2 slave devices.  The original virtio_net netdev is registered as
'backup' netdev and a passthru/vf device with the same MAC gets
registered as 'active' netdev. Both 'bypass' and 'backup' netdevs are
associated with the same 'pci' device.  The user accesses the network
interface via 'bypass' netdev. The 'bypass' netdev chooses 'active' netdev
as default for transmits when it is available with link up and running.

Patch 4 refactors netvsc to use the registration/notification framework
supported by bypass module.

As this patch series is initially focusing on usecases where hypervisor 
fully controls the VM networking and the guest is not expected to directly 
configure any hardware settings, it doesn't expose all the ndo/ethtool ops
that are supported by virtio_net at this time. To support additional usecases,
it should be possible to enable additional ops later by caching the state
in virtio netdev and replaying when the 'active' netdev gets registered. 
 
The hypervisor needs to enable only one datapath at any time so that packets
don't get looped back to the VM over the other datapath. When a VF is
plugged, the virtio datapath link state can be marked as down.
At the time of live migration, the hypervisor needs to unplug the VF device
from the guest on the source host and reset the MAC filter of the VF to
initiate failover of datapath to virtio before starting the migration. After
the migration is completed, the destination hypervisor sets the MAC filter
on the VF and plugs it back to the guest to switch over to VF datapath.

This patch is based on the discussion initiated by Jesse on this thread.
https://marc.info/?l=linux-virtualization&m=151189725224231&w=2

v5 RFC:
  Based on Jiri's comments, moved the common functionality to a 'bypass'
  module so that the same notifier and event handlers to handle child
  register/unregister/link change events can be shared between virtio_net
  and netvsc.
  Improved error handling based on Siwei's comments.
v4:
- Based on the review comments on the v3 version of the RFC patch and
  Jakub's suggestion for the naming issue with 3 netdev solution,
  proposed 3 netdev in-driver bonding solution for virtio-net.
v3 RFC:
- Introduced 3 netdev model and pointed out a couple of issues with
  that model and proposed 2 netdev model to avoid these issues.
- Removed broadcast/multicast optimization and only use virtio as
  backup path when VF is unplugged.
v2 RFC:
- Changed VIRTIO_NET_F_MASTER to VIRTIO_NET_F_BACKUP (mst)
- made a small change to the virtio-net xmit path to only use VF datapath
  for unicasts. Broadcasts/multicasts use virtio datapath. This avoids
  east-west broadcasts to go over the PCI link.
- added suppport for the feature bit in qemu

Sridhar Samudrala (4):
  virtio_net: Introduce VIRTIO_NET_F_BACKUP feature bit
  net: Introduce generic bypass module
  virtio_net: Extend virtio to use VF datapath when available
  netvsc: refactor notifier/event handling code to use the bypass
    framework

 drivers/net/Kconfig             |   1 +
 drivers/net/hyperv/Kconfig      |   1 +
 drivers/net/hyperv/netvsc_drv.c | 219 ++++----------
 drivers/net/virtio_net.c        | 614 +++++++++++++++++++++++++++++++++++++++-
 include/net/bypass.h            |  80 ++++++
 include/uapi/linux/virtio_net.h |   3 +
 net/Kconfig                     |  18 ++
 net/core/Makefile               |   1 +
 net/core/bypass.c               | 406 ++++++++++++++++++++++++++
 9 files changed, 1184 insertions(+), 159 deletions(-)
 create mode 100644 include/net/bypass.h
 create mode 100644 net/core/bypass.c

-- 
2.14.3

^ permalink raw reply

* Re: [PATCH net-next 6/6] netdevsim: Add simple FIB resource controller via devlink
From: David Ahern @ 2018-04-05 21:06 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, roopa, shm, jiri, idosch, jakub.kicinski,
	andy.roulin
In-Reply-To: <a916f016-5d8b-3f56-0a84-95d1712bec4c@cumulusnetworks.com>

On 4/5/18 2:10 PM, David Ahern wrote:
> 
> The ASIC here is the kernel tables in a namespace. It does not make
> sense to have 2 devlink instances for a single namespace.

I put this example controller in netdevsim per a suggestion from Ido.
The netdevsim seemed like a good idea given that modules intention --
testing network facilities. Perhaps I should have done this as a
completely standalone module ...

The intention is to treat the kernel's tables *per namespace* as a
standalone entity that can be managed very similar to ASIC resources.
Given that I can add a resource controller module
(drivers/net/kern_res_mgr.c?) that creates a 'struct device' per network
namespace with a devlink instance. In this case the device would very
much be tied to the namespace 1:1.

^ permalink raw reply

* Re: [patch net] devlink: convert occ_get op to separate registration
From: David Ahern @ 2018-04-05 20:58 UTC (permalink / raw)
  To: Jiri Pirko, netdev; +Cc: davem, idosch, jakub.kicinski, mlxsw
In-Reply-To: <0880af20-f0a2-bc19-6b1d-0e42a7937184@gmail.com>

On 4/5/18 2:55 PM, David Ahern wrote:
> @@ -154,6 +155,12 @@ void *mlxsw_core_driver_priv(struct mlxsw_core *mlxsw_core)
>  }
>  EXPORT_SYMBOL(mlxsw_core_driver_priv);
>  
> +bool mlxsw_core_reload_in_progress(struct mlxsw_core *mlxsw_core)
> +{
> +	return mlxsw_core->mlxsw_core_driver_priv;

Oops, that is supposed to be:
	return mlxsw_core->reload_in_progress;

but I think you get the point.

> +}
> +EXPORT_SYMBOL(mlxsw_core_reload_in_progress);
> +
>  struct mlxsw_rx_listener_item {
>  	struct list_head list;
>  	struct mlxsw_rx_listener rxl;

^ permalink raw reply

* Re: [patch net] devlink: convert occ_get op to separate registration
From: David Ahern @ 2018-04-05 20:55 UTC (permalink / raw)
  To: Jiri Pirko, netdev; +Cc: davem, idosch, jakub.kicinski, mlxsw
In-Reply-To: <20180405201321.16626-1-jiri@resnulli.us>

[-- Attachment #1: Type: text/plain, Size: 1891 bytes --]

On 4/5/18 2:13 PM, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@mellanox.com>
> 
> This resolves race during initialization where the resources with
> ops are registered before driver and the structures used by occ_get
> op is initialized. So keep occ_get callbacks registered only when
> all structs are initialized.
> 
> The example flows, as it is in mlxsw:
> 1) driver load/asic probe:
>    mlxsw_core
>       -> mlxsw_sp_resources_register
>         -> mlxsw_sp_kvdl_resources_register
>           -> devlink_resource_register IDX
>    mlxsw_spectrum
>       -> mlxsw_sp_kvdl_init
>         -> mlxsw_sp_kvdl_parts_init
>           -> mlxsw_sp_kvdl_part_init
>             -> devlink_resource_size_get IDX (to get the current setup
>                                               size from devlink)
>         -> devlink_resource_occ_get_register IDX (register current
>                                                   occupancy getter)
> 2) reload triggered by devlink command:
>   -> mlxsw_devlink_core_bus_device_reload
>     -> mlxsw_sp_fini
>       -> mlxsw_sp_kvdl_fini
> 	-> devlink_resource_occ_get_unregister IDX
>     (struct mlxsw_sp *mlxsw_sp is freed at this point, call to occ get
>      which is using mlxsw_sp would cause use-after free)
>     -> mlxsw_sp_init
>       -> mlxsw_sp_kvdl_init
>         -> mlxsw_sp_kvdl_parts_init
>           -> mlxsw_sp_kvdl_part_init
>             -> devlink_resource_size_get IDX (to get the current setup
>                                               size from devlink)
>         -> devlink_resource_occ_get_register IDX (register current
>                                                   occupancy getter)
> 
> Fixes: d9f9b9a4d05f ("devlink: Add support for resource abstraction")
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---


Why won't something like the attached work as opposed to playing
register / unregister games?

[-- Attachment #2: mlxsw-reload.patch --]
[-- Type: text/plain, Size: 4748 bytes --]

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 93ea56620a24..dcded613faa6 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -113,6 +113,7 @@ struct mlxsw_core {
 	struct mlxsw_thermal *thermal;
 	struct mlxsw_core_port *ports;
 	unsigned int max_ports;
+	bool reload_in_progress;
 	bool reload_fail;
 	unsigned long driver_priv[0];
 	/* driver_priv has to be always the last item */
@@ -154,6 +155,12 @@ void *mlxsw_core_driver_priv(struct mlxsw_core *mlxsw_core)
 }
 EXPORT_SYMBOL(mlxsw_core_driver_priv);
 
+bool mlxsw_core_reload_in_progress(struct mlxsw_core *mlxsw_core)
+{
+	return mlxsw_core->mlxsw_core_driver_priv;
+}
+EXPORT_SYMBOL(mlxsw_core_reload_in_progress);
+
 struct mlxsw_rx_listener_item {
 	struct list_head list;
 	struct mlxsw_rx_listener rxl;
@@ -972,12 +979,16 @@ static int mlxsw_devlink_core_bus_device_reload(struct devlink *devlink)
 	if (!mlxsw_bus->reset)
 		return -EOPNOTSUPP;
 
+	mlxsw_core->reload_in_progress = true;
+
 	mlxsw_core_bus_device_unregister(mlxsw_core, true);
 	mlxsw_bus->reset(mlxsw_core->bus_priv);
 	err = mlxsw_core_bus_device_register(mlxsw_core->bus_info,
 					     mlxsw_core->bus,
 					     mlxsw_core->bus_priv, true,
 					     devlink);
+	mlxsw_core->reload_in_progress = false;
+
 	if (err)
 		mlxsw_core->reload_fail = true;
 	return err;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 092d39399f3c..ffa1db154757 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -60,6 +60,7 @@ struct mlxsw_bus_info;
 unsigned int mlxsw_core_max_ports(const struct mlxsw_core *mlxsw_core);
 
 void *mlxsw_core_driver_priv(struct mlxsw_core *mlxsw_core);
+bool mlxsw_core_reload_in_progress(struct mlxsw_core *mlxsw_core);
 
 int mlxsw_core_driver_register(struct mlxsw_driver *mlxsw_driver);
 void mlxsw_core_driver_unregister(struct mlxsw_driver *mlxsw_driver);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 53fffd09d133..09b89af37d8a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3808,8 +3808,12 @@ static const struct mlxsw_config_profile mlxsw_sp_config_profile = {
 static u64 mlxsw_sp_resource_kvd_linear_occ_get(struct devlink *devlink)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
-	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	struct mlxsw_sp *mlxsw_sp;
+
+	if (mlxsw_core_reload_in_progress(mlxsw_core))
+		return 0;
 
+	mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
 	return mlxsw_sp_kvdl_occ_get(mlxsw_sp);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
index 8796db44dcc3..dd66285bafb5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
@@ -329,9 +329,13 @@ u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp)
 static u64 mlxsw_sp_kvdl_single_occ_get(struct devlink *devlink)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
-	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
 	struct mlxsw_sp_kvdl_part *part;
+	struct mlxsw_sp *mlxsw_sp;
 
+	if (mlxsw_core_reload_in_progress(mlxsw_core))
+		return 0;
+
+	mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
 	part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_SINGLE];
 	return mlxsw_sp_kvdl_part_occ(part);
 }
@@ -339,8 +343,13 @@ static u64 mlxsw_sp_kvdl_single_occ_get(struct devlink *devlink)
 static u64 mlxsw_sp_kvdl_chunks_occ_get(struct devlink *devlink)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
-	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
 	struct mlxsw_sp_kvdl_part *part;
+	struct mlxsw_sp *mlxsw_sp;
+
+	if (mlxsw_core_reload_in_progress(mlxsw_core))
+		return 0;
+
+	mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
 
 	part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_CHUNKS];
 	return mlxsw_sp_kvdl_part_occ(part);
@@ -349,9 +358,13 @@ static u64 mlxsw_sp_kvdl_chunks_occ_get(struct devlink *devlink)
 static u64 mlxsw_sp_kvdl_large_chunks_occ_get(struct devlink *devlink)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
-	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
 	struct mlxsw_sp_kvdl_part *part;
+	struct mlxsw_sp *mlxsw_sp;
+
+	if (mlxsw_core_reload_in_progress(mlxsw_core))
+		return 0;
 
+	mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
 	part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_LARGE_CHUNKS];
 	return mlxsw_sp_kvdl_part_occ(part);
 }

^ permalink raw reply related

* Re: Best userspace programming API for XDP features query to kernel?
From: Jesper Dangaard Brouer via iovisor-dev @ 2018-04-05 20:51 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: oisf-devel-ZwoEplunGu2j570ONfqVQLVmwVP6tfMwSoIsB4E12gc,
	Jakub Kicinski, Alexei Starovoitov, Victor Julien,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	iovisor-dev-9jONkmmOlFHEE9lA1F8Ukti2O/JbrIOy@public.gmane.org,
	Peter Manev, Jiri Benc, Saeed Mahameed, Eric Leblond,
	Daniel Borkmann
In-Reply-To: <95ac793b-66aa-b53c-a9bc-c86cbc47ee1a-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org>

On Thu, 5 Apr 2018 12:37:19 +0200
Daniel Borkmann <daniel-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org> wrote:

> On 04/04/2018 02:28 PM, Jesper Dangaard Brouer via iovisor-dev wrote:
> > Hi Suricata people,
> > 
> > When Eric Leblond (and I helped) integrated XDP in Suricata, we ran
> > into the issue, that at Suricata load/start time, we cannot determine
> > if the chosen XDP config options, like xdp-cpu-redirect[1], is valid on
> > this HW (e.g require driver XDP_REDIRECT support and bpf cpumap).
> > 
> > We would have liked a way to report that suricata.yaml config was
> > invalid for this hardware/setup.  Now, it just loads, and packets gets
> > silently dropped by XDP (well a WARN_ONCE and catchable via tracepoints).
> > 
> > My question to suricata developers: (Q1) Do you already have code that
> > query the kernel or drivers for features?
> > 
> > At the IOvisor call (2 weeks ago), we discussed two options of exposing
> > XDP features avail in a given driver.
> > 
> > Option#1: Extend existing ethtool -k/-K "offload and other features"
> > with some XDP features, that userspace can query. (Do you already query
> > offloads, regarding Q1)
> > 
> > Option#2: Invent a new 'ip link set xdp' netlink msg with a query option.  
> 
> I don't really mind if you go via ethtool, as long as we handle this
> generically from there and e.g. call the dev's ndo_bpf handler such that
> we keep all the information in one place. This can be a new ndo_bpf command
> e.g. XDP_QUERY_FEATURES or such.

Just to be clear: notice as Victor points out[2], they are programmable
going though the IOCTL (SIOCETHTOOL) and not using cmdline tools.

[2] https://github.com/OISF/suricata/blob/master/src/util-ioctl.c#L326

If you want everything to go through the drivers ndo_bpf call anyway
(which userspace API is netlink based) then at what point to you
want drivers to call their own ndo_bpf, when activated though their
ethtool_ops ? (Sorry, but I don't follow the flow you are proposing)

Notice, I'm not directly against using the drivers ndo_bpf call.  I can
see it does provide kernel more flexibility than the ethtool IOCTL.


> More specifically, how would such feature mask look like? How fine grained
> would this be? When you add a new minor feature to, say, cpumap that not
> all drivers support yet, we'd need a new flag each time, no?

No, CPUMAP is not a driver level feature, and thus does not require a
features flag exposed by the driver.  CPUMAP depends on the driver
feature XDP_REDIRECT.

It is important we separate driver-level features and BPF/XDP core
features.  I feel that we constantly talk past each-other when we mix
that up.

It is true that Suricata _also_ need to detect if the running kernel
support the map type called: BPF_MAP_TYPE_CPUMAP.  *BUT* that is a
completely separate mechanism.  It is a core kernel bpf feature, and I
have accepted that this can only be done via probing the kernel (simply
use the bpf syscall and try to create this map type).

Here, I want to discuss how drivers expose/tell userspace that they
support a given feature: Specifically a bit for: XDP_REDIRECT action
support.


> Same for meta data,

Well, not really.  It would be a "nice-to-have", but not strictly
needed as a feature bit.  XDP meta-data is controlled via a helper.
And the BPF-prog can detect/see runtime, that the helper bpf_xdp_adjust_meta
returns -ENOTSUPP (and need to check the ret value anyhow).  Thus,
there is that not much gained by exposing this to be detected setup
time, as all drivers should eventually support this, and we can detect
it runtime.

The missing XDP_REDIRECT action features bit it different, as the
BPF-prog cannot detect runtime that this is an unsupported action.
Plus, setup time we cannot query the driver for supported XDP actions.


> then potentially for the redirect memory return work,

I'm not sure the redirect memory return types, belong here.  First of
all it is per RX-ring.  Second, some other config method will likely be
config interface.  Like with AF_XDP-zero-copy it is a new NDO. So, it
is more losely coupled.

> or the af_xdp bits, 

No need for bit for AF_XDP in copy-mode (current RFC), as this only
depend on driver supporting XDP_REDIRECT action.

For AF_XDP in zero-copy mode, then yes we need a way for userspace to
"see" if this mode is supported by the driver.  But it might not need a
feature bit here... as the bind() call (which knows the ifindex) could
fail when it tried to enable this ZC mode.  It would make userspace's
live easier to add ZC as a driver feature bit.


> the xdp_rxq_info would have needed it, etc. 

Same comment as mem-type, not necessarily, as it is more losely coupled
to XDP.

> What about nfp in terms of XDP
> offload capabilities, should they be included as well or is probing to load
> the program and see if it loads/JITs as we do today just fine (e.g. you'd
> otherwise end up with extra flags on a per BPF helper basis)?

No, flags per BPF helper basis. As I've described above, helper belong
to the BPF core, not the driver.  Here I want to know what the specific
driver support.

> To make a
> somewhat reliable assertion whether feature xyz would work, this would
> explode in new feature bits long term. Additionally, if we end up with a
> lot of feature flags, it will be very hard for users to determine whether
> this particular set of features a driver supports actually represents a
> fully supported native XDP driver.

Think about it, what does a "fully supported native XDP driver" mean,
when the kernel evolve and new features gets added.  How will the
end-user know what XDP features are in their running kernel release?

> What about keeping this high level to users? E.g. say you have 2 options
> that drivers can expose as netdev_features_strings 'xdp-native-full' or
> 'xdp-native-partial'. If a driver truly supports all XDP features for a
> given kernel e.g. v4.16, then a query like 'ethtool -k foo' will say
> 'xdp-native-full', if at least one feature is missing to be feature complete
> from e.g. above list, then ethtool will tell 'xdp-native-partial', and if
> not even ndo_bpf callback exists then no 'xdp-native-*' is reported.

I use-to-be, an advocate for this.  I even think I send patches
implementing this. Later, I've realized that this model is flawed.

When e.g. suricata loads it need to look at both "xdp-native-full" and
the kernel version, to determine if XDP_REDIRECT action is avail.
Later when a new kernel version gets released, the driver is missing a
new XDP feature.  Then suricata, which doesn't use/need the new
feature, need to be updated, to check that kernel below this version,
with 'xdp-native-partial' and this NIC driver is still okay.  Can you
see the problem?

Even if Suricate goes though the pain of keeping track of kernel
version vs drivers vs xdp-native-full/partial.  Then, they also want to
run their product on distro kernels.  They might convince some distro,
to backport some XDP features they need.  So, now they also need to
keep track of distro kernel minor versions... and all they really
wanted as a single feature bit saying if the running NIC driver
supports the XDP_REDIRECT action code.

 
> Side-effect might be that it would give incentive to keep drivers in
> state 'xdp-native-full' instead of being downgraded to
> 'xdp-native-partial'. Potentially, in the 'xdp-native-partial' state,
> we can expose a high-level list of missing features that the driver
> does not support yet, which would over time converge towards 'zero'
> and thus 'xdp-native-full' again. ethtool itself could get a new XDP
> specific query option that, based on this info, can then dump the
> full list of supported and not supported features. In order for this
> to not explode, such features would need to be kept on a high-level
> basis, meaning if e.g. cpumap gets extended along with support for a
> number of drivers, then those that missed out would need to be
> temporarily re-flagged with e.g. 'cpumap not supported' until it gets
> also implemented there. That way, we don't explode in adding too
> fine-grained feature bit combinations long term and make it easier to
> tell whether a driver supports the full set in native XDP or not.
> Thoughts?

(I really liked creating an incentive for driver vendors)
Thoughs inlined above...
 
> > (Q2) Do Suricata devs have any preference (or other options/ideas)
> > for the way the kernel expose this info to userspace?
> > 
> > [1]
> > http://suricata.readthedocs.io/en/latest/capture-hardware/ebpf-xdp.html#the-xdp-cpu-redirect-case  

Regarding how fine grained features bits I want.  I also want to mention
that, I want the drivers XDP_REDIRECT action support to be decoupled
from whether the driver support ndo_xdp_xmit, and if ndo_xdp_xmit is
enabled or disabled.
  E.g. for the macvlan driver, I don't see much performance gain in
implementing the native XDP-RX actions "side", while there will be a
huge performance gain in implemeting ndo_xdp_xmit.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: marvell switch
From: Andrew Lunn @ 2018-04-05 20:46 UTC (permalink / raw)
  To: Ran Shalit; +Cc: netdev
In-Reply-To: <CAJ2oMhL4=ZrkGaTgTXU-oJFhZP==QDkO4fNSBaaTJF+NHnf9-g@mail.gmail.com>

> > Hi Ran
> >
> > The Marvell driver makes each port act like a normal Linux network
> > interface. So if you want to enable a port, do
> >
> > ip link set lan0 up
> >
> > Want to add an ip address to a port
> >
> > ip addr add 10.42.42.42/24 dev lan0
> >
> > Want to bridge two ports
> >
> > ip link add name br0 type bridge
> > ip link set dev br0 up
> > ip link set dev lan0 master br0
> > ip link set dev lan1 master br0
> >
> > Just treat them as normal interfaces.
> >
> 
> If I may please ask,
> What is the purpose of using bridge for configuring switch interfaces.
> Is it in order to isolate some ports from others?
> I ask because according to my understanding the default configuration of
> the driver is to enable switch in "flat" configuration, i.e. as if all
> ports are connected to each other.

Please think about what i said. They are standard Linux network
interfaces. Do standard Linux network interfaces bridge themselves
together by default? No, you need to configure a bridge.

	 Andrew

^ permalink raw reply

* Re: [PATCH 2/2] net: phy: dp83640: Read strapped configuration settings
From: Andrew Lunn @ 2018-04-05 20:40 UTC (permalink / raw)
  To: Esben Haabendal
  Cc: Florian Fainelli, Richard Cochran,
	open list:PTP HARDWARE CLOCK SUPPORT, open list, Rasmus Villemoes
In-Reply-To: <87d0zdjszu.fsf@haabendal.dk>

On Thu, Apr 05, 2018 at 10:30:45PM +0200, Esben Haabendal wrote:
> Florian Fainelli <f.fainelli@gmail.com> writes:
> 
> > On 04/05/2018 04:44 AM, esben.haabendal@gmail.com wrote:
> >> From: Esben Haabendal <eha@deif.com>
> >> 
> >> Read configration settings, to allow automatic forced speed/duplex setup
> >> by hardware strapping.
> >
> > OK but why? What problem is this solving for you?
> 
> It is ensuring that the PHY is configured according to the HW design.
> If the HW design has set the strap configuration to fx. fixed 100 Mbit
> full-duplex, this avoids Linux configuring it for auto-negotiation.

Hi Esben

Are you sure it contains the HW strapping? Is the driver hitting the
PHY with a hard reset to make it go back to the strapping defaults?
Or could it still contain whatever state the last boot of Linux, or
maybe the bootloader, left the PHY in?

      Andrew

^ permalink raw reply

* [PATCH v2] net: phy: marvell: Enable interrupt function on LED2 pin
From: Esben Haabendal @ 2018-04-05 20:40 UTC (permalink / raw)
  To: netdev
  Cc: Esben Haabendal, Rasmus Villemoes, Andrew Lunn, Florian Fainelli,
	open list
In-Reply-To: <20180405133504.12257-1-esben.haabendal@gmail.com>

From: Esben Haabendal <eha@deif.com>

The LED2[2]/INTn pin on Marvell 88E1318S as well as 88E1510/12/14/18 needs
to be configured to be usable as interrupt not only when WOL is enabled,
but whenever we rely on interrupts from the PHY.

Signed-off-by: Esben Haabendal <eha@deif.com>
Cc: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
---
 drivers/net/phy/marvell.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 0e0978d8a0eb..a6ad0255c512 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -828,6 +828,22 @@ static int m88e1121_config_init(struct phy_device *phydev)
 	return marvell_config_init(phydev);
 }
 
+static int m88e1318_config_init(struct phy_device *phydev)
+{
+	if (phy_interrupt_is_valid(phydev)) {
+		int err = phy_modify_paged(
+			phydev, MII_MARVELL_LED_PAGE,
+			MII_88E1318S_PHY_LED_TCR,
+			MII_88E1318S_PHY_LED_TCR_FORCE_INT,
+			MII_88E1318S_PHY_LED_TCR_INTn_ENABLE |
+			MII_88E1318S_PHY_LED_TCR_INT_ACTIVE_LOW);
+		if (err < 0)
+			return err;
+	}
+
+	return m88e1121_config_init(phydev);
+}
+
 static int m88e1510_config_init(struct phy_device *phydev)
 {
 	int err;
@@ -870,7 +886,7 @@ static int m88e1510_config_init(struct phy_device *phydev)
 		phydev->advertising &= ~pause;
 	}
 
-	return m88e1121_config_init(phydev);
+	return m88e1318_config_init(phydev);
 }
 
 static int m88e1118_config_aneg(struct phy_device *phydev)
@@ -2086,7 +2102,7 @@ static struct phy_driver marvell_drivers[] = {
 		.features = PHY_GBIT_FEATURES,
 		.flags = PHY_HAS_INTERRUPT,
 		.probe = marvell_probe,
-		.config_init = &m88e1121_config_init,
+		.config_init = &m88e1318_config_init,
 		.config_aneg = &m88e1318_config_aneg,
 		.read_status = &marvell_read_status,
 		.ack_interrupt = &marvell_ack_interrupt,
-- 
2.16.3

^ permalink raw reply related

* Re: [PATCH 2/2] net: phy: dp83640: Read strapped configuration settings
From: Esben Haabendal @ 2018-04-05 20:34 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Richard Cochran, Andrew Lunn,
	open list:PTP HARDWARE CLOCK SUPPORT, open list, Rasmus Villemoes
In-Reply-To: <95678797-bd17-ba3f-8a70-a00b4792a258@gmail.com>

Florian Fainelli <f.fainelli@gmail.com> writes:

> On 04/05/2018 04:44 AM, esben.haabendal@gmail.com wrote:
>> From: Esben Haabendal <eha@deif.com>
>> 
>> Read configration settings, to allow automatic forced speed/duplex setup
>> by hardware strapping.
>
> OK but why? What problem is this solving for you?

It is ensuring that the PHY is configured according to the HW design.
If the HW design has set the strap configuration to fx. fixed 100 Mbit
full-duplex, this avoids Linux configuring it for auto-negotiation.

> In general, we do not really want to preserve too much of what the PHY
> has been previously configured with,

Even when this "previous" configuration is the configuration selected by
the board configuration?

> provided that the PHY driver can re-instate these configuration
> values.

This is sort of what I try to do here.  The PHY driver needs to check
the BMCR register to figure out what the strapped configuration was.
Without that, it is necessary for software configuration to duplicate
the exact same configuration as is encoded in the strap configuration in
order for the PHY to be configured as it is supposed to.

> I just wonder how this can be robust when you connect this PHY with
> auto-negotiation disabled to a peer that expects a set of link
> parameters not covered by the default advertisement values?

I assume it will fail just as it will if you use ethtool to configure
the PHY wrongly.

> This really looks like a recipe for disaster when you could just
> disable auto-negotiation with ethtool.

The current PHY driver approach to always default to enable
auto-negotation, and then allow user-space to configure it differently
with ethtool.

With this patch, the default configuration is chosen based on the strap
configuration, but user-space can still change the configuration with
ethtool if needed / desired.

I don't think it is such a big change.

Bringing up the PHY with a default configuration according to HW
strapping instead of an in-kernel SW hard-coded value sounds like a good
idea to me.

/Esben

^ permalink raw reply

* Re: [PATCH 1/2] net: phy: Helper function for reading strapped configuration values
From: Esben Haabendal @ 2018-04-05 20:34 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Andrew Lunn, open list:ETHERNET PHY LIBRARY, open list,
	Rasmus Villemoes
In-Reply-To: <49448b00-e3fd-25df-580e-c02428f1bfe6@gmail.com>

Florian Fainelli <f.fainelli@gmail.com> writes:

> On 04/05/2018 04:44 AM, esben.haabendal@gmail.com wrote:
>> From: Esben Haabendal <eha@deif.com>
>> 
>> Add a function for use in PHY driver probe functions, reading current
>> autoneg, speed and duplex configuration from BMCR register.
>> 
>> Useful for PHY that supports hardware strapped configuration, enabling
>> Linux to respect that configuration (i.e. strapped non-autoneg
>> configuration).
>> 
>> Signed-off-by: Esben Haabendal <eha@deif.com>
>> Cc: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
>> ---
>>  drivers/net/phy/phy_device.c | 41 +++++++++++++++++++++++++++++++++++++++++
>>  include/linux/phy.h          |  1 +
>>  2 files changed, 42 insertions(+)
>> 
>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>> index 74664a6c0cdc..cc52ff2a2344 100644
>> --- a/drivers/net/phy/phy_device.c
>> +++ b/drivers/net/phy/phy_device.c
>> @@ -1673,6 +1673,47 @@ int genphy_config_init(struct phy_device *phydev)
>>  }
>>  EXPORT_SYMBOL(genphy_config_init);
>>  
>> +/**
>> + * genphy_read_config - read configuration from PHY
>> + * @phydev: target phy_device struct
>> + *
>> + * Description: Reads MII_BMCR and sets phydev autoneg, speed and duplex
>> + * accordingly.  For use in driver probe functions, to respect strapped
>> + * configuration settings.
>> + */
>> +int genphy_read_config(struct phy_device *phydev)
>
> This duplicates what already exists, in part at least within
> genphy_read_status() can you find a way to use that?

Make a small static function for updating duplex and speed fields from a
BMCR value.  It will not be big re-use, but it would make sense.  I will
do that in next patch version.

/Esben

^ permalink raw reply

* Re: [PATCH 2/2] net: phy: dp83640: Read strapped configuration settings
From: Esben Haabendal @ 2018-04-05 20:30 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Richard Cochran, Andrew Lunn,
	open list:PTP HARDWARE CLOCK SUPPORT, open list, Rasmus Villemoes
In-Reply-To: <95678797-bd17-ba3f-8a70-a00b4792a258@gmail.com>

Florian Fainelli <f.fainelli@gmail.com> writes:

> On 04/05/2018 04:44 AM, esben.haabendal@gmail.com wrote:
>> From: Esben Haabendal <eha@deif.com>
>> 
>> Read configration settings, to allow automatic forced speed/duplex setup
>> by hardware strapping.
>
> OK but why? What problem is this solving for you?

It is ensuring that the PHY is configured according to the HW design.
If the HW design has set the strap configuration to fx. fixed 100 Mbit
full-duplex, this avoids Linux configuring it for auto-negotiation.

> In general, we do not really want to preserve too much of what the PHY
> has been previously configured with,

Even when this "previous" configuration is the configuration selected by
the board configuration?

> provided that the PHY driver can re-instate these configuration
> values.

This is sort of what I try to do here.  The PHY driver needs to check
the BMCR register to figure out what the strapped configuration was.
Without that, it is necessary for software configuration to duplicate
the exact same configuration as is encoded in the strap configuration in
order for the PHY to be configured as it is supposed to.

> I just wonder how this can be robust when you connect this PHY with
> auto-negotiation disabled to a peer that expects a set of link
> parameters not covered by the default advertisement values?

I assume it will fail just as it will if you use ethtool to configure
the PHY wrongly.

> This really looks like a recipe for disaster when you could just
> disable auto-negotiation with ethtool.

The current PHY driver approach to always default to enable
auto-negotation, and then allow user-space to configure it differently
with ethtool.

With this patch, the default configuration is chosen based on the strap
configuration, but user-space can still change the configuration with
ethtool if needed / desired.

I don't think it is such a big change.

Bringing up the PHY with a default configuration according to HW
strapping instead of an in-kernel SW hard-coded value sounds like a good
idea to me.

/Esben

^ permalink raw reply

* Re: Enable and configure storm prevention in a network device
From: David Miller @ 2018-04-05 20:20 UTC (permalink / raw)
  To: m-karicheri2; +Cc: netdev
In-Reply-To: <76c2b7c7-4f85-c81e-c928-e5650cc26b93@ti.com>

From: Murali Karicheri <m-karicheri2@ti.com>
Date: Thu, 5 Apr 2018 16:14:49 -0400

> Is there a standard way to implement and configure storm prevention
> in a Linux network device?

What kind of "storm", an interrupt storm?

^ permalink raw reply

* Re: [PATCH 1/2] net: phy: Helper function for reading strapped configuration values
From: Esben Haabendal @ 2018-04-05 20:18 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Andrew Lunn, open list:ETHERNET PHY LIBRARY, open list,
	Rasmus Villemoes
In-Reply-To: <49448b00-e3fd-25df-580e-c02428f1bfe6@gmail.com>

Florian Fainelli <f.fainelli@gmail.com> writes:

> On 04/05/2018 04:44 AM, esben.haabendal@gmail.com wrote:
>> From: Esben Haabendal <eha@deif.com>
>> 
>> Add a function for use in PHY driver probe functions, reading current
>> autoneg, speed and duplex configuration from BMCR register.
>> 
>> Useful for PHY that supports hardware strapped configuration, enabling
>> Linux to respect that configuration (i.e. strapped non-autoneg
>> configuration).
>> 
>> Signed-off-by: Esben Haabendal <eha@deif.com>
>> Cc: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
>> ---
>>  drivers/net/phy/phy_device.c | 41 +++++++++++++++++++++++++++++++++++++++++
>>  include/linux/phy.h          |  1 +
>>  2 files changed, 42 insertions(+)
>> 
>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>> index 74664a6c0cdc..cc52ff2a2344 100644
>> --- a/drivers/net/phy/phy_device.c
>> +++ b/drivers/net/phy/phy_device.c
>> @@ -1673,6 +1673,47 @@ int genphy_config_init(struct phy_device *phydev)
>>  }
>>  EXPORT_SYMBOL(genphy_config_init);
>>  
>> +/**
>> + * genphy_read_config - read configuration from PHY
>> + * @phydev: target phy_device struct
>> + *
>> + * Description: Reads MII_BMCR and sets phydev autoneg, speed and duplex
>> + * accordingly.  For use in driver probe functions, to respect strapped
>> + * configuration settings.
>> + */
>> +int genphy_read_config(struct phy_device *phydev)
>
> This duplicates what already exists, in part at least within
> genphy_read_status() can you find a way to use that?

Make a small static function for updating duplex and speed fields from a
BMCR value.  It will not be big re-use, but it would make sense.  I will
do that in next patch version.

/Esben

^ permalink raw reply

* [patch net] devlink: convert occ_get op to separate registration
From: Jiri Pirko @ 2018-04-05 20:13 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, jakub.kicinski, dsahern, mlxsw

From: Jiri Pirko <jiri@mellanox.com>

This resolves race during initialization where the resources with
ops are registered before driver and the structures used by occ_get
op is initialized. So keep occ_get callbacks registered only when
all structs are initialized.

The example flows, as it is in mlxsw:
1) driver load/asic probe:
   mlxsw_core
      -> mlxsw_sp_resources_register
        -> mlxsw_sp_kvdl_resources_register
          -> devlink_resource_register IDX
   mlxsw_spectrum
      -> mlxsw_sp_kvdl_init
        -> mlxsw_sp_kvdl_parts_init
          -> mlxsw_sp_kvdl_part_init
            -> devlink_resource_size_get IDX (to get the current setup
                                              size from devlink)
        -> devlink_resource_occ_get_register IDX (register current
                                                  occupancy getter)
2) reload triggered by devlink command:
  -> mlxsw_devlink_core_bus_device_reload
    -> mlxsw_sp_fini
      -> mlxsw_sp_kvdl_fini
	-> devlink_resource_occ_get_unregister IDX
    (struct mlxsw_sp *mlxsw_sp is freed at this point, call to occ get
     which is using mlxsw_sp would cause use-after free)
    -> mlxsw_sp_init
      -> mlxsw_sp_kvdl_init
        -> mlxsw_sp_kvdl_parts_init
          -> mlxsw_sp_kvdl_part_init
            -> devlink_resource_size_get IDX (to get the current setup
                                              size from devlink)
        -> devlink_resource_occ_get_register IDX (register current
                                                  occupancy getter)

Fixes: d9f9b9a4d05f ("devlink: Add support for resource abstraction")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2:
- rebased on top or netdevsim changes
- improved description
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     | 24 ++-----
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h     |  1 -
 .../net/ethernet/mellanox/mlxsw/spectrum_kvdl.c    | 67 ++++++++++++--------
 drivers/net/netdevsim/devlink.c                    | 65 +++++++++----------
 include/net/devlink.h                              | 40 ++++++++----
 net/core/devlink.c                                 | 74 +++++++++++++++++++---
 6 files changed, 165 insertions(+), 106 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 53fffd09d133..ca38a30fbe91 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3805,18 +3805,6 @@ static const struct mlxsw_config_profile mlxsw_sp_config_profile = {
 	},
 };
 
-static u64 mlxsw_sp_resource_kvd_linear_occ_get(struct devlink *devlink)
-{
-	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
-	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
-
-	return mlxsw_sp_kvdl_occ_get(mlxsw_sp);
-}
-
-static const struct devlink_resource_ops mlxsw_sp_resource_kvd_linear_ops = {
-	.occ_get = mlxsw_sp_resource_kvd_linear_occ_get,
-};
-
 static void
 mlxsw_sp_resource_size_params_prepare(struct mlxsw_core *mlxsw_core,
 				      struct devlink_resource_size_params *kvd_size_params,
@@ -3877,8 +3865,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core)
 	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD,
 					kvd_size, MLXSW_SP_RESOURCE_KVD,
 					DEVLINK_RESOURCE_ID_PARENT_TOP,
-					&kvd_size_params,
-					NULL);
+					&kvd_size_params);
 	if (err)
 		return err;
 
@@ -3887,8 +3874,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core)
 					linear_size,
 					MLXSW_SP_RESOURCE_KVD_LINEAR,
 					MLXSW_SP_RESOURCE_KVD,
-					&linear_size_params,
-					&mlxsw_sp_resource_kvd_linear_ops);
+					&linear_size_params);
 	if (err)
 		return err;
 
@@ -3905,8 +3891,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core)
 					double_size,
 					MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
 					MLXSW_SP_RESOURCE_KVD,
-					&hash_double_size_params,
-					NULL);
+					&hash_double_size_params);
 	if (err)
 		return err;
 
@@ -3915,8 +3900,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core)
 					single_size,
 					MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
 					MLXSW_SP_RESOURCE_KVD,
-					&hash_single_size_params,
-					NULL);
+					&hash_single_size_params);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 82820ba43728..804d4d2c8031 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -442,7 +442,6 @@ void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int entry_index);
 int mlxsw_sp_kvdl_alloc_size_query(struct mlxsw_sp *mlxsw_sp,
 				   unsigned int entry_count,
 				   unsigned int *p_alloc_size);
-u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp);
 int mlxsw_sp_kvdl_resources_register(struct mlxsw_core *mlxsw_core);
 
 struct mlxsw_sp_acl_rule_info {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
index 8796db44dcc3..fe4327f547d2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
@@ -315,8 +315,9 @@ static u64 mlxsw_sp_kvdl_part_occ(struct mlxsw_sp_kvdl_part *part)
 	return occ;
 }
 
-u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp)
+static u64 mlxsw_sp_kvdl_occ_get(void *priv)
 {
+	const struct mlxsw_sp *mlxsw_sp = priv;
 	u64 occ = 0;
 	int i;
 
@@ -326,48 +327,33 @@ u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp)
 	return occ;
 }
 
-static u64 mlxsw_sp_kvdl_single_occ_get(struct devlink *devlink)
+static u64 mlxsw_sp_kvdl_single_occ_get(void *priv)
 {
-	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
-	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	const struct mlxsw_sp *mlxsw_sp = priv;
 	struct mlxsw_sp_kvdl_part *part;
 
 	part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_SINGLE];
 	return mlxsw_sp_kvdl_part_occ(part);
 }
 
-static u64 mlxsw_sp_kvdl_chunks_occ_get(struct devlink *devlink)
+static u64 mlxsw_sp_kvdl_chunks_occ_get(void *priv)
 {
-	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
-	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	const struct mlxsw_sp *mlxsw_sp = priv;
 	struct mlxsw_sp_kvdl_part *part;
 
 	part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_CHUNKS];
 	return mlxsw_sp_kvdl_part_occ(part);
 }
 
-static u64 mlxsw_sp_kvdl_large_chunks_occ_get(struct devlink *devlink)
+static u64 mlxsw_sp_kvdl_large_chunks_occ_get(void *priv)
 {
-	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
-	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	const struct mlxsw_sp *mlxsw_sp = priv;
 	struct mlxsw_sp_kvdl_part *part;
 
 	part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_LARGE_CHUNKS];
 	return mlxsw_sp_kvdl_part_occ(part);
 }
 
-static const struct devlink_resource_ops mlxsw_sp_kvdl_single_ops = {
-	.occ_get = mlxsw_sp_kvdl_single_occ_get,
-};
-
-static const struct devlink_resource_ops mlxsw_sp_kvdl_chunks_ops = {
-	.occ_get = mlxsw_sp_kvdl_chunks_occ_get,
-};
-
-static const struct devlink_resource_ops mlxsw_sp_kvdl_chunks_large_ops = {
-	.occ_get = mlxsw_sp_kvdl_large_chunks_occ_get,
-};
-
 int mlxsw_sp_kvdl_resources_register(struct mlxsw_core *mlxsw_core)
 {
 	struct devlink *devlink = priv_to_devlink(mlxsw_core);
@@ -386,8 +372,7 @@ int mlxsw_sp_kvdl_resources_register(struct mlxsw_core *mlxsw_core)
 					MLXSW_SP_KVDL_SINGLE_SIZE,
 					MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE,
 					MLXSW_SP_RESOURCE_KVD_LINEAR,
-					&size_params,
-					&mlxsw_sp_kvdl_single_ops);
+					&size_params);
 	if (err)
 		return err;
 
@@ -398,8 +383,7 @@ int mlxsw_sp_kvdl_resources_register(struct mlxsw_core *mlxsw_core)
 					MLXSW_SP_KVDL_CHUNKS_SIZE,
 					MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS,
 					MLXSW_SP_RESOURCE_KVD_LINEAR,
-					&size_params,
-					&mlxsw_sp_kvdl_chunks_ops);
+					&size_params);
 	if (err)
 		return err;
 
@@ -410,13 +394,13 @@ int mlxsw_sp_kvdl_resources_register(struct mlxsw_core *mlxsw_core)
 					MLXSW_SP_KVDL_LARGE_CHUNKS_SIZE,
 					MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS,
 					MLXSW_SP_RESOURCE_KVD_LINEAR,
-					&size_params,
-					&mlxsw_sp_kvdl_chunks_large_ops);
+					&size_params);
 	return err;
 }
 
 int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp)
 {
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
 	struct mlxsw_sp_kvdl *kvdl;
 	int err;
 
@@ -429,6 +413,23 @@ int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp)
 	if (err)
 		goto err_kvdl_parts_init;
 
+	devlink_resource_occ_get_register(devlink,
+					  MLXSW_SP_RESOURCE_KVD_LINEAR,
+					  mlxsw_sp_kvdl_occ_get,
+					  mlxsw_sp);
+	devlink_resource_occ_get_register(devlink,
+					  MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE,
+					  mlxsw_sp_kvdl_single_occ_get,
+					  mlxsw_sp);
+	devlink_resource_occ_get_register(devlink,
+					  MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS,
+					  mlxsw_sp_kvdl_chunks_occ_get,
+					  mlxsw_sp);
+	devlink_resource_occ_get_register(devlink,
+					  MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS,
+					  mlxsw_sp_kvdl_large_chunks_occ_get,
+					  mlxsw_sp);
+
 	return 0;
 
 err_kvdl_parts_init:
@@ -438,6 +439,16 @@ int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp)
 
 void mlxsw_sp_kvdl_fini(struct mlxsw_sp *mlxsw_sp)
 {
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+
+	devlink_resource_occ_get_unregister(devlink,
+					    MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS);
+	devlink_resource_occ_get_unregister(devlink,
+					    MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS);
+	devlink_resource_occ_get_unregister(devlink,
+					    MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE);
+	devlink_resource_occ_get_unregister(devlink,
+					    MLXSW_SP_RESOURCE_KVD_LINEAR);
 	mlxsw_sp_kvdl_parts_fini(mlxsw_sp);
 	kfree(mlxsw_sp->kvdl);
 }
diff --git a/drivers/net/netdevsim/devlink.c b/drivers/net/netdevsim/devlink.c
index 1dba47936456..bef7db5d129a 100644
--- a/drivers/net/netdevsim/devlink.c
+++ b/drivers/net/netdevsim/devlink.c
@@ -30,52 +30,36 @@ static struct net *nsim_devlink_net(struct devlink *devlink)
 
 /* IPv4
  */
-static u64 nsim_ipv4_fib_resource_occ_get(struct devlink *devlink)
+static u64 nsim_ipv4_fib_resource_occ_get(void *priv)
 {
-	struct net *net = nsim_devlink_net(devlink);
+	struct net *net = priv;
 
 	return nsim_fib_get_val(net, NSIM_RESOURCE_IPV4_FIB, false);
 }
 
-static struct devlink_resource_ops nsim_ipv4_fib_res_ops = {
-	.occ_get = nsim_ipv4_fib_resource_occ_get,
-};
-
-static u64 nsim_ipv4_fib_rules_res_occ_get(struct devlink *devlink)
+static u64 nsim_ipv4_fib_rules_res_occ_get(void *priv)
 {
-	struct net *net = nsim_devlink_net(devlink);
+	struct net *net = priv;
 
 	return nsim_fib_get_val(net, NSIM_RESOURCE_IPV4_FIB_RULES, false);
 }
 
-static struct devlink_resource_ops nsim_ipv4_fib_rules_res_ops = {
-	.occ_get = nsim_ipv4_fib_rules_res_occ_get,
-};
-
 /* IPv6
  */
-static u64 nsim_ipv6_fib_resource_occ_get(struct devlink *devlink)
+static u64 nsim_ipv6_fib_resource_occ_get(void *priv)
 {
-	struct net *net = nsim_devlink_net(devlink);
+	struct net *net = priv;
 
 	return nsim_fib_get_val(net, NSIM_RESOURCE_IPV6_FIB, false);
 }
 
-static struct devlink_resource_ops nsim_ipv6_fib_res_ops = {
-	.occ_get = nsim_ipv6_fib_resource_occ_get,
-};
-
-static u64 nsim_ipv6_fib_rules_res_occ_get(struct devlink *devlink)
+static u64 nsim_ipv6_fib_rules_res_occ_get(void *priv)
 {
-	struct net *net = nsim_devlink_net(devlink);
+	struct net *net = priv;
 
 	return nsim_fib_get_val(net, NSIM_RESOURCE_IPV6_FIB_RULES, false);
 }
 
-static struct devlink_resource_ops nsim_ipv6_fib_rules_res_ops = {
-	.occ_get = nsim_ipv6_fib_rules_res_occ_get,
-};
-
 static int devlink_resources_register(struct devlink *devlink)
 {
 	struct devlink_resource_size_params params = {
@@ -91,7 +75,7 @@ static int devlink_resources_register(struct devlink *devlink)
 	err = devlink_resource_register(devlink, "IPv4", (u64)-1,
 					NSIM_RESOURCE_IPV4,
 					DEVLINK_RESOURCE_ID_PARENT_TOP,
-					&params, NULL);
+					&params);
 	if (err) {
 		pr_err("Failed to register IPv4 top resource\n");
 		goto out;
@@ -100,8 +84,7 @@ static int devlink_resources_register(struct devlink *devlink)
 	n = nsim_fib_get_val(net, NSIM_RESOURCE_IPV4_FIB, true);
 	err = devlink_resource_register(devlink, "fib", n,
 					NSIM_RESOURCE_IPV4_FIB,
-					NSIM_RESOURCE_IPV4,
-					&params, &nsim_ipv4_fib_res_ops);
+					NSIM_RESOURCE_IPV4, &params);
 	if (err) {
 		pr_err("Failed to register IPv4 FIB resource\n");
 		return err;
@@ -110,8 +93,7 @@ static int devlink_resources_register(struct devlink *devlink)
 	n = nsim_fib_get_val(net, NSIM_RESOURCE_IPV4_FIB_RULES, true);
 	err = devlink_resource_register(devlink, "fib-rules", n,
 					NSIM_RESOURCE_IPV4_FIB_RULES,
-					NSIM_RESOURCE_IPV4,
-					&params, &nsim_ipv4_fib_rules_res_ops);
+					NSIM_RESOURCE_IPV4, &params);
 	if (err) {
 		pr_err("Failed to register IPv4 FIB rules resource\n");
 		return err;
@@ -121,7 +103,7 @@ static int devlink_resources_register(struct devlink *devlink)
 	err = devlink_resource_register(devlink, "IPv6", (u64)-1,
 					NSIM_RESOURCE_IPV6,
 					DEVLINK_RESOURCE_ID_PARENT_TOP,
-					&params, NULL);
+					&params);
 	if (err) {
 		pr_err("Failed to register IPv6 top resource\n");
 		goto out;
@@ -130,8 +112,7 @@ static int devlink_resources_register(struct devlink *devlink)
 	n = nsim_fib_get_val(net, NSIM_RESOURCE_IPV6_FIB, true);
 	err = devlink_resource_register(devlink, "fib", n,
 					NSIM_RESOURCE_IPV6_FIB,
-					NSIM_RESOURCE_IPV6,
-					&params, &nsim_ipv6_fib_res_ops);
+					NSIM_RESOURCE_IPV6, &params);
 	if (err) {
 		pr_err("Failed to register IPv6 FIB resource\n");
 		return err;
@@ -140,12 +121,28 @@ static int devlink_resources_register(struct devlink *devlink)
 	n = nsim_fib_get_val(net, NSIM_RESOURCE_IPV6_FIB_RULES, true);
 	err = devlink_resource_register(devlink, "fib-rules", n,
 					NSIM_RESOURCE_IPV6_FIB_RULES,
-					NSIM_RESOURCE_IPV6,
-					&params, &nsim_ipv6_fib_rules_res_ops);
+					NSIM_RESOURCE_IPV6, &params);
 	if (err) {
 		pr_err("Failed to register IPv6 FIB rules resource\n");
 		return err;
 	}
+
+	devlink_resource_occ_get_register(devlink,
+					  NSIM_RESOURCE_IPV4_FIB,
+					  nsim_ipv4_fib_resource_occ_get,
+					  net);
+	devlink_resource_occ_get_register(devlink,
+					  NSIM_RESOURCE_IPV4_FIB_RULES,
+					  nsim_ipv4_fib_rules_res_occ_get,
+					  net);
+	devlink_resource_occ_get_register(devlink,
+					  NSIM_RESOURCE_IPV6_FIB,
+					  nsim_ipv6_fib_resource_occ_get,
+					  net);
+	devlink_resource_occ_get_register(devlink,
+					  NSIM_RESOURCE_IPV6_FIB_RULES,
+					  nsim_ipv6_fib_rules_res_occ_get,
+					  net);
 out:
 	return err;
 }
diff --git a/include/net/devlink.h b/include/net/devlink.h
index e21d8cadd480..2e4f71e16e95 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -231,14 +231,6 @@ struct devlink_dpipe_headers {
 	unsigned int headers_count;
 };
 
-/**
- * struct devlink_resource_ops - resource ops
- * @occ_get: get the occupied size
- */
-struct devlink_resource_ops {
-	u64 (*occ_get)(struct devlink *devlink);
-};
-
 /**
  * struct devlink_resource_size_params - resource's size parameters
  * @size_min: minimum size which can be set
@@ -265,6 +257,8 @@ devlink_resource_size_params_init(struct devlink_resource_size_params *size_para
 	size_params->unit = unit;
 }
 
+typedef u64 devlink_resource_occ_get_t(void *priv);
+
 /**
  * struct devlink_resource - devlink resource
  * @name: name of the resource
@@ -277,7 +271,6 @@ devlink_resource_size_params_init(struct devlink_resource_size_params *size_para
  * @size_params: size parameters
  * @list: parent list
  * @resource_list: list of child resources
- * @resource_ops: resource ops
  */
 struct devlink_resource {
 	const char *name;
@@ -289,7 +282,8 @@ struct devlink_resource {
 	struct devlink_resource_size_params size_params;
 	struct list_head list;
 	struct list_head resource_list;
-	const struct devlink_resource_ops *resource_ops;
+	devlink_resource_occ_get_t *occ_get;
+	void *occ_get_priv;
 };
 
 #define DEVLINK_RESOURCE_ID_PARENT_TOP 0
@@ -409,8 +403,7 @@ int devlink_resource_register(struct devlink *devlink,
 			      u64 resource_size,
 			      u64 resource_id,
 			      u64 parent_resource_id,
-			      const struct devlink_resource_size_params *size_params,
-			      const struct devlink_resource_ops *resource_ops);
+			      const struct devlink_resource_size_params *size_params);
 void devlink_resources_unregister(struct devlink *devlink,
 				  struct devlink_resource *resource);
 int devlink_resource_size_get(struct devlink *devlink,
@@ -419,6 +412,12 @@ int devlink_resource_size_get(struct devlink *devlink,
 int devlink_dpipe_table_resource_set(struct devlink *devlink,
 				     const char *table_name, u64 resource_id,
 				     u64 resource_units);
+void devlink_resource_occ_get_register(struct devlink *devlink,
+				       u64 resource_id,
+				       devlink_resource_occ_get_t *occ_get,
+				       void *occ_get_priv);
+void devlink_resource_occ_get_unregister(struct devlink *devlink,
+					 u64 resource_id);
 
 #else
 
@@ -562,8 +561,7 @@ devlink_resource_register(struct devlink *devlink,
 			  u64 resource_size,
 			  u64 resource_id,
 			  u64 parent_resource_id,
-			  const struct devlink_resource_size_params *size_params,
-			  const struct devlink_resource_ops *resource_ops)
+			  const struct devlink_resource_size_params *size_params)
 {
 	return 0;
 }
@@ -589,6 +587,20 @@ devlink_dpipe_table_resource_set(struct devlink *devlink,
 	return -EOPNOTSUPP;
 }
 
+static inline void
+devlink_resource_occ_get_register(struct devlink *devlink,
+				  u64 resource_id,
+				  devlink_resource_occ_get_t *occ_get,
+				  void *occ_get_priv)
+{
+}
+
+static inline void
+devlink_resource_occ_get_unregister(struct devlink *devlink,
+				    u64 resource_id)
+{
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 9236e421bd62..ad1317376798 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2405,6 +2405,16 @@ devlink_resource_size_params_put(struct devlink_resource *resource,
 	return 0;
 }
 
+static int devlink_resource_occ_put(struct devlink_resource *resource,
+				    struct sk_buff *skb)
+{
+	if (!resource->occ_get)
+		return 0;
+	return nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
+				 resource->occ_get(resource->occ_get_priv),
+				 DEVLINK_ATTR_PAD);
+}
+
 static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
 				struct devlink_resource *resource)
 {
@@ -2425,11 +2435,8 @@ static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
 	if (resource->size != resource->size_new)
 		nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
 				  resource->size_new, DEVLINK_ATTR_PAD);
-	if (resource->resource_ops && resource->resource_ops->occ_get)
-		if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
-				      resource->resource_ops->occ_get(devlink),
-				      DEVLINK_ATTR_PAD))
-			goto nla_put_failure;
+	if (devlink_resource_occ_put(resource, skb))
+		goto nla_put_failure;
 	if (devlink_resource_size_params_put(resource, skb))
 		goto nla_put_failure;
 	if (list_empty(&resource->resource_list))
@@ -3162,15 +3169,13 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
  *	@resource_id: resource's id
  *	@parent_reosurce_id: resource's parent id
  *	@size params: size parameters
- *	@resource_ops: resource ops
  */
 int devlink_resource_register(struct devlink *devlink,
 			      const char *resource_name,
 			      u64 resource_size,
 			      u64 resource_id,
 			      u64 parent_resource_id,
-			      const struct devlink_resource_size_params *size_params,
-			      const struct devlink_resource_ops *resource_ops)
+			      const struct devlink_resource_size_params *size_params)
 {
 	struct devlink_resource *resource;
 	struct list_head *resource_list;
@@ -3213,7 +3218,6 @@ int devlink_resource_register(struct devlink *devlink,
 	resource->size = resource_size;
 	resource->size_new = resource_size;
 	resource->id = resource_id;
-	resource->resource_ops = resource_ops;
 	resource->size_valid = true;
 	memcpy(&resource->size_params, size_params,
 	       sizeof(resource->size_params));
@@ -3315,6 +3319,58 @@ int devlink_dpipe_table_resource_set(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
 
+/**
+ *	devlink_resource_occ_get_register - register occupancy getter
+ *
+ *	@devlink: devlink
+ *	@resource_id: resource id
+ *	@occ_get: occupancy getter callback
+ *	@occ_get_priv: occupancy getter callback priv
+ */
+void devlink_resource_occ_get_register(struct devlink *devlink,
+				       u64 resource_id,
+				       devlink_resource_occ_get_t *occ_get,
+				       void *occ_get_priv)
+{
+	struct devlink_resource *resource;
+
+	mutex_lock(&devlink->lock);
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (WARN_ON(!resource))
+		goto out;
+	WARN_ON(resource->occ_get);
+
+	resource->occ_get = occ_get;
+	resource->occ_get_priv = occ_get_priv;
+out:
+	mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register);
+
+/**
+ *	devlink_resource_occ_get_unregister - unregister occupancy getter
+ *
+ *	@devlink: devlink
+ *	@resource_id: resource id
+ */
+void devlink_resource_occ_get_unregister(struct devlink *devlink,
+					 u64 resource_id)
+{
+	struct devlink_resource *resource;
+
+	mutex_lock(&devlink->lock);
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (WARN_ON(!resource))
+		goto out;
+	WARN_ON(!resource->occ_get);
+
+	resource->occ_get = NULL;
+	resource->occ_get_priv = NULL;
+out:
+	mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
2.14.3

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox