* [PATCH v2 net-next 3/3] mlxsw: Add extack messages for port_{un,}split failures
From: dsahern @ 2018-06-05 15:14 UTC (permalink / raw)
To: netdev; +Cc: idosch, jiri, jakub.kicinski, David Ahern
In-Reply-To: <20180605151411.20310-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
Return messages in extack for port split/unsplit errors. e.g.,
$ devlink port split swp1s1 count 4
Error: mlxsw_spectrum: Port cannot be split further.
devlink answers: Invalid argument
$ devlink port unsplit swp4
Error: mlxsw_spectrum: Port was not split.
devlink answers: Invalid argument
Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
---
drivers/net/ethernet/mellanox/mlxsw/core.c | 14 ++++++++++----
drivers/net/ethernet/mellanox/mlxsw/core.h | 5 +++--
drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 15 ++++++++++++---
3 files changed, 25 insertions(+), 9 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 7ed38d80bc08..f9c724752a32 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -775,11 +775,14 @@ static int mlxsw_devlink_port_split(struct devlink *devlink,
{
struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
- if (port_index >= mlxsw_core->max_ports)
+ if (port_index >= mlxsw_core->max_ports) {
+ NL_SET_ERR_MSG_MOD(extack, "Port index exceeds maximum number of ports");
return -EINVAL;
+ }
if (!mlxsw_core->driver->port_split)
return -EOPNOTSUPP;
- return mlxsw_core->driver->port_split(mlxsw_core, port_index, count);
+ return mlxsw_core->driver->port_split(mlxsw_core, port_index, count,
+ extack);
}
static int mlxsw_devlink_port_unsplit(struct devlink *devlink,
@@ -788,11 +791,14 @@ static int mlxsw_devlink_port_unsplit(struct devlink *devlink,
{
struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
- if (port_index >= mlxsw_core->max_ports)
+ if (port_index >= mlxsw_core->max_ports) {
+ NL_SET_ERR_MSG_MOD(extack, "Port index exceeds maximum number of ports");
return -EINVAL;
+ }
if (!mlxsw_core->driver->port_unsplit)
return -EOPNOTSUPP;
- return mlxsw_core->driver->port_unsplit(mlxsw_core, port_index);
+ return mlxsw_core->driver->port_unsplit(mlxsw_core, port_index,
+ extack);
}
static int
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 4a8d4c7f89d9..552cfa29c2f7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -274,8 +274,9 @@ struct mlxsw_driver {
int (*port_type_set)(struct mlxsw_core *mlxsw_core, u8 local_port,
enum devlink_port_type new_type);
int (*port_split)(struct mlxsw_core *mlxsw_core, u8 local_port,
- unsigned int count);
- int (*port_unsplit)(struct mlxsw_core *mlxsw_core, u8 local_port);
+ unsigned int count, struct netlink_ext_ack *extack);
+ int (*port_unsplit)(struct mlxsw_core *mlxsw_core, u8 local_port,
+ struct netlink_ext_ack *extack);
int (*sb_pool_get)(struct mlxsw_core *mlxsw_core,
unsigned int sb_index, u16 pool_index,
struct devlink_sb_pool_info *pool_info);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index fc39f22e5c70..968b88af2ef5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3092,7 +3092,8 @@ static void mlxsw_sp_port_unsplit_create(struct mlxsw_sp *mlxsw_sp,
}
static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
- unsigned int count)
+ unsigned int count,
+ struct netlink_ext_ack *extack)
{
struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
struct mlxsw_sp_port *mlxsw_sp_port;
@@ -3104,6 +3105,7 @@ static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
if (!mlxsw_sp_port) {
dev_err(mlxsw_sp->bus_info->dev, "Port number \"%d\" does not exist\n",
local_port);
+ NL_SET_ERR_MSG_MOD(extack, "Port number does not exist");
return -EINVAL;
}
@@ -3112,11 +3114,13 @@ static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
if (count != 2 && count != 4) {
netdev_err(mlxsw_sp_port->dev, "Port can only be split into 2 or 4 ports\n");
+ NL_SET_ERR_MSG_MOD(extack, "Port can only be split into 2 or 4 ports");
return -EINVAL;
}
if (cur_width != MLXSW_PORT_MODULE_MAX_WIDTH) {
netdev_err(mlxsw_sp_port->dev, "Port cannot be split further\n");
+ NL_SET_ERR_MSG_MOD(extack, "Port cannot be split further");
return -EINVAL;
}
@@ -3125,6 +3129,7 @@ static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
base_port = local_port;
if (mlxsw_sp->ports[base_port + 1]) {
netdev_err(mlxsw_sp_port->dev, "Invalid split configuration\n");
+ NL_SET_ERR_MSG_MOD(extack, "Invalid split configuration");
return -EINVAL;
}
} else {
@@ -3132,6 +3137,7 @@ static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
if (mlxsw_sp->ports[base_port + 1] ||
mlxsw_sp->ports[base_port + 3]) {
netdev_err(mlxsw_sp_port->dev, "Invalid split configuration\n");
+ NL_SET_ERR_MSG_MOD(extack, "Invalid split configuration");
return -EINVAL;
}
}
@@ -3153,7 +3159,8 @@ static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
return err;
}
-static int mlxsw_sp_port_unsplit(struct mlxsw_core *mlxsw_core, u8 local_port)
+static int mlxsw_sp_port_unsplit(struct mlxsw_core *mlxsw_core, u8 local_port,
+ struct netlink_ext_ack *extack)
{
struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
struct mlxsw_sp_port *mlxsw_sp_port;
@@ -3165,11 +3172,13 @@ static int mlxsw_sp_port_unsplit(struct mlxsw_core *mlxsw_core, u8 local_port)
if (!mlxsw_sp_port) {
dev_err(mlxsw_sp->bus_info->dev, "Port number \"%d\" does not exist\n",
local_port);
+ NL_SET_ERR_MSG_MOD(extack, "Port number does not exist");
return -EINVAL;
}
if (!mlxsw_sp_port->split) {
- netdev_err(mlxsw_sp_port->dev, "Port wasn't split\n");
+ netdev_err(mlxsw_sp_port->dev, "Port was not split\n");
+ NL_SET_ERR_MSG_MOD(extack, "Port was not split");
return -EINVAL;
}
--
2.11.0
^ permalink raw reply related
* [PATCH v2 net-next 2/3] netdevsim: Add extack error message for devlink reload
From: dsahern @ 2018-06-05 15:14 UTC (permalink / raw)
To: netdev; +Cc: idosch, jiri, jakub.kicinski, David Ahern
In-Reply-To: <20180605151411.20310-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
The devlink reload command can fail if a FIB resource limit is set to a value
lower than the current occupancy. Return a proper message indicating the
reason for the failure.
$ devlink resource sh netdevsim/netdevsim0
netdevsim/netdevsim0:
name IPv4 size unlimited unit entry size_min 0 size_max unlimited size_gran 1 dpipe_tables none
resources:
name fib size unlimited occ 43 unit entry size_min 0 size_max unlimited size_gran 1 dpipe_tables none
name fib-rules size unlimited occ 4 unit entry size_min 0 size_max unlimited size_gran 1 dpipe_tables none
name IPv6 size unlimited unit entry size_min 0 size_max unlimited size_gran 1 dpipe_tables none
resources:
name fib size unlimited occ 54 unit entry size_min 0 size_max unlimited size_gran 1 dpipe_tables none
name fib-rules size unlimited occ 3 unit entry size_min 0 size_max unlimited size_gran 1 dpipe_tables none
$ devlink resource set netdevsim/netdevsim0 path /IPv4/fib size 40
$ devlink dev reload netdevsim/netdevsim0
Error: netdevsim: New size is less than current occupancy.
devlink answers: Invalid argument
Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
drivers/net/netdevsim/devlink.c | 4 ++--
drivers/net/netdevsim/fib.c | 9 ++++++---
drivers/net/netdevsim/netdevsim.h | 3 ++-
3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/drivers/net/netdevsim/devlink.c b/drivers/net/netdevsim/devlink.c
index e8366cf372ff..ba663e5af168 100644
--- a/drivers/net/netdevsim/devlink.c
+++ b/drivers/net/netdevsim/devlink.c
@@ -163,7 +163,7 @@ static int nsim_devlink_reload(struct devlink *devlink,
err = devlink_resource_size_get(devlink, res_ids[i], &val);
if (!err) {
- err = nsim_fib_set_max(net, res_ids[i], val);
+ err = nsim_fib_set_max(net, res_ids[i], val, extack);
if (err)
return err;
}
@@ -181,7 +181,7 @@ static void nsim_devlink_net_reset(struct net *net)
int i;
for (i = 0; i < ARRAY_SIZE(res_ids); ++i) {
- if (nsim_fib_set_max(net, res_ids[i], (u64)-1)) {
+ if (nsim_fib_set_max(net, res_ids[i], (u64)-1, NULL)) {
pr_err("Failed to reset limit for resource %u\n",
res_ids[i]);
}
diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c
index 9bfe9e151e13..f61d094746c0 100644
--- a/drivers/net/netdevsim/fib.c
+++ b/drivers/net/netdevsim/fib.c
@@ -64,7 +64,8 @@ u64 nsim_fib_get_val(struct net *net, enum nsim_resource_id res_id, bool max)
return max ? entry->max : entry->num;
}
-int nsim_fib_set_max(struct net *net, enum nsim_resource_id res_id, u64 val)
+int nsim_fib_set_max(struct net *net, enum nsim_resource_id res_id, u64 val,
+ struct netlink_ext_ack *extack)
{
struct nsim_fib_data *fib_data = net_generic(net, nsim_fib_net_id);
struct nsim_fib_entry *entry;
@@ -90,10 +91,12 @@ int nsim_fib_set_max(struct net *net, enum nsim_resource_id res_id, u64 val)
/* not allowing a new max to be less than curren occupancy
* --> no means of evicting entries
*/
- if (val < entry->num)
+ if (val < entry->num) {
+ NL_SET_ERR_MSG_MOD(extack, "New size is less than current occupancy");
err = -EINVAL;
- else
+ } else {
entry->max = val;
+ }
return err;
}
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index 3a8581af3b85..8ca50b72c328 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -126,7 +126,8 @@ void nsim_devlink_exit(void);
int nsim_fib_init(void);
void nsim_fib_exit(void);
u64 nsim_fib_get_val(struct net *net, enum nsim_resource_id res_id, bool max);
-int nsim_fib_set_max(struct net *net, enum nsim_resource_id res_id, u64 val);
+int nsim_fib_set_max(struct net *net, enum nsim_resource_id res_id, u64 val,
+ struct netlink_ext_ack *extack);
#else
static inline int nsim_devlink_setup(struct netdevsim *ns)
{
--
2.11.0
^ permalink raw reply related
* Re: [PATCH v2 net-next 3/3] mlxsw: Add extack messages for port_{un,}split failures
From: Jiri Pirko @ 2018-06-05 15:15 UTC (permalink / raw)
To: dsahern; +Cc: netdev, idosch, jiri, jakub.kicinski, David Ahern
In-Reply-To: <20180605151411.20310-4-dsahern@kernel.org>
Tue, Jun 05, 2018 at 05:14:11PM CEST, dsahern@kernel.org wrote:
>From: David Ahern <dsahern@gmail.com>
>
>Return messages in extack for port split/unsplit errors. e.g.,
> $ devlink port split swp1s1 count 4
> Error: mlxsw_spectrum: Port cannot be split further.
> devlink answers: Invalid argument
>
> $ devlink port unsplit swp4
> Error: mlxsw_spectrum: Port was not split.
> devlink answers: Invalid argument
>
>Signed-off-by: David Ahern <dsahern@gmail.com>
>Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
^ permalink raw reply
* Re: [RFC PATCH] kcm: hold rx mux lock when updating the receive queue.
From: Paolo Abeni @ 2018-06-05 16:06 UTC (permalink / raw)
To: Tom Herbert, David Miller
Cc: Linux Kernel Network Developers, Tom Herbert, ktkhai
In-Reply-To: <CALx6S353uk_W8b4ic1NYNBS--z41PT6brkwzPvZZj6J2-yEieg@mail.gmail.com>
Hi,
On Tue, 2018-06-05 at 08:35 -0700, Tom Herbert wrote:
> On Tue, Jun 5, 2018 at 7:53 AM, David Miller <davem@davemloft.net> wrote:
> > From: Paolo Abeni <pabeni@redhat.com>
> > Date: Tue, 5 Jun 2018 12:32:33 +0200
> >
> >> @@ -1157,7 +1158,9 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
> >> /* Finished with message */
> >> msg->msg_flags |= MSG_EOR;
> >> KCM_STATS_INCR(kcm->stats.rx_msgs);
> >> + spin_lock_bh(&kcm->mux->rx_lock);
> >> skb_unlink(skb, &sk->sk_receive_queue);
> >> + spin_unlock_bh(&kcm->mux->rx_lock);
> >
> > Hmmm, maybe I don't understand the corruption.
> >
> > But, skb_unlink() takes the sk->sk_receive_queue.lock which should
> > prevent SKB list corruption.
>
> It looks like there is a case where the list is being manipulated
> without the queue lock. That is in requeue_rx_msgs where
> __skb_dequeue is being called instead of skb_dequeue which is in
> requeue_rx_msgs. requeue_rx_msgs holds the mux rx_lock which would
> explain why the suggested patch avoids the issue.
Yep, I believe this is the correct explanation. Sorry for the noise with
the previous patch, I overlooked the skb_queue lock already in place.
> Paolo, thanks for looking into this! Can you try replacing
> __skb_dequeue in requeue_rx_msgs with skb_dequeue to see if that is
> the fix.
Sure, I'll retrigger the test, and report the result here (or directly
a new patch, should the test be successful).
Thanks,
Paolo
^ permalink raw reply
* Re: suspicius csum initialization in vmxnet3_rx_csum
From: Paolo Abeni @ 2018-06-05 16:08 UTC (permalink / raw)
To: Ronak Doshi, Neil Horman; +Cc: Guolin Yang, Boon Ang, Louis Luo, netdev
In-Reply-To: <alpine.OSX.2.21.1806011100140.27872@doshir-m01.vmware.com>
Hi,
I'm sorry for the long delay in my answer, I've been travelling.
On Fri, 2018-06-01 at 11:10 -0700, Ronak Doshi wrote:
> On Thu, 31 May 2018, Neil Horman wrote:
> > What packet types will rcd.csum be set for?
> > Neil
>
> I looked through the emulation code and found that rcd.csum is not set.
> For valid v4/v6, TCP/UDP packets the code block above the mentioned "if"
> block will be executed or else it will go through checksum none.
>
> That's why I wanted to know (in previous emails) which ESX build is being
> used while this was tested. The code block under "if (gdesc->rcd.csum)"
> block might seem incorrect but it shouldn't be hit as rcd.csum is not set.
I'm unsure if I read the above correctly. Do you mean that the relevant
code-path is never hit? If so, can we simply drop it, as we agreed that
such code is incorrect? Otherwise, could you please specify under which
circumstances gdesc->rcd.csum is filled by the hypervisor?
> Hence, I asked did the fix provided by Paolo worked for the icmp test?
Unfortunately, so far I've not been able to reproduce the issue outside
a production environment and I can't run a test kernel there.
Thanks,
Paolo
^ permalink raw reply
* [PATCH net-next] rtnetlink: validate attributes in do_setlink()
From: Eric Dumazet @ 2018-06-05 16:25 UTC (permalink / raw)
To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet, Dmitry Vyukov
It seems that rtnl_group_changelink() can call do_setlink
while a prior call to validate_linkmsg(dev = NULL, ...) could
not validate IFLA_ADDRESS / IFLA_BROADCAST
Make sure do_setlink() calls validate_linkmsg() instead
of letting its callers have this responsibility.
With help from Dmitry Vyukov, thanks a lot !
BUG: KMSAN: uninit-value in is_valid_ether_addr include/linux/etherdevice.h:199 [inline]
BUG: KMSAN: uninit-value in eth_prepare_mac_addr_change net/ethernet/eth.c:275 [inline]
BUG: KMSAN: uninit-value in eth_mac_addr+0x203/0x2b0 net/ethernet/eth.c:308
CPU: 1 PID: 8695 Comm: syz-executor3 Not tainted 4.17.0-rc5+ #103
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x185/0x1d0 lib/dump_stack.c:113
kmsan_report+0x149/0x260 mm/kmsan/kmsan.c:1084
__msan_warning_32+0x6e/0xc0 mm/kmsan/kmsan_instr.c:686
is_valid_ether_addr include/linux/etherdevice.h:199 [inline]
eth_prepare_mac_addr_change net/ethernet/eth.c:275 [inline]
eth_mac_addr+0x203/0x2b0 net/ethernet/eth.c:308
dev_set_mac_address+0x261/0x530 net/core/dev.c:7157
do_setlink+0xbc3/0x5fc0 net/core/rtnetlink.c:2317
rtnl_group_changelink net/core/rtnetlink.c:2824 [inline]
rtnl_newlink+0x1fe9/0x37a0 net/core/rtnetlink.c:2976
rtnetlink_rcv_msg+0xa32/0x1560 net/core/rtnetlink.c:4646
netlink_rcv_skb+0x378/0x600 net/netlink/af_netlink.c:2448
rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4664
netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
netlink_unicast+0x1678/0x1750 net/netlink/af_netlink.c:1336
netlink_sendmsg+0x104f/0x1350 net/netlink/af_netlink.c:1901
sock_sendmsg_nosec net/socket.c:629 [inline]
sock_sendmsg net/socket.c:639 [inline]
___sys_sendmsg+0xec0/0x1310 net/socket.c:2117
__sys_sendmsg net/socket.c:2155 [inline]
__do_sys_sendmsg net/socket.c:2164 [inline]
__se_sys_sendmsg net/socket.c:2162 [inline]
__x64_sys_sendmsg+0x331/0x460 net/socket.c:2162
do_syscall_64+0x152/0x230 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x455a09
RSP: 002b:00007fc07480ec68 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00007fc07480f6d4 RCX: 0000000000455a09
RDX: 0000000000000000 RSI: 00000000200003c0 RDI: 0000000000000014
RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
R13: 00000000000005d0 R14: 00000000006fdc20 R15: 0000000000000000
Uninit was stored to memory at:
kmsan_save_stack_with_flags mm/kmsan/kmsan.c:279 [inline]
kmsan_save_stack mm/kmsan/kmsan.c:294 [inline]
kmsan_internal_chain_origin+0x12b/0x210 mm/kmsan/kmsan.c:685
kmsan_memcpy_origins+0x11d/0x170 mm/kmsan/kmsan.c:527
__msan_memcpy+0x109/0x160 mm/kmsan/kmsan_instr.c:478
do_setlink+0xb84/0x5fc0 net/core/rtnetlink.c:2315
rtnl_group_changelink net/core/rtnetlink.c:2824 [inline]
rtnl_newlink+0x1fe9/0x37a0 net/core/rtnetlink.c:2976
rtnetlink_rcv_msg+0xa32/0x1560 net/core/rtnetlink.c:4646
netlink_rcv_skb+0x378/0x600 net/netlink/af_netlink.c:2448
rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4664
netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
netlink_unicast+0x1678/0x1750 net/netlink/af_netlink.c:1336
netlink_sendmsg+0x104f/0x1350 net/netlink/af_netlink.c:1901
sock_sendmsg_nosec net/socket.c:629 [inline]
sock_sendmsg net/socket.c:639 [inline]
___sys_sendmsg+0xec0/0x1310 net/socket.c:2117
__sys_sendmsg net/socket.c:2155 [inline]
__do_sys_sendmsg net/socket.c:2164 [inline]
__se_sys_sendmsg net/socket.c:2162 [inline]
__x64_sys_sendmsg+0x331/0x460 net/socket.c:2162
do_syscall_64+0x152/0x230 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Uninit was created at:
kmsan_save_stack_with_flags mm/kmsan/kmsan.c:279 [inline]
kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:189
kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:315
kmsan_slab_alloc+0x10/0x20 mm/kmsan/kmsan.c:322
slab_post_alloc_hook mm/slab.h:446 [inline]
slab_alloc_node mm/slub.c:2753 [inline]
__kmalloc_node_track_caller+0xb32/0x11b0 mm/slub.c:4395
__kmalloc_reserve net/core/skbuff.c:138 [inline]
__alloc_skb+0x2cb/0x9e0 net/core/skbuff.c:206
alloc_skb include/linux/skbuff.h:988 [inline]
netlink_alloc_large_skb net/netlink/af_netlink.c:1182 [inline]
netlink_sendmsg+0x76e/0x1350 net/netlink/af_netlink.c:1876
sock_sendmsg_nosec net/socket.c:629 [inline]
sock_sendmsg net/socket.c:639 [inline]
___sys_sendmsg+0xec0/0x1310 net/socket.c:2117
__sys_sendmsg net/socket.c:2155 [inline]
__do_sys_sendmsg net/socket.c:2164 [inline]
__se_sys_sendmsg net/socket.c:2162 [inline]
__x64_sys_sendmsg+0x331/0x460 net/socket.c:2162
do_syscall_64+0x152/0x230 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Fixes: e7ed828f10bd ("netlink: support setting devgroup parameters")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
---
net/core/rtnetlink.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9a1ba2015ad8901680cfc58f95cf6ec525413566..5ef61222fdef1f305909eeca6ac278bcac88e1b0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2266,6 +2266,10 @@ static int do_setlink(const struct sk_buff *skb,
const struct net_device_ops *ops = dev->netdev_ops;
int err;
+ err = validate_linkmsg(dev, tb);
+ if (err < 0)
+ return err;
+
if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {
struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
tb, CAP_NET_ADMIN);
@@ -2629,10 +2633,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- err = validate_linkmsg(dev, tb);
- if (err < 0)
- goto errout;
-
err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0);
errout:
return err;
--
2.17.1.1185.g55be947832-goog
^ permalink raw reply related
* Re: [PATCH net v2 1/2] ip6mr: only set ip6mr_table from setsockopt when ip6mr_new_table succeeds
From: David Miller @ 2018-06-05 16:30 UTC (permalink / raw)
To: sd; +Cc: netdev, edumazet, nikolay, yuvalm, ivecera
In-Reply-To: <604985fb55d51eef9130bff0640a62d5015f25bd.1528194845.git.sd@queasysnail.net>
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 5 Jun 2018 15:01:59 +0200
> Currently, raw6_sk(sk)->ip6mr_table is set unconditionally during
> ip6_mroute_setsockopt(MRT6_TABLE). A subsequent attempt at the same
> setsockopt will fail with -ENOENT, since we haven't actually created
> that table.
>
> A similar fix for ipv4 was included in commit 5e1859fbcc3c ("ipv4: ipmr:
> various fixes and cleanups").
>
> Fixes: d1db275dd3f6 ("ipv6: ip6mr: support multiple tables")
> Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Applied and queued up for -stable.
^ permalink raw reply
* Re: [PATCH net v2 2/2] ipmr: fix error path when ipmr_new_table fails
From: David Miller @ 2018-06-05 16:31 UTC (permalink / raw)
To: sd; +Cc: netdev, edumazet, nikolay, yuvalm, ivecera
In-Reply-To: <572e1baf89c76fafb45a97a724c3e838e5dd4abf.1528194845.git.sd@queasysnail.net>
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 5 Jun 2018 15:02:00 +0200
> commit 0bbbf0e7d0e7 ("ipmr, ip6mr: Unite creation of new mr_table")
> refactored ipmr_new_table, so that it now returns NULL when
> mr_table_alloc fails. Unfortunately, all callers of ipmr_new_table
> expect an ERR_PTR.
>
> This can result in NULL deref, for example when ipmr_rules_exit calls
> ipmr_free_table with NULL net->ipv4.mrt in the
> !CONFIG_IP_MROUTE_MULTIPLE_TABLES version.
>
> This patch makes mr_table_alloc return errors, and changes
> ip6mr_new_table and its callers to return/expect error pointers as
> well. It also removes the version of mr_table_alloc defined under
> !CONFIG_IP_MROUTE_COMMON, since it is never used.
>
> Fixes: 0bbbf0e7d0e7 ("ipmr, ip6mr: Unite creation of new mr_table")
> Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
> ---
> v2: - fixed brainfart that shadowed mrt variable in ip6_mroute_setsockopt
> - rebased on top of ip6_mroute_setsockopt fix
Applied and queued up for -stable.
^ permalink raw reply
* Re: [PATCH v2 net-next] net: metrics: add proper netlink validation
From: David Miller @ 2018-06-05 16:30 UTC (permalink / raw)
To: edumazet; +Cc: netdev, eric.dumazet, dsahern
In-Reply-To: <20180605130619.150153-1-edumazet@google.com>
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Jun 2018 06:06:19 -0700
> Before using nla_get_u32(), better make sure the attribute
> is of the proper size.
>
> Code recently was changed, but bug has been there from beginning
> of git.
...
> Fixes: a919525ad832 ("net: Move fib_convert_metrics to metrics file")
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-by: syzbot <syzkaller@googlegroups.com>
> Cc: David Ahern <dsahern@gmail.com>
> ---
> v2: fixed a typo.
Applied and queued up for -stable, thanks Eric.
^ permalink raw reply
* Re: [PATCH v2 net-next 0/3] devlink: Add extack messages for reload and port split/unsplit
From: David Miller @ 2018-06-05 16:32 UTC (permalink / raw)
To: dsahern; +Cc: netdev, idosch, jiri, jakub.kicinski, dsahern
In-Reply-To: <20180605151411.20310-1-dsahern@kernel.org>
From: dsahern@kernel.org
Date: Tue, 5 Jun 2018 08:14:08 -0700
> From: David Ahern <dsahern@gmail.com>
>
> Patch 1 adds extack arg to reload, port_split and port_unsplit devlink
> operations.
>
> Patch 2 adds extack messages for reload operation in netdevsim.
>
> Patch 3 adds extack messages to port split/unsplit in mlxsw driver.
>
> v2
> - make the extack messages align with existing dev_err
Series applied, thanks David.
^ permalink raw reply
* Re: [net-next PATCH v3 0/5] Symmetric queue selection using XPS for Rx queues
From: David Miller @ 2018-06-05 16:33 UTC (permalink / raw)
To: amritha.nambiar
Cc: netdev, alexander.h.duyck, willemdebruijn.kernel,
sridhar.samudrala, edumazet, hannes, tom
In-Reply-To: <152818727065.20862.10108275498797168689.stgit@anamdev.jf.intel.com>
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Tue, 05 Jun 2018 01:37:45 -0700
> This patch series implements support for Tx queue selection based on
> Rx queue(s) map. This is done by configuring Rx queue(s) map per Tx-queue
> using sysfs attribute. If the user configuration for Rx queues does
> not apply, then the Tx queue selection falls back to XPS using CPUs and
> finally to hashing.
>
> XPS is refactored to support Tx queue selection based on either the
> CPUs map or the Rx-queues map. The config option CONFIG_XPS needs to be
> enabled. By default no receive queues are configured for the Tx queue.
>
> - /sys/class/net/<dev>/queues/tx-*/xps_rxqs
>
> A set of receive queues can be mapped to a set of transmit queues (many:many),
> although the common use case is a 1:1 mapping. This will enable sending
> packets on the same Tx-Rx queue pair as this is useful for busy polling
> multi-threaded workloads where it is not possible to pin the threads to
> a CPU. This is a rework of Sridhar's patch for symmetric queueing via
> socket option:
> https://www.spinics.net/lists/netdev/msg453106.html
...
Thanks for doing this work.
I think this needs more time to sit and get reviews, and therefore needs
to be deferred to the next merge window.
^ permalink raw reply
* Re: [PATCH net] l2tp: fix refcount leakage on PPPoL2TP sockets
From: Guillaume Nault @ 2018-06-05 16:37 UTC (permalink / raw)
To: David Miller; +Cc: netdev, jchapman
In-Reply-To: <20180605.094124.1023096251110931871.davem@davemloft.net>
On Tue, Jun 05, 2018 at 09:41:24AM -0400, David Miller wrote:
> From: Guillaume Nault <g.nault@alphalink.fr>
> Date: Mon, 4 Jun 2018 18:52:19 +0200
>
> > Commit d02ba2a6110c ("l2tp: fix race in pppol2tp_release with session
> > object destroy") tried to fix a race condition where a PPPoL2TP socket
> > would disappear while the L2TP session was still using it. However, it
> > missed the root issue which is that an L2TP session may accept to be
> > reconnected if its associated socket has entered the release process.
> >
> > The tentative fix makes the session hold the socket it is connected to.
> > That saves the kernel from crashing, but introduces refcount leakage,
> > preventing the socket from completing the release process. Once stalled,
> > everything the socket depends on can't be released anymore, including
> > the L2TP session and the l2tp_ppp module.
> ...
> > So it all boils down to pppol2tp_connect() failing to realise that the
> > session has already been connected. This patch drops the unneeded extra
> > reference counting (mostly reverting d02ba2a6110c) and checks that
> > neither ->sk nor ->__sk is set before allowing a session to be
> > connected.
> >
> > Fixes: d02ba2a6110c ("l2tp: fix race in pppol2tp_release with session object destroy")
> > Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
>
> So much fidgeting around in this area over the past year or two :-)
>
Putting L2TP into production without adding custom workarounds has been
such a long journey, but we're almost there :-)
> Applied and queued up for -stable, thanks for fixing this.
>
I still have a handful of issues to fix, though.
^ permalink raw reply
* pull-request: bpf-next 2018-06-05
From: Daniel Borkmann @ 2018-06-05 16:39 UTC (permalink / raw)
To: davem; +Cc: daniel, ast, netdev
Hi David,
The following pull-request contains BPF updates for your *net-next* tree.
The main changes are:
1) Add a new BPF hook for sendmsg similar to existing hooks for bind and
connect: "This allows to override source IP (including the case when it's
set via cmsg(3)) and destination IP:port for unconnected UDP (slow path).
TCP and connected UDP (fast path) are not affected. This makes UDP support
complete, that is, connected UDP is handled by connect hooks, unconnected
by sendmsg ones.", from Andrey.
2) Rework of the AF_XDP API to allow extending it in future for type writer
model if necessary. In this mode a memory window is passed to hardware
and multiple frames might be filled into that window instead of just one
that is the case in the current fixed frame-size model. With the new
changes made this can be supported without having to add a new descriptor
format. Also, core bits for the zero-copy support for AF_XDP have been
merged as agreed upon, where i40e bits will be routed via Jeff later on.
Various improvements to documentation and sample programs included as
well, all from Björn and Magnus.
3) Given BPF's flexibility, a new program type has been added to implement
infrared decoders. Quote: "The kernel IR decoders support the most
widely used IR protocols, but there are many protocols which are not
supported. [...] There is a 'long tail' of unsupported IR protocols,
for which lircd is needed to decode the IR. IR encoding is done in such
a way that some simple circuit can decode it; therefore, BPF is ideal.
[...] user-space can define a decoder in BPF, attach it to the rc
device through the lirc chardev.", from Sean.
4) Several improvements and fixes to BPF core, among others, dumping map
and prog IDs into fdinfo which is a straight forward way to correlate
BPF objects used by applications, removing an indirect call and therefore
retpoline in all map lookup/update/delete calls by invoking the callback
directly for 64 bit archs, adding a new bpf_skb_cgroup_id() BPF helper
for tc BPF programs to have an efficient way of looking up cgroup v2 id
for policy or other use cases. Fixes to make sure we zero tunnel/xfrm
state that hasn't been filled, to allow context access wrt pt_regs in
32 bit archs for tracing, and last but not least various test cases
for fixes that landed in bpf earlier, from Daniel.
5) Get rid of the ndo_xdp_flush API and extend the ndo_xdp_xmit with
a XDP_XMIT_FLUSH flag instead which allows to avoid one indirect
call as flushing is now merged directly into ndo_xdp_xmit(), from Jesper.
6) Add a new bpf_get_current_cgroup_id() helper that can be used in
tracing to retrieve the cgroup id from the current process in order
to allow for e.g. aggregation of container-level events, from Yonghong.
7) Two follow-up fixes for BTF to reject invalid input values and
related to that also two test cases for BPF kselftests, from Martin.
8) Various API improvements to the bpf_fib_lookup() helper, that is,
dropping MPLS bits which are not fully hashed out yet, rejecting
invalid helper flags, returning error for unsupported address
families as well as renaming flowlabel to flowinfo, from David.
9) Various fixes and improvements to sockmap BPF kselftests in particular
in proper error detection and data verification, from Prashant.
10) Two arm32 BPF JIT improvements. One is to fix imm range check with
regards to whether immediate fits into 24 bits, and a naming cleanup
to get functions related to rsh handling consistent to those handling
lsh, from Wang.
11) Two compile warning fixes in BPF, one for BTF and a false positive
to silence gcc in stack_map_get_build_id_offset(), from Arnd.
12) Add missing seg6.h header into tools include infrastructure in order
to fix compilation of BPF kselftests, from Mathieu.
13) Several formatting cleanups in the BPF UAPI helper description that
also fix an error during rst2man compilation, from Quentin.
14) Hide an unused variable in sk_msg_convert_ctx_access() when IPv6 is
not built into the kernel, from Yue.
15) Remove a useless double assignment in dev_map_enqueue(), from Colin.
Please consider pulling these changes from:
git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
Thanks a lot!
----------------------------------------------------------------
The following changes since commit 5b79c2af667c0e2684f2a6dbf6439074b78f490c:
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2018-05-26 19:46:15 -0400)
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
for you to fetch changes up to 9fa06104a235f64d6a2bf3012cc9966e8e4be5eb:
Merge branch 'bpf-af-xdp-zc-api' (2018-06-05 15:58:07 +0200)
----------------------------------------------------------------
Alexei Starovoitov (4):
Merge branch 'btf-fixes'
Merge branch 'misc-BPF-improvements'
Merge branch 'ndo_xdp_xmit-cleanup'
Merge branch 'bpf_get_current_cgroup_id'
Andrey Ignatov (8):
libbpf: Install btf.h with libbpf
bpf: Define cgroup_bpf_enabled for CONFIG_CGROUP_BPF=n
bpf: Hooks for sys_sendmsg
bpf: Sync bpf.h to tools/
libbpf: Support guessing sendmsg{4,6} progs
selftests/bpf: Prepare test_sock_addr for extension
selftests/bpf: Selftest for sys_sendmsg hooks
bpftool: Support sendmsg{4,6} attach types
Arnd Bergmann (2):
bpf: btf: avoid -Wreturn-type warning
bpf: avoid -Wmaybe-uninitialized warning
Björn Töpel (10):
xsk: proper fill queue descriptor validation
xsk: proper Rx drop statistics update
xsk: new descriptor addressing scheme
samples/bpf: adapted to new uapi
xsk: moved struct xdp_umem definition
xsk: introduce xdp_umem_page
net: xdp: added bpf_netdev_command XDP_{QUERY, SETUP}_XSK_UMEM
xdp: add MEM_TYPE_ZERO_COPY
xsk: add zero-copy support for Rx
samples/bpf: xdpsock: use skb Tx path for XDP_SKB
Colin Ian King (1):
bpf: devmap: remove redundant assignment of dev = dev
Daniel Borkmann (17):
Merge branch 'bpf-sendmsg-hook'
Merge branch 'bpf-ir-decoder'
Merge branch 'bpf-sockmap-test-fixes'
bpf: test case for map pointer poison with calls/branches
bpf: add also cbpf long jump test cases with heavy expansion
bpf: fixup error message from gpl helpers on license mismatch
bpf: show prog and map id in fdinfo
bpf: avoid retpoline for lookup/update/delete calls on maps
bpf: add bpf_skb_cgroup_id helper
bpf: make sure to clear unused fields in tunnel/xfrm state fetch
bpf: fix cbpf parser bug for octal numbers
bpf: fix context access in tracing progs on 32 bit archs
bpf: sync bpf uapi header with tools
bpf, doc: add missing patchwork url and libbpf to maintainers
Merge branch 'bpf-af-xdp-fixes'
Merge branch 'bpf-xdp-remove-xdp-flush'
Merge branch 'bpf-af-xdp-zc-api'
David Ahern (4):
bpf: Drop mpls from bpf_fib_lookup
bpf: Verify flags in bpf_fib_lookup
bpf: Change bpf_fib_lookup to return -EAFNOSUPPORT for unsupported address families
bpf: flowlabel in bpf_fib_lookup should be flowinfo
Jesper Dangaard Brouer (13):
xdp: add flags argument to ndo_xdp_xmit API
i40e: implement flush flag for ndo_xdp_xmit
ixgbe: implement flush flag for ndo_xdp_xmit
tun: implement flush flag for ndo_xdp_xmit
virtio_net: implement flush flag for ndo_xdp_xmit
xdp: done implementing ndo_xdp_xmit flush flag for all drivers
bpf/xdp: non-map redirect can avoid calling ndo_xdp_flush
bpf/xdp: devmap can avoid calling ndo_xdp_flush
i40e: remove ndo_xdp_flush call i40e_xdp_flush
ixgbe: remove ndo_xdp_flush call ixgbe_xdp_flush
virtio_net: remove ndo_xdp_flush call virtnet_xdp_flush
tun: remove ndo_xdp_flush call tun_xdp_flush
net: remove net_device operation ndo_xdp_flush
Magnus Karlsson (3):
samples/bpf: minor *_nb_free performance fix
net: added netdevice operation for Tx
xsk: wire upp Tx zero-copy functions
Martin KaFai Lau (2):
bpf: btf: Check array t->size
bpf: btf: Ensure t->type == 0 for BTF_KIND_FWD
Mathieu Xhonneux (1):
selftests/bpf: missing headers test_lwt_seg6local
Prashant Bhole (5):
selftests/bpf: test_sockmap, check test failure
selftests/bpf: test_sockmap, join cgroup in selftest mode
selftests/bpf: test_sockmap, timing improvements
selftests/bpf: test_sockmap, fix data verification
selftests/bpf: test_sockmap, print additional test options
Quentin Monnet (1):
bpf: clean up eBPF helpers documentation
Sean Young (3):
bpf: bpf_prog_array_copy() should return -ENOENT if exclude_prog not found
media: rc: introduce BPF_PROG_LIRC_MODE2
bpf: add selftest for lirc_mode2 type program
Wang YanQing (2):
bpf, arm32: correct check_imm24
bpf, arm32: fix inconsistent naming about emit_a32_lsr_{r64,i64}
Yonghong Song (4):
bpf: implement bpf_get_current_cgroup_id() helper
tools/bpf: sync uapi bpf.h for bpf_get_current_cgroup_id() helper
tools/bpf: add a selftest for bpf_get_current_cgroup_id() helper
bpf: guard bpf_get_current_cgroup_id() with CONFIG_CGROUPS
YueHaibing (1):
bpf: hide the unused 'off' variable
Documentation/networking/af_xdp.rst | 101 +-
MAINTAINERS | 2 +
arch/arm/net/bpf_jit_32.c | 16 +-
drivers/media/rc/Kconfig | 13 +
drivers/media/rc/Makefile | 1 +
drivers/media/rc/bpf-lirc.c | 313 ++++++
drivers/media/rc/lirc_dev.c | 30 +
drivers/media/rc/rc-core-priv.h | 21 +
drivers/media/rc/rc-ir-raw.c | 12 +-
drivers/net/ethernet/intel/i40e/i40e_main.c | 1 -
drivers/net/ethernet/intel/i40e/i40e_txrx.c | 33 +-
drivers/net/ethernet/intel/i40e/i40e_txrx.h | 4 +-
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 42 +-
drivers/net/tun.c | 44 +-
drivers/net/virtio_net.c | 22 +-
include/linux/bpf-cgroup.h | 24 +-
include/linux/bpf.h | 1 +
include/linux/bpf_lirc.h | 29 +
include/linux/bpf_types.h | 3 +
include/linux/filter.h | 44 +-
include/linux/netdevice.h | 21 +-
include/net/xdp.h | 14 +
include/net/xdp_sock.h | 44 +-
include/uapi/linux/bpf.h | 136 ++-
include/uapi/linux/if_xdp.h | 16 +-
kernel/bpf/btf.c | 28 +-
kernel/bpf/cgroup.c | 11 +-
kernel/bpf/core.c | 12 +-
kernel/bpf/devmap.c | 21 +-
kernel/bpf/hashtab.c | 12 +-
kernel/bpf/helpers.c | 15 +
kernel/bpf/stackmap.c | 7 +-
kernel/bpf/syscall.c | 27 +-
kernel/bpf/verifier.c | 73 +-
kernel/trace/bpf_trace.c | 16 +-
lib/test_bpf.c | 63 ++
net/core/filter.c | 91 +-
net/core/xdp.c | 19 +-
net/ipv4/udp.c | 20 +-
net/ipv6/udp.c | 24 +
net/xdp/xdp_umem.c | 151 ++-
net/xdp/xdp_umem.h | 45 +-
net/xdp/xdp_umem_props.h | 4 +-
net/xdp/xsk.c | 199 +++-
net/xdp/xsk_queue.c | 2 +-
net/xdp/xsk_queue.h | 98 +-
samples/bpf/xdp_fwd_kern.c | 2 +-
samples/bpf/xdpsock_user.c | 97 +-
tools/bpf/bpf_exp.l | 2 +-
tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 9 +-
tools/bpf/bpftool/bash-completion/bpftool | 5 +-
tools/bpf/bpftool/cgroup.c | 4 +-
tools/bpf/bpftool/prog.c | 1 +
tools/include/linux/filter.h | 10 +
tools/include/uapi/linux/bpf.h | 134 ++-
tools/include/uapi/linux/lirc.h | 217 ++++
tools/include/uapi/linux/seg6.h | 55 +
tools/include/uapi/linux/seg6_local.h | 80 ++
tools/lib/bpf/Makefile | 1 +
tools/lib/bpf/libbpf.c | 3 +
tools/testing/selftests/bpf/.gitignore | 2 +
tools/testing/selftests/bpf/Makefile | 9 +-
tools/testing/selftests/bpf/bpf_helpers.h | 7 +
tools/testing/selftests/bpf/cgroup_helpers.c | 57 +
tools/testing/selftests/bpf/cgroup_helpers.h | 1 +
tools/testing/selftests/bpf/get_cgroup_id_kern.c | 28 +
tools/testing/selftests/bpf/get_cgroup_id_user.c | 141 +++
tools/testing/selftests/bpf/sendmsg4_prog.c | 49 +
tools/testing/selftests/bpf/sendmsg6_prog.c | 60 +
tools/testing/selftests/bpf/test_btf.c | 45 +
tools/testing/selftests/bpf/test_lirc_mode2.sh | 28 +
tools/testing/selftests/bpf/test_lirc_mode2_kern.c | 23 +
tools/testing/selftests/bpf/test_lirc_mode2_user.c | 149 +++
tools/testing/selftests/bpf/test_sock_addr.c | 1155 ++++++++++++++++----
tools/testing/selftests/bpf/test_sockmap.c | 87 +-
tools/testing/selftests/bpf/test_verifier.c | 185 +++-
76 files changed, 3841 insertions(+), 730 deletions(-)
create mode 100644 drivers/media/rc/bpf-lirc.c
create mode 100644 include/linux/bpf_lirc.h
create mode 100644 tools/include/uapi/linux/lirc.h
create mode 100644 tools/include/uapi/linux/seg6.h
create mode 100644 tools/include/uapi/linux/seg6_local.h
create mode 100644 tools/testing/selftests/bpf/get_cgroup_id_kern.c
create mode 100644 tools/testing/selftests/bpf/get_cgroup_id_user.c
create mode 100644 tools/testing/selftests/bpf/sendmsg4_prog.c
create mode 100644 tools/testing/selftests/bpf/sendmsg6_prog.c
create mode 100755 tools/testing/selftests/bpf/test_lirc_mode2.sh
create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_kern.c
create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_user.c
^ permalink raw reply
* Re: [PATCH net-next] rtnetlink: validate attributes in do_setlink()
From: David Miller @ 2018-06-05 16:41 UTC (permalink / raw)
To: edumazet; +Cc: netdev, eric.dumazet, dvyukov
In-Reply-To: <20180605162519.230428-1-edumazet@google.com>
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Jun 2018 09:25:19 -0700
> It seems that rtnl_group_changelink() can call do_setlink
> while a prior call to validate_linkmsg(dev = NULL, ...) could
> not validate IFLA_ADDRESS / IFLA_BROADCAST
>
> Make sure do_setlink() calls validate_linkmsg() instead
> of letting its callers having this responsibility.
But now rtnl_newlink() will validate_linkmsg() twice....
^ permalink raw reply
* Re: [PATCH net-next] rtnetlink: validate attributes in do_setlink()
From: Eric Dumazet @ 2018-06-05 16:42 UTC (permalink / raw)
To: David Miller, edumazet; +Cc: netdev, eric.dumazet, dvyukov
In-Reply-To: <20180605.124103.1922429680259846762.davem@davemloft.net>
On 06/05/2018 09:41 AM, David Miller wrote:
> From: Eric Dumazet <edumazet@google.com>
> Date: Tue, 5 Jun 2018 09:25:19 -0700
>
>> It seems that rtnl_group_changelink() can call do_setlink
>> while a prior call to validate_linkmsg(dev = NULL, ...) could
>> not validate IFLA_ADDRESS / IFLA_BROADCAST
>>
>> Make sure do_setlink() calls validate_linkmsg() instead
>> of letting its callers having this responsibility.
>
> But now rtnl_newlink() will validate_linkmsg() twice....
>
Yes, is it a problem ? That is hardly fast path :)
^ permalink raw reply
* Re: [Patch net-next] netdev-FAQ: clarify DaveM's position for stable backports
From: Cong Wang @ 2018-06-05 16:44 UTC (permalink / raw)
To: David Miller; +Cc: Linux Kernel Network Developers, stable, Greg KH
In-Reply-To: <20180605.094347.1260769683352764390.davem@davemloft.net>
On Tue, Jun 5, 2018 at 6:43 AM, David Miller <davem@davemloft.net> wrote:
> From: Cong Wang <xiyou.wangcong@gmail.com>
> Date: Mon, 4 Jun 2018 11:07:19 -0700
>
>> +Q: Are all networking bug fixes backported to all stable releases?
>> +
>> +A: Due to capacity, Dave could only take care of the backports for the last
>> + 3 stable releases.
>
> As Greg stated, I only do 2 not 3.
Sure, will send v2.
We just need a number here. :)
Thanks!
^ permalink raw reply
* Re: [PATCH v5 1/3] drivers core: refactor device_shutdown
From: Andy Shevchenko @ 2018-06-05 16:44 UTC (permalink / raw)
To: Pavel Tatashin
Cc: Steven Sistare, Daniel Jordan, Linux Kernel Mailing List,
Kirsher, Jeffrey T, intel-wired-lan, netdev, Greg Kroah-Hartman,
Alexander Duyck, tobin
In-Reply-To: <20180516024004.28977-2-pasha.tatashin@oracle.com>
On Wed, May 16, 2018 at 5:40 AM, Pavel Tatashin
<pasha.tatashin@oracle.com> wrote:
> device_shutdown() traverses through the list of devices, and calls
> dev->{bus/driver}->shutdown() for each entry in the list.
>
> Refactor the function by keeping device_shutdown() to do the logic of
> traversing the list of devices, and device_shutdown_one() to perform the
> actual shutdown operation on one device.
>
FWIW,
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
> Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
> ---
> drivers/base/core.c | 50 +++++++++++++++++++++++++++------------------
> 1 file changed, 30 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index b610816eb887..ed189f6d1a2f 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -2765,6 +2765,35 @@ int device_move(struct device *dev, struct device *new_parent,
> }
> EXPORT_SYMBOL_GPL(device_move);
>
> +/*
> + * device_shutdown_one - call ->shutdown() for the device passed as
> + * argument.
> + */
> +static void device_shutdown_one(struct device *dev)
> +{
> + /* Don't allow any more runtime suspends */
> + pm_runtime_get_noresume(dev);
> + pm_runtime_barrier(dev);
> +
> + if (dev->class && dev->class->shutdown_pre) {
> + if (initcall_debug)
> + dev_info(dev, "shutdown_pre\n");
> + dev->class->shutdown_pre(dev);
> + }
> + if (dev->bus && dev->bus->shutdown) {
> + if (initcall_debug)
> + dev_info(dev, "shutdown\n");
> + dev->bus->shutdown(dev);
> + } else if (dev->driver && dev->driver->shutdown) {
> + if (initcall_debug)
> + dev_info(dev, "shutdown\n");
> + dev->driver->shutdown(dev);
> + }
> +
> + /* decrement the reference counter */
> + put_device(dev);
> +}
> +
> /**
> * device_shutdown - call ->shutdown() on each device to shutdown.
> */
> @@ -2801,30 +2830,11 @@ void device_shutdown(void)
> device_lock(parent);
> device_lock(dev);
>
> - /* Don't allow any more runtime suspends */
> - pm_runtime_get_noresume(dev);
> - pm_runtime_barrier(dev);
> -
> - if (dev->class && dev->class->shutdown_pre) {
> - if (initcall_debug)
> - dev_info(dev, "shutdown_pre\n");
> - dev->class->shutdown_pre(dev);
> - }
> - if (dev->bus && dev->bus->shutdown) {
> - if (initcall_debug)
> - dev_info(dev, "shutdown\n");
> - dev->bus->shutdown(dev);
> - } else if (dev->driver && dev->driver->shutdown) {
> - if (initcall_debug)
> - dev_info(dev, "shutdown\n");
> - dev->driver->shutdown(dev);
> - }
> -
> + device_shutdown_one(dev);
> device_unlock(dev);
> if (parent)
> device_unlock(parent);
>
> - put_device(dev);
> put_device(parent);
>
> spin_lock(&devices_kset->list_lock);
> --
> 2.17.0
>
--
With Best Regards,
Andy Shevchenko
^ permalink raw reply
* Re: [PATCH net-next] rtnetlink: validate attributes in do_setlink()
From: David Miller @ 2018-06-05 16:45 UTC (permalink / raw)
To: eric.dumazet; +Cc: edumazet, netdev, dvyukov
In-Reply-To: <66acac36-a304-b743-8c6e-ef9cc87366d3@gmail.com>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 5 Jun 2018 09:42:54 -0700
> On 06/05/2018 09:41 AM, David Miller wrote:
>> From: Eric Dumazet <edumazet@google.com>
>> Date: Tue, 5 Jun 2018 09:25:19 -0700
>>
>>> It seems that rtnl_group_changelink() can call do_setlink
>>> while a prior call to validate_linkmsg(dev = NULL, ...) could
>>> not validate IFLA_ADDRESS / IFLA_BROADCAST
>>>
>>> Make sure do_setlink() calls validate_linkmsg() instead
>>> of letting its callers having this responsibility.
>>
>> But now rtnl_newlink() will validate_linkmsg() twice....
>>
>
> Yes, is it a problem ? That is hardly fast path :)
Not a problem, just making sure you were aware.
^ permalink raw reply
* Re: [PATCH net-next] rtnetlink: validate attributes in do_setlink()
From: David Miller @ 2018-06-05 16:46 UTC (permalink / raw)
To: edumazet; +Cc: netdev, eric.dumazet, dvyukov
In-Reply-To: <20180605162519.230428-1-edumazet@google.com>
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Jun 2018 09:25:19 -0700
> It seems that rtnl_group_changelink() can call do_setlink
> while a prior call to validate_linkmsg(dev = NULL, ...) could
> not validate IFLA_ADDRESS / IFLA_BROADCAST
>
> Make sure do_setlink() calls validate_linkmsg() instead
> of letting its callers having this responsibility.
>
> With help from Dmitry Vyukov, thanks a lot !
...
> Fixes: e7ed828f10bd ("netlink: support setting devgroup parameters")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-by: syzbot <syzkaller@googlegroups.com>
Applied and queued up for -stable.
^ permalink raw reply
* Re: [PATCH] r8169: Reinstate ALDPS and ASPM support
From: Florian Fainelli @ 2018-06-05 16:47 UTC (permalink / raw)
To: David Miller, andrew
Cc: kai.heng.feng, hayeswang, hkallweit1, romieu, netdev,
linux-kernel, ryankao, jiri
In-Reply-To: <20180605.101532.899235999013307302.davem@davemloft.net>
On 06/05/2018 07:15 AM, David Miller wrote:
> From: Andrew Lunn <andrew@lunn.ch>
> Date: Tue, 5 Jun 2018 16:11:14 +0200
>
>> No module parameter please. Just turn it on by default. Assuming
>> testing shows works.
>
> Agreed.
devlink would be a good candidate to add such configuration attributes,
since you would be operating on the PCI function itself, thus allowing
this to be on a per-device instance basis as opposed to global, which is
what a module parameter is.
--
Florian
^ permalink raw reply
* [Patch net v2] netdev-FAQ: clarify DaveM's position for stable backports
From: Cong Wang @ 2018-06-05 16:48 UTC (permalink / raw)
To: netdev; +Cc: Cong Wang, stable, Greg Kroah-Hartman
Per discussion with David at netconf 2018, let's clarify
DaveM's position of handling stable backports in netdev-FAQ.
This is important for people relying on upstream -stable
releases.
Cc: stable@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
Documentation/networking/netdev-FAQ.txt | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/Documentation/networking/netdev-FAQ.txt b/Documentation/networking/netdev-FAQ.txt
index 2a3278d5cf35..fa951b820b25 100644
--- a/Documentation/networking/netdev-FAQ.txt
+++ b/Documentation/networking/netdev-FAQ.txt
@@ -179,6 +179,15 @@ A: No. See above answer. In short, if you think it really belongs in
dash marker line as described in Documentation/process/submitting-patches.rst to
temporarily embed that information into the patch that you send.
+Q: Are all networking bug fixes backported to all stable releases?
+
+A: Due to capacity, Dave could only take care of the backports for the last
+ 2 stable releases. For earlier stable releases, each stable branch maintainer
+ is supposed to take care of them. If you find any patch is missing from an
+ earlier stable branch, please notify stable@vger.kernel.org with either a
+ commit ID or a formal patch backported, and CC Dave and other relevant
+ networking developers.
+
Q: Someone said that the comment style and coding convention is different
for the networking content. Is this true?
--
2.13.0
^ permalink raw reply related
* Re: [PATCH iproute2 v2 1/2] ip: display netns name instead of nsid
From: Stephen Hemminger @ 2018-06-05 16:52 UTC (permalink / raw)
To: Nicolas Dichtel; +Cc: netdev
In-Reply-To: <20180605130831.8175-2-nicolas.dichtel@6wind.com>
On Tue, 5 Jun 2018 15:08:30 +0200
Nicolas Dichtel <nicolas.dichtel@6wind.com> wrote:
>
> +char *get_name_from_nsid(int nsid)
> +{
> + struct nsid_cache *c;
> +
> + netns_nsid_socket_init();
> + netns_map_init();
> +
> + c = netns_map_get_by_nsid(nsid);
> + if (c)
> + return c->name;
> +
> + return NULL;
> +}
> +
This is better, but now there is a different problem.
When doing multiple interfaces, won't the initialization code be called twice?
^ permalink raw reply
* Re: [PATCH v5 1/3] drivers core: refactor device_shutdown
From: Pavel Tatashin @ 2018-06-05 17:14 UTC (permalink / raw)
To: andy.shevchenko
Cc: Steven Sistare, Daniel Jordan, LKML, jeffrey.t.kirsher,
intel-wired-lan, netdev, gregkh, Alexander Duyck, tobin
In-Reply-To: <CAHp75Vduh4FG=K56-8bhoDPic_BXcV3UfV8Mxp_UZOeqeJ6F8w@mail.gmail.com>
> FWIW,
> Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
>
Thank you Andy.
Pavel
^ permalink raw reply
* Re: [PATCH net] failover: eliminate callback hell
From: Samudrala, Sridhar @ 2018-06-05 17:22 UTC (permalink / raw)
To: Stephen Hemminger, kys, haiyangz, davem, mst, Alexander H,
Jiri Pirko
Cc: netdev, Stephen Hemminger, Brandeburg, Jesse
In-Reply-To: <20180605034231.31610-1-sthemmin@microsoft.com>
On 6/4/2018 8:42 PM, Stephen Hemminger wrote:
> The net failover should be a simple library, not a virtual
> object with function callbacks (see callback hell).
> The code is simpler and smaller for both the netvsc and virtio use cases.
I quickly tried this patch and it breaks virtio-net in standby mode.
I don't see failover netdev, unloading virtio-net causes a crash.
With these changes, there is very minimal code that is shared between
netvsc and virtio-net. The notifier and event handling code and the
lookup_bymac routines are now duplicated in both the drivers. I thought
we wanted to keep this code common between the 2 drivers and we went through
multiple revisions to make sure that it works with both netvsc's 2 netdev
and virtio-net's 3 netdev models.
The reason for the indirect ops is to support these 2 different models and
I am not sure if the overhead of the callbacks is that significant considering
that they are not called in the hot path.
>
> The code is restructured in many ways. I should have given these
> as review comments to net_failover during review
> but did not want to overwhelm the original submitter.
> Therefore it was merged prematurely.
>
> Some of the many items changed are:
>
> * The support routines should just be selected as needed in
> kernel config, no need for them to be visible config items.
>
> * Both netvsc and net_failover should keep their list of their
> own devices. Not a common list.
>
> * The matching of secondary device to primary device policy
> is up to the network device. Both net_failover and netvsc
> will use MAC for now but can change separately.
>
> * The match policy is only used during initial discovery; after
> that the secondary device knows what the upper device is because
> of the parent/child relationship; no searching is required.
>
> * Now, netvsc and net_failover use the same delayed work type
> mechanism for setup. Previously, net_failover code was triggering off
> name change but a similar policy was rejected for netvsc.
> "what is good for the goose is good for the gander"
>
> * The net_failover private device info 'struct net_failover_info'
> should have been private to the driver file, not a visible
> API.
>
> * The net_failover device should use SET_NETDEV_DEV
> that is intended only for physical devices not virtual devices.
>
> * No point in having DocBook style comments on a driver file.
> They only make sense on an external exposed API.
>
> * net_failover only supports Ethernet, so use ether_addr_copy.
>
> * Set permanent and current address of net_failover device
> to match the primary.
>
> * Carrier should be marked off before registering
> the net_failover device.
>
> * Use netdev_XXX for log messages, in net_failover (not dev_xxx)
>
> * Since failover infrastructure is about linking devices just
> use RTNL no need for other locking in init and teardown.
>
> * Don't bother with ERR_PTR() style return if only possible
> return is success or no memory.
>
> * As much as possible, the terms master and slave should be avoided
> because of their cultural connotations.
>
> Note: this code has been tested on Hyper-V
> but is compile tested only on virtio.
>
> Fixes: 30c8bd5aa8b2 ("net: Introduce generic failover module")
> Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
> ---
>
> Although this patch needs to go into 4.18 (linux-net),
> this version is based against net-next because net-next
> hasn't been merged into linux-net yet.
>
>
> drivers/net/hyperv/hyperv_net.h | 3 +-
> drivers/net/hyperv/netvsc_drv.c | 173 +++++++++++------
> drivers/net/net_failover.c | 312 ++++++++++++++++++++-----------
> drivers/net/virtio_net.c | 9 +-
> include/net/failover.h | 31 +---
> include/net/net_failover.h | 32 +---
> net/Kconfig | 13 +-
> net/core/failover.c | 316 ++++----------------------------
> 8 files changed, 373 insertions(+), 516 deletions(-)
>
> diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
> index 99d8e7398a5b..c7d25d10765e 100644
> --- a/drivers/net/hyperv/hyperv_net.h
> +++ b/drivers/net/hyperv/hyperv_net.h
> @@ -902,6 +902,8 @@ struct net_device_context {
> struct hv_device *device_ctx;
> /* netvsc_device */
> struct netvsc_device __rcu *nvdev;
> + /* list of netvsc net_devices */
> + struct list_head list;
> /* reconfigure work */
> struct delayed_work dwork;
> /* last reconfig time */
> @@ -933,7 +935,6 @@ struct net_device_context {
> /* Serial number of the VF to team with */
> u32 vf_serial;
>
> - struct failover *failover;
> };
>
> /* Per channel data */
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index bef4d55a108c..074e6b8578df 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -70,6 +70,8 @@ static int debug = -1;
> module_param(debug, int, 0444);
> MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
>
> +static LIST_HEAD(netvsc_dev_list);
> +
> static void netvsc_change_rx_flags(struct net_device *net, int change)
> {
> struct net_device_context *ndev_ctx = netdev_priv(net);
> @@ -1846,101 +1848,120 @@ static void netvsc_vf_setup(struct work_struct *w)
> }
>
> vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
> - if (vf_netdev)
> + if (vf_netdev) {
> __netvsc_vf_setup(ndev, vf_netdev);
> -
> + dev_put(vf_netdev);
> + }
> rtnl_unlock();
> }
>
> -static int netvsc_pre_register_vf(struct net_device *vf_netdev,
> - struct net_device *ndev)
> +static struct net_device *get_netvsc_bymac(const u8 *mac)
> {
> - struct net_device_context *net_device_ctx;
> - struct netvsc_device *netvsc_dev;
> + struct net_device_context *ndev_ctx;
>
> - net_device_ctx = netdev_priv(ndev);
> - netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
> - if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
> - return -ENODEV;
> + ASSERT_RTNL();
>
> - return 0;
> + list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
> + struct net_device *dev = hv_get_drvdata(ndev_ctx->device_ctx);
> +
> + if (ether_addr_equal(mac, dev->perm_addr))
> + return dev;
> + }
> +
> + return NULL;
> }
>
> -static int netvsc_register_vf(struct net_device *vf_netdev,
> - struct net_device *ndev)
> +static int netvsc_register_vf(struct net_device *vf_netdev)
> {
> - struct net_device_context *ndev_ctx = netdev_priv(ndev);
> + struct net_device *ndev;
> + struct net_device_context *ndev_ctx;
> +
> + /* Must use Ethernet addresses */
> + if (vf_netdev->addr_len != ETH_ALEN)
> + return NOTIFY_DONE;
> +
> + /* VF must be a physical device not VLAN, etc */
> + if (!vf_netdev->dev.parent)
> + return NOTIFY_DONE;
> +
> + /* Use the MAC address to locate the synthetic interface to
> + * associate with the VF interface.
> + */
> + ndev = get_netvsc_bymac(vf_netdev->perm_addr);
> + if (!ndev)
> + return NOTIFY_DONE;
> +
> + /* If network device is being removed, don't do anything */
> + ndev_ctx = netdev_priv(ndev);
> + if (!rtnl_dereference(ndev_ctx->nvdev))
> + return NOTIFY_DONE;
> +
> + if (netdev_failover_join(vf_netdev, ndev, netvsc_vf_handle_frame)) {
> + netdev_err(vf_netdev, "could not join: %s", ndev->name);
> + return NOTIFY_DONE;
> + }
>
> /* set slave flag before open to prevent IPv6 addrconf */
> vf_netdev->flags |= IFF_SLAVE;
>
> + dev_hold(vf_netdev);
> +
> schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
>
> call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
>
> netdev_info(vf_netdev, "joined to %s\n", ndev->name);
>
> - dev_hold(vf_netdev);
> rcu_assign_pointer(ndev_ctx->vf_netdev, vf_netdev);
>
> - return 0;
> + return NOTIFY_OK;
> }
>
> /* VF up/down change detected, schedule to change data path */
> -static int netvsc_vf_changed(struct net_device *vf_netdev,
> - struct net_device *ndev)
> +static int netvsc_vf_changed(struct net_device *vf_netdev)
> {
> struct net_device_context *net_device_ctx;
> struct netvsc_device *netvsc_dev;
> + struct net_device *ndev;
> bool vf_is_up = netif_running(vf_netdev);
>
> + ndev = netdev_failover_upper_get(vf_netdev);
> + if (!ndev)
> + return NOTIFY_DONE;
> +
> net_device_ctx = netdev_priv(ndev);
> netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
> if (!netvsc_dev)
> - return -ENODEV;
> + return NOTIFY_DONE;
>
> netvsc_switch_datapath(ndev, vf_is_up);
> netdev_info(ndev, "Data path switched %s VF: %s\n",
> vf_is_up ? "to" : "from", vf_netdev->name);
>
> - return 0;
> + return NOTIFY_OK;
> }
>
> -static int netvsc_pre_unregister_vf(struct net_device *vf_netdev,
> - struct net_device *ndev)
> +static int netvsc_unregister_vf(struct net_device *vf_netdev)
> {
> struct net_device_context *net_device_ctx;
> + struct net_device *ndev;
>
> - net_device_ctx = netdev_priv(ndev);
> - cancel_delayed_work_sync(&net_device_ctx->vf_takeover);
> -
> - return 0;
> -}
> -
> -static int netvsc_unregister_vf(struct net_device *vf_netdev,
> - struct net_device *ndev)
> -{
> - struct net_device_context *net_device_ctx;
> + ndev = netdev_failover_upper_get(vf_netdev);
> + if (!ndev)
> + return NOTIFY_DONE;
>
> net_device_ctx = netdev_priv(ndev);
> + if (cancel_delayed_work_sync(&net_device_ctx->vf_takeover))
> + dev_put(vf_netdev);
>
> netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
>
> + netdev_failover_unjoin(vf_netdev, ndev);
> RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
> - dev_put(vf_netdev);
>
> - return 0;
> + return NOTIFY_OK;
> }
>
> -static struct failover_ops netvsc_failover_ops = {
> - .slave_pre_register = netvsc_pre_register_vf,
> - .slave_register = netvsc_register_vf,
> - .slave_pre_unregister = netvsc_pre_unregister_vf,
> - .slave_unregister = netvsc_unregister_vf,
> - .slave_link_change = netvsc_vf_changed,
> - .slave_handle_frame = netvsc_vf_handle_frame,
> -};
> -
> static int netvsc_probe(struct hv_device *dev,
> const struct hv_vmbus_device_id *dev_id)
> {
> @@ -2009,6 +2030,8 @@ static int netvsc_probe(struct hv_device *dev,
>
> memcpy(net->dev_addr, device_info.mac_adr, ETH_ALEN);
>
> + net->priv_flags |= IFF_FAILOVER;
> +
> /* hw_features computed in rndis_netdev_set_hwcaps() */
> net->features = net->hw_features |
> NETIF_F_HIGHDMA | NETIF_F_SG |
> @@ -2024,23 +2047,19 @@ static int netvsc_probe(struct hv_device *dev,
> else
> net->max_mtu = ETH_DATA_LEN;
>
> - ret = register_netdev(net);
> + rtnl_lock();
> + ret = register_netdevice(net);
> if (ret != 0) {
> pr_err("Unable to register netdev.\n");
> goto register_failed;
> }
>
> - net_device_ctx->failover = failover_register(net, &netvsc_failover_ops);
> - if (IS_ERR(net_device_ctx->failover)) {
> - ret = PTR_ERR(net_device_ctx->failover);
> - goto err_failover;
> - }
> -
> - return ret;
> + list_add(&net_device_ctx->list, &netvsc_dev_list);
> + rtnl_unlock();
> + return 0;
>
> -err_failover:
> - unregister_netdev(net);
> register_failed:
> + rtnl_unlock();
> rndis_filter_device_remove(dev, nvdev);
> rndis_failed:
> free_percpu(net_device_ctx->vf_stats);
> @@ -2079,15 +2098,17 @@ static int netvsc_remove(struct hv_device *dev)
> */
> rtnl_lock();
> vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
> - if (vf_netdev)
> - failover_slave_unregister(vf_netdev);
> + if (vf_netdev) {
> + netdev_failover_unjoin(vf_netdev, net);
> + dev_put(vf_netdev);
> + }
>
> if (nvdev)
> rndis_filter_device_remove(dev, nvdev);
>
> unregister_netdevice(net);
>
> - failover_unregister(ndev_ctx->failover);
> + list_del(&ndev_ctx->list);
>
> rtnl_unlock();
> rcu_read_unlock();
> @@ -2115,8 +2136,47 @@ static struct hv_driver netvsc_drv = {
> .remove = netvsc_remove,
> };
>
> +/* On Hyper-V, every VF interface is matched with a corresponding
> + * synthetic interface. The synthetic interface is presented first
> + * to the guest. When the corresponding VF instance is registered,
> + * we will take care of switching the data path.
> + */
> +static int netvsc_netdev_event(struct notifier_block *this,
> + unsigned long event, void *ptr)
> +{
> + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> +
> + /* Skip parent events */
> + if (netif_is_failover(event_dev))
> + return NOTIFY_DONE;
> +
> + /* Avoid non-Ethernet type devices */
> + if (event_dev->type != ARPHRD_ETHER)
> + return NOTIFY_DONE;
> +
> + switch (event) {
> + case NETDEV_REGISTER:
> + return netvsc_register_vf(event_dev);
> +
> + case NETDEV_UNREGISTER:
> + return netvsc_unregister_vf(event_dev);
> +
> + case NETDEV_UP:
> + case NETDEV_DOWN:
> + return netvsc_vf_changed(event_dev);
> +
> + default:
> + return NOTIFY_DONE;
> + }
> +}
> +
> +static struct notifier_block netvsc_netdev_notifier = {
> + .notifier_call = netvsc_netdev_event,
> +};
> +
> static void __exit netvsc_drv_exit(void)
> {
> + unregister_netdevice_notifier(&netvsc_netdev_notifier);
> vmbus_driver_unregister(&netvsc_drv);
> }
>
> @@ -2136,6 +2196,7 @@ static int __init netvsc_drv_init(void)
> if (ret)
> return ret;
>
> + register_netdevice_notifier(&netvsc_netdev_notifier);
> return 0;
> }
>
> diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
> index 83f7420ddea5..e0d30527f748 100644
> --- a/drivers/net/net_failover.c
> +++ b/drivers/net/net_failover.c
> @@ -28,6 +28,46 @@
> #include <uapi/linux/if_arp.h>
> #include <net/net_failover.h>
>
> +static LIST_HEAD(net_failover_list);
> +
> +/* failover state */
> +struct net_failover_info {
> + struct net_device *failover_dev;
> +
> + /* list of failover virtual devices */
> + struct list_head list;
> +
> + /* primary netdev with same MAC */
> + struct net_device __rcu *primary_dev;
> +
> + /* standby netdev */
> + struct net_device __rcu *standby_dev;
> +
> + /* primary netdev stats */
> + struct rtnl_link_stats64 primary_stats;
> +
> + /* standby netdev stats */
> + struct rtnl_link_stats64 standby_stats;
> +
> + /* aggregated stats */
> + struct rtnl_link_stats64 failover_stats;
> +
> + /* spinlock while updating stats */
> + spinlock_t stats_lock;
> +
> + /* delayed setup of slave */
> + struct delayed_work standby_init;
> +};
> +
> +#define FAILOVER_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \
> + NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
> + NETIF_F_HIGHDMA | NETIF_F_LRO)
> +
> +#define FAILOVER_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \
> + NETIF_F_RXCSUM | NETIF_F_ALL_TSO)
> +
> +#define FAILOVER_SETUP_INTERVAL (HZ / 10)
> +
> static bool net_failover_xmit_ready(struct net_device *dev)
> {
> return netif_running(dev) && netif_carrier_ok(dev);
> @@ -460,22 +500,42 @@ static void net_failover_lower_state_changed(struct net_device *slave_dev,
> netdev_lower_state_changed(slave_dev, &info);
> }
>
> -static int net_failover_slave_pre_register(struct net_device *slave_dev,
> - struct net_device *failover_dev)
> +static struct net_device *get_net_failover_bymac(const u8 *mac)
> {
> - struct net_device *standby_dev, *primary_dev;
> + struct net_failover_info *nfo_info;
> +
> + ASSERT_RTNL();
> +
> + list_for_each_entry(nfo_info, &net_failover_list, list) {
> + struct net_device *failover_dev = nfo_info->failover_dev;
> +
> + if (ether_addr_equal(mac, failover_dev->perm_addr))
> + return failover_dev;
> + }
> +
> + return NULL;
> +}
> +
> +static int net_failover_register_event(struct net_device *slave_dev)
> +{
> + struct net_device *failover_dev, *standby_dev, *primary_dev;
> struct net_failover_info *nfo_info;
> bool slave_is_standby;
>
> + failover_dev = get_net_failover_bymac(slave_dev->perm_addr);
> + if (!failover_dev)
> + return NOTIFY_DONE;
> +
> nfo_info = netdev_priv(failover_dev);
> standby_dev = rtnl_dereference(nfo_info->standby_dev);
> primary_dev = rtnl_dereference(nfo_info->primary_dev);
> slave_is_standby = slave_dev->dev.parent == failover_dev->dev.parent;
> if (slave_is_standby ? standby_dev : primary_dev) {
> - netdev_err(failover_dev, "%s attempting to register as slave dev when %s already present\n",
> + netdev_err(failover_dev,
> + "%s attempting to register as slave dev when %s already present\n",
> slave_dev->name,
> slave_is_standby ? "standby" : "primary");
> - return -EINVAL;
> + return NOTIFY_DONE;
> }
>
> /* We want to allow only a direct attached VF device as a primary
> @@ -484,23 +544,33 @@ static int net_failover_slave_pre_register(struct net_device *slave_dev,
> */
> if (!slave_is_standby && (!slave_dev->dev.parent ||
> !dev_is_pci(slave_dev->dev.parent)))
> - return -EINVAL;
> + return NOTIFY_DONE;
>
> if (failover_dev->features & NETIF_F_VLAN_CHALLENGED &&
> vlan_uses_dev(failover_dev)) {
> - netdev_err(failover_dev, "Device %s is VLAN challenged and failover device has VLAN set up\n",
> + netdev_err(failover_dev,
> + "Device %s is VLAN challenged and failover device has VLAN set up\n",
> failover_dev->name);
> - return -EINVAL;
> + return NOTIFY_DONE;
> }
>
> - return 0;
> + if (netdev_failover_join(slave_dev, failover_dev,
> + net_failover_handle_frame)) {
> + netdev_err(failover_dev, "could not join: %s", slave_dev->name);
> + return NOTIFY_DONE;
> + }
> +
> + /* Trigger rest of setup in process context */
> + schedule_delayed_work(&nfo_info->standby_init, FAILOVER_SETUP_INTERVAL);
> +
> + return NOTIFY_OK;
> }
>
> -static int net_failover_slave_register(struct net_device *slave_dev,
> - struct net_device *failover_dev)
> +static void __net_failover_setup(struct net_device *failover_dev)
> {
> + struct net_failover_info *nfo_info = netdev_priv(failover_dev);
> + struct net_device *slave_dev = rtnl_dereference(nfo_info->standby_dev);
> struct net_device *standby_dev, *primary_dev;
> - struct net_failover_info *nfo_info;
> bool slave_is_standby;
> u32 orig_mtu;
> int err;
> @@ -509,13 +579,12 @@ static int net_failover_slave_register(struct net_device *slave_dev,
> orig_mtu = slave_dev->mtu;
> err = dev_set_mtu(slave_dev, failover_dev->mtu);
> if (err) {
> - netdev_err(failover_dev, "unable to change mtu of %s to %u register failed\n",
> + netdev_err(failover_dev,
> + "unable to change mtu of %s to %u register failed\n",
> slave_dev->name, failover_dev->mtu);
> goto done;
> }
>
> - dev_hold(slave_dev);
> -
> if (netif_running(failover_dev)) {
> err = dev_open(slave_dev);
> if (err && (err != -EBUSY)) {
> @@ -537,7 +606,6 @@ static int net_failover_slave_register(struct net_device *slave_dev,
> goto err_vlan_add;
> }
>
> - nfo_info = netdev_priv(failover_dev);
> standby_dev = rtnl_dereference(nfo_info->standby_dev);
> primary_dev = rtnl_dereference(nfo_info->primary_dev);
> slave_is_standby = slave_dev->dev.parent == failover_dev->dev.parent;
> @@ -562,52 +630,56 @@ static int net_failover_slave_register(struct net_device *slave_dev,
> netdev_info(failover_dev, "failover %s slave:%s registered\n",
> slave_is_standby ? "standby" : "primary", slave_dev->name);
>
> - return 0;
> + return;
>
> err_vlan_add:
> dev_uc_unsync(slave_dev, failover_dev);
> dev_mc_unsync(slave_dev, failover_dev);
> dev_close(slave_dev);
> err_dev_open:
> - dev_put(slave_dev);
> dev_set_mtu(slave_dev, orig_mtu);
> done:
> - return err;
> + return;
> }
>
> -static int net_failover_slave_pre_unregister(struct net_device *slave_dev,
> - struct net_device *failover_dev)
> +static void net_failover_setup(struct work_struct *w)
> {
> - struct net_device *standby_dev, *primary_dev;
> - struct net_failover_info *nfo_info;
> + struct net_failover_info *nfo_info
> + = container_of(w, struct net_failover_info, standby_init.work);
> + struct net_device *failover_dev = nfo_info->failover_dev;
>
> - nfo_info = netdev_priv(failover_dev);
> - primary_dev = rtnl_dereference(nfo_info->primary_dev);
> - standby_dev = rtnl_dereference(nfo_info->standby_dev);
> -
> - if (slave_dev != primary_dev && slave_dev != standby_dev)
> - return -ENODEV;
> + /* handle race with cancel delayed work on removal */
> + if (!rtnl_trylock()) {
> + schedule_delayed_work(&nfo_info->standby_init, 0);
> + return;
> + }
>
> - return 0;
> + __net_failover_setup(failover_dev);
> + rtnl_unlock();
> }
>
> -static int net_failover_slave_unregister(struct net_device *slave_dev,
> - struct net_device *failover_dev)
> +static int net_failover_unregister_event(struct net_device *slave_dev)
> {
> - struct net_device *standby_dev, *primary_dev;
> + struct net_device *failover_dev, *primary_dev, *standby_dev;
> struct net_failover_info *nfo_info;
> bool slave_is_standby;
>
> + failover_dev = netdev_failover_upper_get(slave_dev);
> + if (!failover_dev)
> + return NOTIFY_DONE;
> +
> nfo_info = netdev_priv(failover_dev);
> primary_dev = rtnl_dereference(nfo_info->primary_dev);
> standby_dev = rtnl_dereference(nfo_info->standby_dev);
>
> + if (slave_dev != primary_dev && slave_dev != standby_dev)
> + return NOTIFY_DONE;
> +
> vlan_vids_del_by_dev(slave_dev, failover_dev);
> dev_uc_unsync(slave_dev, failover_dev);
> dev_mc_unsync(slave_dev, failover_dev);
> dev_close(slave_dev);
>
> - nfo_info = netdev_priv(failover_dev);
> dev_get_stats(failover_dev, &nfo_info->failover_stats);
>
> slave_is_standby = slave_dev->dev.parent == failover_dev->dev.parent;
> @@ -628,22 +700,25 @@ static int net_failover_slave_unregister(struct net_device *slave_dev,
> netdev_info(failover_dev, "failover %s slave:%s unregistered\n",
> slave_is_standby ? "standby" : "primary", slave_dev->name);
>
> - return 0;
> + return NOTIFY_OK;
> }
>
> -static int net_failover_slave_link_change(struct net_device *slave_dev,
> - struct net_device *failover_dev)
> +static int net_failover_link_event(struct net_device *slave_dev)
> +
> {
> - struct net_device *primary_dev, *standby_dev;
> + struct net_device *failover_dev, *primary_dev, *standby_dev;
> struct net_failover_info *nfo_info;
>
> - nfo_info = netdev_priv(failover_dev);
> + failover_dev = netdev_failover_upper_get(slave_dev);
> + if (!failover_dev)
> + return NOTIFY_DONE;
>
> + nfo_info = netdev_priv(failover_dev);
> primary_dev = rtnl_dereference(nfo_info->primary_dev);
> standby_dev = rtnl_dereference(nfo_info->standby_dev);
>
> if (slave_dev != primary_dev && slave_dev != standby_dev)
> - return -ENODEV;
> + return NOTIFY_DONE;
>
> if ((primary_dev && net_failover_xmit_ready(primary_dev)) ||
> (standby_dev && net_failover_xmit_ready(standby_dev))) {
> @@ -657,43 +732,11 @@ static int net_failover_slave_link_change(struct net_device *slave_dev,
>
> net_failover_lower_state_changed(slave_dev, primary_dev, standby_dev);
>
> - return 0;
> + return NOTIFY_DONE;
> }
>
> -static int net_failover_slave_name_change(struct net_device *slave_dev,
> - struct net_device *failover_dev)
> -{
> - struct net_device *primary_dev, *standby_dev;
> - struct net_failover_info *nfo_info;
> -
> - nfo_info = netdev_priv(failover_dev);
> -
> - primary_dev = rtnl_dereference(nfo_info->primary_dev);
> - standby_dev = rtnl_dereference(nfo_info->standby_dev);
> -
> - if (slave_dev != primary_dev && slave_dev != standby_dev)
> - return -ENODEV;
> -
> - /* We need to bring up the slave after the rename by udev in case
> - * open failed with EBUSY when it was registered.
> - */
> - dev_open(slave_dev);
> -
> - return 0;
> -}
> -
> -static struct failover_ops net_failover_ops = {
> - .slave_pre_register = net_failover_slave_pre_register,
> - .slave_register = net_failover_slave_register,
> - .slave_pre_unregister = net_failover_slave_pre_unregister,
> - .slave_unregister = net_failover_slave_unregister,
> - .slave_link_change = net_failover_slave_link_change,
> - .slave_name_change = net_failover_slave_name_change,
> - .slave_handle_frame = net_failover_handle_frame,
> -};
> -
> /**
> - * net_failover_create - Create and register a failover instance
> + * net_failover_create - Create and register a failover device
> *
> * @dev: standby netdev
> *
> @@ -703,13 +746,12 @@ static struct failover_ops net_failover_ops = {
> * the original standby netdev and a VF netdev with the same MAC gets
> * registered as primary netdev.
> *
> - * Return: pointer to failover instance
> + * Return: pointer to failover network device
> */
> -struct failover *net_failover_create(struct net_device *standby_dev)
> +struct net_device *net_failover_create(struct net_device *standby_dev)
> {
> - struct device *dev = standby_dev->dev.parent;
> + struct net_failover_info *nfo_info;
> struct net_device *failover_dev;
> - struct failover *failover;
> int err;
>
> /* Alloc at least 2 queues, for now we are going with 16 assuming
> @@ -717,18 +759,22 @@ struct failover *net_failover_create(struct net_device *standby_dev)
> */
> failover_dev = alloc_etherdev_mq(sizeof(struct net_failover_info), 16);
> if (!failover_dev) {
> - dev_err(dev, "Unable to allocate failover_netdev!\n");
> - return ERR_PTR(-ENOMEM);
> + netdev_err(standby_dev, "Unable to allocate failover_netdev!\n");
> + return NULL;
> }
>
> + nfo_info = netdev_priv(failover_dev);
> dev_net_set(failover_dev, dev_net(standby_dev));
> - SET_NETDEV_DEV(failover_dev, dev);
> + nfo_info->failover_dev = failover_dev;
> + INIT_DELAYED_WORK(&nfo_info->standby_init, net_failover_setup);
>
> failover_dev->netdev_ops = &failover_dev_ops;
> failover_dev->ethtool_ops = &failover_ethtool_ops;
>
> /* Initialize the device options */
> - failover_dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
> + failover_dev->priv_flags |= IFF_UNICAST_FLT |
> + IFF_NO_QUEUE |
> + IFF_FAILOVER;
> failover_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
> IFF_TX_SKB_SHARING);
>
> @@ -746,29 +792,38 @@ struct failover *net_failover_create(struct net_device *standby_dev)
> failover_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
> failover_dev->features |= failover_dev->hw_features;
>
> - memcpy(failover_dev->dev_addr, standby_dev->dev_addr,
> - failover_dev->addr_len);
> + ether_addr_copy(failover_dev->dev_addr, standby_dev->dev_addr);
> + ether_addr_copy(failover_dev->perm_addr, standby_dev->perm_addr);
>
> failover_dev->min_mtu = standby_dev->min_mtu;
> failover_dev->max_mtu = standby_dev->max_mtu;
>
> - err = register_netdev(failover_dev);
> + netif_carrier_off(failover_dev);
> +
> + rtnl_lock();
> + err = register_netdevice(failover_dev);
> if (err) {
> - dev_err(dev, "Unable to register failover_dev!\n");
> + netdev_err(standby_dev, "Unable to register failover_dev!\n");
> goto err_register_netdev;
> }
>
> - netif_carrier_off(failover_dev);
> + err = netdev_failover_join(standby_dev, failover_dev,
> + net_failover_handle_frame);
> + if (err) {
> + netdev_err(failover_dev, "Unable to join with %s\n",
> + standby_dev->name);
> + goto err_failover_join;
> + }
>
> - failover = failover_register(failover_dev, &net_failover_ops);
> - if (IS_ERR(failover))
> - goto err_failover_register;
> + list_add(&nfo_info->list, &net_failover_list);
> + rtnl_unlock();
>
> - return failover;
> + return failover_dev;
>
> -err_failover_register:
> - unregister_netdev(failover_dev);
> +err_failover_join:
> + unregister_netdevice(failover_dev);
> err_register_netdev:
> + rtnl_unlock();
> free_netdev(failover_dev);
>
> return ERR_PTR(err);
> @@ -786,31 +841,27 @@ EXPORT_SYMBOL_GPL(net_failover_create);
> * netdev. Used by paravirtual drivers that use 3-netdev model.
> *
> */
> -void net_failover_destroy(struct failover *failover)
> +void net_failover_destroy(struct net_device *failover_dev)
> {
> - struct net_failover_info *nfo_info;
> - struct net_device *failover_dev;
> + struct net_failover_info *nfo_info = netdev_priv(failover_dev);
> struct net_device *slave_dev;
>
> - if (!failover)
> - return;
> -
> - failover_dev = rcu_dereference(failover->failover_dev);
> - nfo_info = netdev_priv(failover_dev);
> -
> netif_device_detach(failover_dev);
>
> rtnl_lock();
> -
> slave_dev = rtnl_dereference(nfo_info->primary_dev);
> - if (slave_dev)
> - failover_slave_unregister(slave_dev);
> + if (slave_dev) {
> + netdev_failover_unjoin(slave_dev, failover_dev);
> + dev_put(slave_dev);
> + }
>
> slave_dev = rtnl_dereference(nfo_info->standby_dev);
> - if (slave_dev)
> - failover_slave_unregister(slave_dev);
> + if (slave_dev) {
> + netdev_failover_unjoin(slave_dev, failover_dev);
> + dev_put(slave_dev);
> + }
>
> - failover_unregister(failover);
> + list_del(&nfo_info->list);
>
> unregister_netdevice(failover_dev);
>
> @@ -820,9 +871,53 @@ void net_failover_destroy(struct failover *failover)
> }
> EXPORT_SYMBOL_GPL(net_failover_destroy);
>
> +static int net_failover_event(struct notifier_block *this,
> + unsigned long event, void *ptr)
> +{
> + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> +
> + /* Skip parent events */
> + if (netif_is_failover(event_dev))
> + return NOTIFY_DONE;
> +
> + /* Avoid non-Ethernet type devices */
> + if (event_dev->type != ARPHRD_ETHER)
> + return NOTIFY_DONE;
> +
> + /* Avoid Vlan dev with same MAC registering as VF */
> + if (is_vlan_dev(event_dev))
> + return NOTIFY_DONE;
> +
> + /* Avoid Bonding master dev with same MAC registering as VF */
> + if ((event_dev->priv_flags & IFF_BONDING) &&
> + (event_dev->flags & IFF_MASTER))
> + return NOTIFY_DONE;
> +
> + switch (event) {
> + case NETDEV_REGISTER:
> + return net_failover_register_event(event_dev);
> +
> + case NETDEV_UNREGISTER:
> + return net_failover_unregister_event(event_dev);
> +
> + case NETDEV_UP:
> + case NETDEV_DOWN:
> + case NETDEV_CHANGE:
> + return net_failover_link_event(event_dev);
> +
> + default:
> + return NOTIFY_DONE;
> + }
> +}
> +
> +static struct notifier_block net_failover_notifier = {
> + .notifier_call = net_failover_event,
> +};
> +
> static __init int
> net_failover_init(void)
> {
> + register_netdevice_notifier(&net_failover_notifier);
> return 0;
> }
> module_init(net_failover_init);
> @@ -830,6 +925,7 @@ module_init(net_failover_init);
> static __exit
> void net_failover_exit(void)
> {
> + unregister_netdevice_notifier(&net_failover_notifier);
> }
> module_exit(net_failover_exit);
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 6d710b8b41c5..b40ae28dac93 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -215,7 +215,7 @@ struct virtnet_info {
> unsigned long guest_offloads;
>
> /* failover when STANDBY feature enabled */
> - struct failover *failover;
> + struct net_device *failover;
> };
>
> struct padded_vnet_hdr {
> @@ -2930,11 +2930,10 @@ static int virtnet_probe(struct virtio_device *vdev)
> virtnet_init_settings(dev);
>
> if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
> - vi->failover = net_failover_create(vi->dev);
> - if (IS_ERR(vi->failover)) {
> - err = PTR_ERR(vi->failover);
> + err = -ENOMEM;
> + vi->failover = net_failover_create(dev);
> + if (!vi->failover)
> goto free_vqs;
> - }
> }
>
> err = register_netdev(dev);
> diff --git a/include/net/failover.h b/include/net/failover.h
> index bb15438f39c7..22d6c1369101 100644
> --- a/include/net/failover.h
> +++ b/include/net/failover.h
> @@ -6,31 +6,10 @@
>
> #include <linux/netdevice.h>
>
> -struct failover_ops {
> - int (*slave_pre_register)(struct net_device *slave_dev,
> - struct net_device *failover_dev);
> - int (*slave_register)(struct net_device *slave_dev,
> - struct net_device *failover_dev);
> - int (*slave_pre_unregister)(struct net_device *slave_dev,
> - struct net_device *failover_dev);
> - int (*slave_unregister)(struct net_device *slave_dev,
> - struct net_device *failover_dev);
> - int (*slave_link_change)(struct net_device *slave_dev,
> - struct net_device *failover_dev);
> - int (*slave_name_change)(struct net_device *slave_dev,
> - struct net_device *failover_dev);
> - rx_handler_result_t (*slave_handle_frame)(struct sk_buff **pskb);
> -};
> -
> -struct failover {
> - struct list_head list;
> - struct net_device __rcu *failover_dev;
> - struct failover_ops __rcu *ops;
> -};
> -
> -struct failover *failover_register(struct net_device *dev,
> - struct failover_ops *ops);
> -void failover_unregister(struct failover *failover);
> -int failover_slave_unregister(struct net_device *slave_dev);
> +int netdev_failover_join(struct net_device *lower, struct net_device *upper,
> + rx_handler_func_t *rx_handler);
> +struct net_device *netdev_failover_upper_get(struct net_device *lower);
> +void netdev_failover_unjoin(struct net_device *lower,
> + struct net_device *upper);
>
> #endif /* _FAILOVER_H */
> diff --git a/include/net/net_failover.h b/include/net/net_failover.h
> index b12a1c469d1c..a99b3b00b4e3 100644
> --- a/include/net/net_failover.h
> +++ b/include/net/net_failover.h
> @@ -6,35 +6,7 @@
>
> #include <net/failover.h>
>
> -/* failover state */
> -struct net_failover_info {
> - /* primary netdev with same MAC */
> - struct net_device __rcu *primary_dev;
> -
> - /* standby netdev */
> - struct net_device __rcu *standby_dev;
> -
> - /* primary netdev stats */
> - struct rtnl_link_stats64 primary_stats;
> -
> - /* standby netdev stats */
> - struct rtnl_link_stats64 standby_stats;
> -
> - /* aggregated stats */
> - struct rtnl_link_stats64 failover_stats;
> -
> - /* spinlock while updating stats */
> - spinlock_t stats_lock;
> -};
> -
> -struct failover *net_failover_create(struct net_device *standby_dev);
> -void net_failover_destroy(struct failover *failover);
> -
> -#define FAILOVER_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \
> - NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
> - NETIF_F_HIGHDMA | NETIF_F_LRO)
> -
> -#define FAILOVER_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \
> - NETIF_F_RXCSUM | NETIF_F_ALL_TSO)
> +struct net_device *net_failover_create(struct net_device *standby_dev);
> +void net_failover_destroy(struct net_device *failover_dev);
>
> #endif /* _NET_FAILOVER_H */
> diff --git a/net/Kconfig b/net/Kconfig
> index f738a6f27665..697d84202695 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -433,17 +433,8 @@ config PAGE_POOL
> bool
>
> config FAILOVER
> - tristate "Generic failover module"
> - help
> - The failover module provides a generic interface for paravirtual
> - drivers to register a netdev and a set of ops with a failover
> - instance. The ops are used as event handlers that get called to
> - handle netdev register/unregister/link change/name change events
> - on slave pci ethernet devices with the same mac address as the
> - failover netdev. This enables paravirtual drivers to use a
> - VF as an accelerated low latency datapath. It also allows live
> - migration of VMs with direct attached VFs by failing over to the
> - paravirtual datapath when the VF is unplugged.
> + bool
> + default n
>
> endif # if NET
>
> diff --git a/net/core/failover.c b/net/core/failover.c
> index 4a92a98ccce9..499f0fd7e4d3 100644
> --- a/net/core/failover.c
> +++ b/net/core/failover.c
> @@ -1,10 +1,8 @@
> // SPDX-License-Identifier: GPL-2.0
> /* Copyright (c) 2018, Intel Corporation. */
>
> -/* A common module to handle registrations and notifications for paravirtual
> +/* A library for managing chained upper/lower devices such as
> * drivers to enable accelerated datapath and support VF live migration.
> - *
> - * The notifier and event handling code is based on netvsc driver.
> */
>
> #include <linux/module.h>
> @@ -14,302 +12,62 @@
> #include <linux/if_vlan.h>
> #include <net/failover.h>
>
> -static LIST_HEAD(failover_list);
> -static DEFINE_SPINLOCK(failover_lock);
> -
> -static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
> -{
> - struct net_device *failover_dev;
> - struct failover *failover;
> -
> - spin_lock(&failover_lock);
> - list_for_each_entry(failover, &failover_list, list) {
> - failover_dev = rtnl_dereference(failover->failover_dev);
> - if (ether_addr_equal(failover_dev->perm_addr, mac)) {
> - *ops = rtnl_dereference(failover->ops);
> - spin_unlock(&failover_lock);
> - return failover_dev;
> - }
> - }
> - spin_unlock(&failover_lock);
> - return NULL;
> -}
> -
> -/**
> - * failover_slave_register - Register a slave netdev
> - *
> - * @slave_dev: slave netdev that is being registered
> - *
> - * Registers a slave device to a failover instance. Only ethernet devices
> - * are supported.
> - */
> -static int failover_slave_register(struct net_device *slave_dev)
> +/* failover_join - Join a lower netdev with an upper device. */
> +int netdev_failover_join(struct net_device *lower_dev,
> + struct net_device *upper_dev,
> + rx_handler_func_t *rx_handler)
> {
> - struct netdev_lag_upper_info lag_upper_info;
> - struct net_device *failover_dev;
> - struct failover_ops *fops;
> int err;
>
> - if (slave_dev->type != ARPHRD_ETHER)
> - goto done;
> -
> ASSERT_RTNL();
>
> - failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
> - if (!failover_dev)
> - goto done;
> + /* Don't allow joining devices of different protocols */
> + if (upper_dev->type != lower_dev->type)
> + return -EINVAL;
>
> - if (fops && fops->slave_pre_register &&
> - fops->slave_pre_register(slave_dev, failover_dev))
> - goto done;
> -
> - err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame,
> - failover_dev);
> + err = netdev_rx_handler_register(lower_dev, rx_handler, upper_dev);
> if (err) {
> - netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
> + netdev_err(lower_dev,
> + "can not register failover rx handler (err = %d)\n",
> err);
> - goto done;
> + return err;
> }
>
> - lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
> - err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL,
> - &lag_upper_info, NULL);
> + err = netdev_master_upper_dev_link(lower_dev, upper_dev, NULL,
> + NULL, NULL);
> if (err) {
> - netdev_err(slave_dev, "can not set failover device %s (err = %d)\n",
> - failover_dev->name, err);
> - goto err_upper_link;
> + netdev_err(lower_dev,
> + "can not set failover device %s (err = %d)\n",
> + upper_dev->name, err);
> + netdev_rx_handler_unregister(lower_dev);
> + return err;
> }
>
> - slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
> -
> - if (fops && fops->slave_register &&
> - !fops->slave_register(slave_dev, failover_dev))
> - return NOTIFY_OK;
> -
> - netdev_upper_dev_unlink(slave_dev, failover_dev);
> - slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
> -err_upper_link:
> - netdev_rx_handler_unregister(slave_dev);
> -done:
> - return NOTIFY_DONE;
> -}
> -
> -/**
> - * failover_slave_unregister - Unregister a slave netdev
> - *
> - * @slave_dev: slave netdev that is being unregistered
> - *
> - * Unregisters a slave device from a failover instance.
> - */
> -int failover_slave_unregister(struct net_device *slave_dev)
> -{
> - struct net_device *failover_dev;
> - struct failover_ops *fops;
> -
> - if (!netif_is_failover_slave(slave_dev))
> - goto done;
> -
> - ASSERT_RTNL();
> -
> - failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
> - if (!failover_dev)
> - goto done;
> -
> - if (fops && fops->slave_pre_unregister &&
> - fops->slave_pre_unregister(slave_dev, failover_dev))
> - goto done;
> -
> - netdev_rx_handler_unregister(slave_dev);
> - netdev_upper_dev_unlink(slave_dev, failover_dev);
> - slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
> -
> - if (fops && fops->slave_unregister &&
> - !fops->slave_unregister(slave_dev, failover_dev))
> - return NOTIFY_OK;
> -
> -done:
> - return NOTIFY_DONE;
> + dev_hold(lower_dev);
> + lower_dev->priv_flags |= IFF_FAILOVER_SLAVE;
> + return 0;
> }
> -EXPORT_SYMBOL_GPL(failover_slave_unregister);
> +EXPORT_SYMBOL_GPL(netdev_failover_join);
>
> -static int failover_slave_link_change(struct net_device *slave_dev)
> +/* Find upper network device for failover slave device */
> +struct net_device *netdev_failover_upper_get(struct net_device *lower_dev)
> {
> - struct net_device *failover_dev;
> - struct failover_ops *fops;
> -
> - if (!netif_is_failover_slave(slave_dev))
> - goto done;
> -
> - ASSERT_RTNL();
> -
> - failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
> - if (!failover_dev)
> - goto done;
> -
> - if (!netif_running(failover_dev))
> - goto done;
> + if (!netif_is_failover_slave(lower_dev))
> + return NULL;
>
> - if (fops && fops->slave_link_change &&
> - !fops->slave_link_change(slave_dev, failover_dev))
> - return NOTIFY_OK;
> -
> -done:
> - return NOTIFY_DONE;
> + return netdev_master_upper_dev_get(lower_dev);
> }
> +EXPORT_SYMBOL_GPL(netdev_failover_upper_get);
>
> -static int failover_slave_name_change(struct net_device *slave_dev)
> +/* failover_unjoin - Break connection between lower and upper device. */
> +void netdev_failover_unjoin(struct net_device *lower_dev,
> + struct net_device *upper_dev)
> {
> - struct net_device *failover_dev;
> - struct failover_ops *fops;
> -
> - if (!netif_is_failover_slave(slave_dev))
> - goto done;
> -
> ASSERT_RTNL();
>
> - failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
> - if (!failover_dev)
> - goto done;
> -
> - if (!netif_running(failover_dev))
> - goto done;
> -
> - if (fops && fops->slave_name_change &&
> - !fops->slave_name_change(slave_dev, failover_dev))
> - return NOTIFY_OK;
> -
> -done:
> - return NOTIFY_DONE;
> -}
> -
> -static int
> -failover_event(struct notifier_block *this, unsigned long event, void *ptr)
> -{
> - struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> -
> - /* Skip parent events */
> - if (netif_is_failover(event_dev))
> - return NOTIFY_DONE;
> -
> - switch (event) {
> - case NETDEV_REGISTER:
> - return failover_slave_register(event_dev);
> - case NETDEV_UNREGISTER:
> - return failover_slave_unregister(event_dev);
> - case NETDEV_UP:
> - case NETDEV_DOWN:
> - case NETDEV_CHANGE:
> - return failover_slave_link_change(event_dev);
> - case NETDEV_CHANGENAME:
> - return failover_slave_name_change(event_dev);
> - default:
> - return NOTIFY_DONE;
> - }
> -}
> -
> -static struct notifier_block failover_notifier = {
> - .notifier_call = failover_event,
> -};
> -
> -static void
> -failover_existing_slave_register(struct net_device *failover_dev)
> -{
> - struct net *net = dev_net(failover_dev);
> - struct net_device *dev;
> -
> - rtnl_lock();
> - for_each_netdev(net, dev) {
> - if (netif_is_failover(dev))
> - continue;
> - if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
> - failover_slave_register(dev);
> - }
> - rtnl_unlock();
> -}
> -
> -/**
> - * failover_register - Register a failover instance
> - *
> - * @dev: failover netdev
> - * @ops: failover ops
> - *
> - * Allocate and register a failover instance for a failover netdev. ops
> - * provides handlers for slave device register/unregister/link change/
> - * name change events.
> - *
> - * Return: pointer to failover instance
> - */
> -struct failover *failover_register(struct net_device *dev,
> - struct failover_ops *ops)
> -{
> - struct failover *failover;
> -
> - if (dev->type != ARPHRD_ETHER)
> - return ERR_PTR(-EINVAL);
> -
> - failover = kzalloc(sizeof(*failover), GFP_KERNEL);
> - if (!failover)
> - return ERR_PTR(-ENOMEM);
> -
> - rcu_assign_pointer(failover->ops, ops);
> - dev_hold(dev);
> - dev->priv_flags |= IFF_FAILOVER;
> - rcu_assign_pointer(failover->failover_dev, dev);
> -
> - spin_lock(&failover_lock);
> - list_add_tail(&failover->list, &failover_list);
> - spin_unlock(&failover_lock);
> -
> - netdev_info(dev, "failover master:%s registered\n", dev->name);
> -
> - failover_existing_slave_register(dev);
> -
> - return failover;
> -}
> -EXPORT_SYMBOL_GPL(failover_register);
> -
> -/**
> - * failover_unregister - Unregister a failover instance
> - *
> - * @failover: pointer to failover instance
> - *
> - * Unregisters and frees a failover instance.
> - */
> -void failover_unregister(struct failover *failover)
> -{
> - struct net_device *failover_dev;
> -
> - failover_dev = rcu_dereference(failover->failover_dev);
> -
> - netdev_info(failover_dev, "failover master:%s unregistered\n",
> - failover_dev->name);
> -
> - failover_dev->priv_flags &= ~IFF_FAILOVER;
> - dev_put(failover_dev);
> -
> - spin_lock(&failover_lock);
> - list_del(&failover->list);
> - spin_unlock(&failover_lock);
> -
> - kfree(failover);
> + netdev_rx_handler_unregister(lower_dev);
> + netdev_upper_dev_unlink(lower_dev, upper_dev);
> + dev_put(lower_dev);
> + lower_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
> }
> -EXPORT_SYMBOL_GPL(failover_unregister);
> -
> -static __init int
> -failover_init(void)
> -{
> - register_netdevice_notifier(&failover_notifier);
> -
> - return 0;
> -}
> -module_init(failover_init);
> -
> -static __exit
> -void failover_exit(void)
> -{
> - unregister_netdevice_notifier(&failover_notifier);
> -}
> -module_exit(failover_exit);
> -
> -MODULE_DESCRIPTION("Generic failover infrastructure/interface");
> -MODULE_LICENSE("GPL v2");
> +EXPORT_SYMBOL_GPL(netdev_failover_unjoin);
^ permalink raw reply
* Re: [PATCH] r8169: Reinstate ALDPS and ASPM support
From: Bjorn Helgaas @ 2018-06-05 17:28 UTC (permalink / raw)
To: Ryankao
Cc: Kai Heng Feng, jrg.otte@gmail.com, David Miller, Hayes Wang,
hkallweit1@gmail.com, romieu@fr.zoreil.com, Linux Netdev List,
Linux Kernel Mailing List, Hau
In-Reply-To: <B01FA80E1422A647BEB597328C544223D30B8F90@RTITMBSV05.realtek.com.tw>
On Tue, Jun 05, 2018 at 06:34:09AM +0000, Ryankao wrote:
> Add realtek folk Hau
>
> -----Original Message-----
> From: Kai Heng Feng [mailto:kai.heng.feng@canonical.com]
> Sent: Tuesday, June 05, 2018 1:02 PM
> To: jrg.otte@gmail.com
> Cc: David Miller <davem@davemloft.net>; Hayes Wang <hayeswang@realtek.com>; hkallweit1@gmail.com; romieu@fr.zoreil.com; Linux Netdev List <netdev@vger.kernel.org>; Linux Kernel Mailing List <linux-kernel@vger.kernel.org>; Ryankao <ryankao@realtek.com>
> Subject: Re: [PATCH] r8169: Reinstate ALDPS and ASPM support
>
> Hi Jörg Otte,
>
> Can you give this patch a try?
>
> Since you are the only one who reported the ALDPS/ASPM regression,
>
> I think this patch should solve the issue you had [1].
>
> Hopefully we don't need to go down the rabbit hole of blacklist/whitelist...
>
> Kai-Heng
>
> [1] https://lkml.org/lkml/2013/1/5/36
I have no idea what ALDPS is. It's not mentioned in the PCIe spec, so
presumably it's some Realtek-specific thing. ASPM is a generic PCIe
thing. Changes to these two things should be in separate patches so
they don't get tangled up.
> > On Jun 5, 2018, at 12:58 PM, Kai-Heng Feng
> > <kai.heng.feng@canonical.com>
> > wrote:
> >
> > This patch reinstates ALDPS and ASPM support on r8169.
> >
> > On some Intel platforms, ASPM support on r8169 is the key factor in
> > letting the package C-state reach PC8. Without ASPM support, the
> > deepest package C-state that can be reached is PC3. PC8 saves an
> > additional ~3W compared with PC3.
> >
> > This patch is from Realtek.
> >
> > Fixes: e0c075577965 ("r8169: enable ALDPS for power saving")
> > Fixes: d64ec841517a ("r8169: enable internal ASPM and clock request
> > settings")
> > +3507,15 @@ static void rtl8168e_1_hw_phy_config(struct
> > rtl8169_private *tp)
> > rtl_writephy(tp, 0x0d, 0x4007);
> > rtl_writephy(tp, 0x0e, 0x0000);
> > rtl_writephy(tp, 0x0d, 0x0000);
> > +
> > + /* Check ALDPS bit, disable it if enabled */
> > + rtl_writephy(tp, 0x1f, 0x0000);
> > + if (enable_aldps)
> > + rtl_w0w1_phy(tp, 0x15, 0x1000, 0x0000);
> > + else if (rtl_readphy(tp, 0x15) & 0x1000)
> > + rtl_w0w1_phy(tp, 0x15, 0x0000, 0x1000);
There's a lot of repetition of this code with minor variations. You
could probably factor it out and make it more concise and more
readable.
> > +static void rtl8169_check_link_status(struct net_device *dev,
> > + struct rtl8169_private *tp) {
> > + struct device *d = tp_to_dev(tp);
> > +
> > + if (tp->link_ok(tp)) {
> > + rtl_link_chg_patch(tp);
> > + /* This is to cancel a scheduled suspend if there's one. */
> > + if (pm_request_resume(d))
> > + _rtl_reset_work(tp);
> > + netif_carrier_on(dev);
> > + if (net_ratelimit())
> > + netif_info(tp, ifup, dev, "link up\n");
> > + } else {
> > + netif_carrier_off(dev);
> > + netif_info(tp, ifdown, dev, "link down\n");
> > + pm_runtime_idle(d);
> > + }
> > +}
This function apparently just got moved around without changing
anything. That's fine, but the move should be in a separate patch to
make the real changes easier to review.
> > @@ -7649,8 +7757,12 @@ static int rtl_init_one(struct pci_dev *pdev,
> > const struct pci_device_id *ent)
> >
> > /* disable ASPM completely as that cause random device stop working
> > * problems as well as full system hangs for some PCIe devices users */
> > - pci_disable_link_state(pdev, PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1 |
> > - PCIE_LINK_STATE_CLKPM);
> > + if (!enable_aspm) {
> > + pci_disable_link_state(pdev, PCIE_LINK_STATE_L0S |
> > + PCIE_LINK_STATE_L1 |
> > + PCIE_LINK_STATE_CLKPM);
> > + netif_info(tp, probe, dev, "ASPM disabled\n");
> > + }
ASPM is a generic PCIe feature that should be configured by the PCI
core without any help from the device driver.
If code in the driver is needed, that means either the PCI core is
doing it wrong and we should fix it there, or the device is broken and
the driver is working around the erratum.
If this is an erratum, you should include details about exactly what's
broken and (ideally) a URL to the published erratum. Otherwise this
is just unmaintainable black magic and likely to be broken by future
ASPM changes in the PCI core.
ASPM configuration is done by the PCI core before drivers are bound to
the device. If you need device-specific workarounds, they should
probably be in quirks so they're done before the core does that ASPM
configuration.
> > /* enable device (incl. PCI PM wakeup and hotplug setup) */
> > rc = pcim_enable_device(pdev);
> > --
> > 2.17.0
>
> ------Please consider the environment before printing this e-mail.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox