* [PATCH net-next v3 1/2] inet: add bound ports statistic
From: Stephen Hemminger @ 2018-05-07 18:43 UTC (permalink / raw)
To: davem, gerrit, kuznet, yoshfuji
Cc: netdev, dccp, Stephen Hemminger, Stephen Hemminger
In-Reply-To: <20180507184333.32688-1-sthemmin@microsoft.com>
This adds a count of bound ports, which fixes the socket summary
command. `ss -s` has been broken since the changes to slab info,
and this is one way to recover the missing value: by adding a
field to /proc/net/sockstat.
Since this is an informational value only, there is no need
for locking.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
include/net/inet_hashtables.h | 3 +++
include/net/inet_timewait_sock.h | 2 ++
net/dccp/proto.c | 1 +
net/ipv4/inet_hashtables.c | 22 +++++++++++++++++++---
net/ipv4/inet_timewait_sock.c | 8 +++++---
net/ipv4/proc.c | 5 +++--
net/ipv4/tcp.c | 1 +
7 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 9141e95529e7..b02524e2571e 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -103,6 +103,7 @@ static inline struct net *ib_net(struct inet_bind_bucket *ib)
struct inet_bind_hashbucket {
spinlock_t lock;
+ unsigned int count;
struct hlist_head chain;
};
@@ -193,7 +194,9 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
const unsigned short snum);
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
+ struct inet_bind_hashbucket *head,
struct inet_bind_bucket *tb);
+unsigned int inet_bind_bucket_count(const struct proto *prot);
static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
const u32 bhash_size)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index c7be1ca8e562..4cdb8034ad80 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -87,7 +87,9 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
void inet_twsk_free(struct inet_timewait_sock *tw);
void inet_twsk_put(struct inet_timewait_sock *tw);
+struct inet_bind_hashbucket;
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+ struct inet_bind_hashbucket *head,
struct inet_hashinfo *hashinfo);
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 84cd4e3fd01b..25f03e62cfea 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1208,6 +1208,7 @@ static int __init dccp_init(void)
for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
spin_lock_init(&dccp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
+ dccp_hashinfo.bhash[i].count = 0;
}
rc = dccp_mib_init();
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 31ff46daae97..8ba6b17d95d5 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -58,6 +58,18 @@ static u32 sk_ehashfn(const struct sock *sk)
sk->sk_daddr, sk->sk_dport);
}
+/* Count how many entries are in the bind hash table */
+unsigned int inet_bind_bucket_count(const struct proto *prot)
+{
+ const struct inet_hashinfo *hinfo = prot->h.hashinfo;
+ unsigned int i, ports = 0;
+
+ for (i = 0; i < hinfo->bhash_size; i++)
+ ports += hinfo->bhash[i].count;
+
+ return ports;
+}
+
/*
* Allocate and initialize a new local port bind bucket.
* The bindhash mutex for snum's hash chain must be held here.
@@ -76,6 +88,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
tb->fastreuseport = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
+ ++head->count;
}
return tb;
}
@@ -83,10 +96,13 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
/*
* Caller must hold hashbucket lock for this tb with local BH disabled
*/
-void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+void inet_bind_bucket_destroy(struct kmem_cache *cachep,
+ struct inet_bind_hashbucket *head,
+ struct inet_bind_bucket *tb)
{
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
+ --head->count;
kmem_cache_free(cachep, tb);
}
}
@@ -115,7 +131,7 @@ static void __inet_put_port(struct sock *sk)
__sk_del_bind_node(sk);
inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->inet_num = 0;
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, head, tb);
spin_unlock(&head->lock);
}
@@ -756,7 +772,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
inet_ehash_nolisten(sk, (struct sock *)tw);
}
if (tw)
- inet_twsk_bind_unhash(tw, hinfo);
+ inet_twsk_bind_unhash(tw, head, hinfo);
spin_unlock(&head->lock);
if (tw)
inet_twsk_deschedule_put(tw);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 88c5069b5d20..dd888c52f958 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -26,7 +26,8 @@
* Returns 1 if caller should call inet_twsk_put() after lock release.
*/
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
- struct inet_hashinfo *hashinfo)
+ struct inet_bind_hashbucket *head,
+ struct inet_hashinfo *hashinfo)
{
struct inet_bind_bucket *tb = tw->tw_tb;
@@ -35,7 +36,8 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
__hlist_del(&tw->tw_bind_node);
tw->tw_tb = NULL;
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep,
+ head, tb);
__sock_put((struct sock *)tw);
}
@@ -55,7 +57,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
hashinfo->bhash_size)];
spin_lock(&bhead->lock);
- inet_twsk_bind_unhash(tw, hashinfo);
+ inet_twsk_bind_unhash(tw, bhead, hashinfo);
spin_unlock(&bhead->lock);
atomic_dec(&tw->tw_dr->tw_count);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 261b71d0ccc5..83bc9a0f2785 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -60,10 +60,11 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
socket_seq_show(seq);
- seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
+ seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld ports %u\n",
sock_prot_inuse_get(net, &tcp_prot), orphans,
atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
- proto_memory_allocated(&tcp_prot));
+ proto_memory_allocated(&tcp_prot),
+ inet_bind_bucket_count(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot),
proto_memory_allocated(&udp_prot));
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 868ed74a76a8..f62e2fb02fdf 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3836,6 +3836,7 @@ void __init tcp_init(void)
for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
spin_lock_init(&tcp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
+ tcp_hashinfo.bhash[i].count = 0;
}
--
2.17.0
^ permalink raw reply related
* [PATCH net-next v3 2/2] socket: keep track of the number of sockets allocated
From: Stephen Hemminger @ 2018-05-07 18:43 UTC (permalink / raw)
To: davem, gerrit, kuznet, yoshfuji
Cc: netdev, dccp, Stephen Hemminger, Stephen Hemminger
In-Reply-To: <20180507184333.32688-1-sthemmin@microsoft.com>
Add a per-cpu counter to keep track of the number of inodes allocated
to sockets to fix incorrect statistics from ss command.
The ss command tries to keep track of the number of sockets
allocated, but it was doing so via the slabinfo statistics, which are
wrong (due to merging) and not available when using slub.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
net/socket.c | 21 +++++++++++++++++++--
1 file changed, 19 insertions(+), 2 deletions(-)
diff --git a/net/socket.c b/net/socket.c
index f10f1d947c78..89ec7f41559d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -234,6 +234,18 @@ static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
}
static struct kmem_cache *sock_inode_cachep __ro_after_init;
+static unsigned int __percpu *sock_pcpu_allocated;
+
+static unsigned int sock_allocated(void)
+{
+ unsigned int res = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ res += *per_cpu_ptr(sock_pcpu_allocated, cpu);
+
+ return res;
+}
static struct inode *sock_alloc_inode(struct super_block *sb)
{
@@ -248,6 +260,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
kmem_cache_free(sock_inode_cachep, ei);
return NULL;
}
+ this_cpu_inc(*sock_pcpu_allocated);
init_waitqueue_head(&wq->wait);
wq->fasync_list = NULL;
wq->flags = 0;
@@ -270,6 +283,7 @@ static void sock_destroy_inode(struct inode *inode)
ei = container_of(inode, struct socket_alloc, vfs_inode);
wq = rcu_dereference_protected(ei->socket.wq, 1);
kfree_rcu(wq, rcu);
+ this_cpu_dec(*sock_pcpu_allocated);
kmem_cache_free(sock_inode_cachep, ei);
}
@@ -290,6 +304,8 @@ static void init_inodecache(void)
SLAB_MEM_SPREAD | SLAB_ACCOUNT),
init_once);
BUG_ON(sock_inode_cachep == NULL);
+ sock_pcpu_allocated = alloc_percpu(unsigned int);
+ BUG_ON(sock_pcpu_allocated == NULL);
}
static const struct super_operations sockfs_ops = {
@@ -2738,8 +2754,9 @@ core_initcall(sock_init); /* early initcall */
#ifdef CONFIG_PROC_FS
void socket_seq_show(struct seq_file *seq)
{
- seq_printf(seq, "sockets: used %d\n",
- sock_inuse_get(seq->private));
+ seq_printf(seq, "sockets: used %d allocated %u\n",
+ sock_inuse_get(seq->private),
+ sock_allocated());
}
#endif /* CONFIG_PROC_FS */
--
2.17.0
^ permalink raw reply related
* Re: [net-next PATCH v3 5/6] udp: Add support for software checksum and GSO_PARTIAL with GSO offload
From: Willem de Bruijn @ 2018-05-07 18:43 UTC (permalink / raw)
To: Alexander Duyck; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <20180507180846.3486.35452.stgit@localhost.localdomain>
On Mon, May 7, 2018 at 2:08 PM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
>
> This patch adds support for a software provided checksum and GSO_PARTIAL
> segmentation support. With this we can offload UDP segmentation on devices
> that only have partial support for tunnels.
>
> Since we are no longer needing the hardware checksum we can drop the checks
> in the segmentation code that were verifying if it was present.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Willem de Bruijn <willemb@google.com>
^ permalink raw reply
* Re: [PATCH 01/18] docs: can.rst: fix a footnote reference
From: Oliver Hartkopp @ 2018-05-07 18:41 UTC (permalink / raw)
To: Mauro Carvalho Chehab, Linux Doc Mailing List, Robert Schwebel
Cc: Mauro Carvalho Chehab, linux-kernel, Jonathan Corbet,
Marc Kleine-Budde, David S. Miller, linux-can, netdev
In-Reply-To: <2a04ab24f302a0802572c4c80c4d416b953f213d.1525684985.git.mchehab+samsung@kernel.org>
+ Robert Schwebel (who thankfully did the txt -> rst conversion for can.txt)
On 05/07/2018 11:35 AM, Mauro Carvalho Chehab wrote:
> As stated at:
> http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#footnotes
>
> A footnote should contain either a number, a reference or
> an auto number, e. g.:
> [1], [#f1] or [#].
>
> While using [*] accidentally works for html, it fails for other
> document outputs. In particular, it causes an error with LaTeX
> output, causing all books after networking to not be built.
>
> So, replace it by a valid syntax.
>
> Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Acked-by: Oliver Hartkopp <socketcan@hartkopp.net>
Thanks Mauro!
Best regards,
Oliver
> ---
> Documentation/networking/can.rst | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst
> index d23c51abf8c6..2fd0b51a8c52 100644
> --- a/Documentation/networking/can.rst
> +++ b/Documentation/networking/can.rst
> @@ -164,7 +164,7 @@ The Linux network devices (by default) just can handle the
> transmission and reception of media dependent frames. Due to the
> arbitration on the CAN bus the transmission of a low prio CAN-ID
> may be delayed by the reception of a high prio CAN frame. To
> -reflect the correct [*]_ traffic on the node the loopback of the sent
> +reflect the correct [#f1]_ traffic on the node the loopback of the sent
> data has to be performed right after a successful transmission. If
> the CAN network interface is not capable of performing the loopback for
> some reason the SocketCAN core can do this task as a fallback solution.
> @@ -175,7 +175,7 @@ networking behaviour for CAN applications. Due to some requests from
> the RT-SocketCAN group the loopback optionally may be disabled for each
> separate socket. See sockopts from the CAN RAW sockets in :ref:`socketcan-raw-sockets`.
>
> -.. [*] you really like to have this when you're running analyser
> +.. [#f1] you really like to have this when you're running analyser
> tools like 'candump' or 'cansniffer' on the (same) node.
>
>
>
^ permalink raw reply
* Re: [net-next PATCH v3 3/6] udp: Do not pass checksum as a parameter to GSO segmentation
From: Willem de Bruijn @ 2018-05-07 18:49 UTC (permalink / raw)
To: Alexander Duyck; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <20180507180834.3486.87816.stgit@localhost.localdomain>
On Mon, May 7, 2018 at 2:08 PM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
>
> This patch is meant to allow us to avoid having to recompute the checksum
> from scratch and have it passed as a parameter.
>
> Instead of taking that approach we can take advantage of the fact that the
> length that was used to compute the existing checksum is included in the
> UDP header.
>
> Finally to avoid the need to invert the result we can just call csum16_add
> and csum16_sub directly. By doing this we can avoid a number of
> instructions in the loop that is handling segmentation.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Small aside: instead of open-coding the csum16 operations,
it might make sense to define a pseudo_csum_replace2 function
and convert csum_replace2 to call that and only do the inversion.
^ permalink raw reply
* Re: [PATCH v2 net-next 2/4] net: add skeleton of bpfilter kernel module
From: Luis R. Rodriguez @ 2018-05-07 18:51 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: davem, daniel, torvalds, gregkh, luto, netdev, linux-kernel,
kernel-team, Juergen Gross, Eric Paris, Matthew Auld,
Josh Triplett, Kirill A. Shutemov, Joonas Lahtinen, Chris Wilson,
Stephen Smalley, Eric W. Biederman, Mimi Zohar, David Howells,
Kees Cook, Andrew Morton, Dominik Brodowski
In-Reply-To: <20180503043604.1604587-3-ast@kernel.org>
On Wed, May 02, 2018 at 09:36:02PM -0700, Alexei Starovoitov wrote:
> bpfilter.ko consists of bpfilter_kern.c (normal kernel module code)
> and user mode helper code that is embedded into bpfilter.ko
>
> The steps to build bpfilter.ko are the following:
> - main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
> - with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file
> is converted into bpfilter_umh.o object file
> with _binary_net_bpfilter_bpfilter_umh_start and _end symbols
> Example:
> $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
> 0000000000004cf8 T _binary_net_bpfilter_bpfilter_umh_end
> 0000000000004cf8 A _binary_net_bpfilter_bpfilter_umh_size
> 0000000000000000 T _binary_net_bpfilter_bpfilter_umh_start
> - bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko
>
> bpfilter_kern.c is a normal kernel module code that calls
> the fork_usermode_blob() helper to execute part of its own data
> as a user mode process.
>
> Notice that _binary_net_bpfilter_bpfilter_umh_start - end
> is placed into .init.rodata section, so it's freed as soon as __init
> function of bpfilter.ko is finished.
> As part of __init the bpfilter.ko does first request/reply action
> via two unix pipe provided by fork_usermode_blob() helper to
> make sure that umh is healthy. If not it will kill it via pid.
It does this very fast, right away. On a really slow system how are you sure
that this won't race and the execution of the check happens early on prior to
letting the actual setup trigger? After all, we're calling the userspace
process in async mode. We could preempt it now.
> Later bpfilter_process_sockopt() will be called from bpfilter hooks
> in get/setsockopt() to pass iptable commands into umh via bpfilter.ko
>
> If admin does 'rmmod bpfilter' the __exit code bpfilter.ko will
> kill umh as well.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
> include/linux/bpfilter.h | 15 +++++++
> include/uapi/linux/bpfilter.h | 21 ++++++++++
> net/Kconfig | 2 +
> net/Makefile | 1 +
> net/bpfilter/Kconfig | 17 ++++++++
> net/bpfilter/Makefile | 24 +++++++++++
> net/bpfilter/bpfilter_kern.c | 93 +++++++++++++++++++++++++++++++++++++++++++
> net/bpfilter/main.c | 63 +++++++++++++++++++++++++++++
> net/bpfilter/msgfmt.h | 17 ++++++++
> net/ipv4/Makefile | 2 +
> net/ipv4/bpfilter/Makefile | 2 +
> net/ipv4/bpfilter/sockopt.c | 42 +++++++++++++++++++
> net/ipv4/ip_sockglue.c | 17 ++++++++
> 13 files changed, 316 insertions(+)
> create mode 100644 include/linux/bpfilter.h
> create mode 100644 include/uapi/linux/bpfilter.h
> create mode 100644 net/bpfilter/Kconfig
> create mode 100644 net/bpfilter/Makefile
> create mode 100644 net/bpfilter/bpfilter_kern.c
> create mode 100644 net/bpfilter/main.c
> create mode 100644 net/bpfilter/msgfmt.h
> create mode 100644 net/ipv4/bpfilter/Makefile
> create mode 100644 net/ipv4/bpfilter/sockopt.c
>
> diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
> new file mode 100644
> index 000000000000..687b1760bb9f
> --- /dev/null
> +++ b/include/linux/bpfilter.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_BPFILTER_H
> +#define _LINUX_BPFILTER_H
> +
> +#include <uapi/linux/bpfilter.h>
> +
> +struct sock;
> +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval,
> + unsigned int optlen);
> +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval,
> + int *optlen);
> +extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
> + char __user *optval,
> + unsigned int optlen, bool is_set);
> +#endif
> diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
> new file mode 100644
> index 000000000000..2ec3cc99ea4c
> --- /dev/null
> +++ b/include/uapi/linux/bpfilter.h
> @@ -0,0 +1,21 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _UAPI_LINUX_BPFILTER_H
> +#define _UAPI_LINUX_BPFILTER_H
> +
> +#include <linux/if.h>
> +
> +enum {
> + BPFILTER_IPT_SO_SET_REPLACE = 64,
> + BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
> + BPFILTER_IPT_SET_MAX,
> +};
> +
> +enum {
> + BPFILTER_IPT_SO_GET_INFO = 64,
> + BPFILTER_IPT_SO_GET_ENTRIES = 65,
> + BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
> + BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
> + BPFILTER_IPT_GET_MAX,
> +};
> +
> +#endif /* _UAPI_LINUX_BPFILTER_H */
> diff --git a/net/Kconfig b/net/Kconfig
> index b62089fb1332..ed6368b306fa 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -201,6 +201,8 @@ source "net/bridge/netfilter/Kconfig"
>
> endif
>
> +source "net/bpfilter/Kconfig"
> +
> source "net/dccp/Kconfig"
> source "net/sctp/Kconfig"
> source "net/rds/Kconfig"
> diff --git a/net/Makefile b/net/Makefile
> index a6147c61b174..7f982b7682bd 100644
> --- a/net/Makefile
> +++ b/net/Makefile
> @@ -20,6 +20,7 @@ obj-$(CONFIG_TLS) += tls/
> obj-$(CONFIG_XFRM) += xfrm/
> obj-$(CONFIG_UNIX) += unix/
> obj-$(CONFIG_NET) += ipv6/
> +obj-$(CONFIG_BPFILTER) += bpfilter/
> obj-$(CONFIG_PACKET) += packet/
> obj-$(CONFIG_NET_KEY) += key/
> obj-$(CONFIG_BRIDGE) += bridge/
> diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
> new file mode 100644
> index 000000000000..782a732b9a5c
> --- /dev/null
> +++ b/net/bpfilter/Kconfig
> @@ -0,0 +1,17 @@
> +menuconfig BPFILTER
> + bool "BPF based packet filtering framework (BPFILTER)"
> + default n
> + depends on NET && BPF
> + help
> + This builds experimental bpfilter framework that is aiming to
> + provide netfilter compatible functionality via BPF
> +
> +if BPFILTER
> +config BPFILTER_UMH
> + tristate "bpftiler kernel module with user mode helper"
> + default m
> + depends on m
> + help
> + This builds bpfilter kernel module with embedded user mode helper
> +endif
> +
> diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
> new file mode 100644
> index 000000000000..897eedae523e
> --- /dev/null
> +++ b/net/bpfilter/Makefile
> @@ -0,0 +1,24 @@
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# Makefile for the Linux BPFILTER layer.
> +#
> +
> +hostprogs-y := bpfilter_umh
> +bpfilter_umh-objs := main.o
> +HOSTCFLAGS += -I. -Itools/include/
> +
> +# a bit of elf magic to convert bpfilter_umh binary into a binary blob
> +# inside bpfilter_umh.o elf file referenced by
> +# _binary_net_bpfilter_bpfilter_umh_start symbol
> +# which bpfilter_kern.c passes further into umh blob loader at run-time
> +quiet_cmd_copy_umh = GEN $@
> + cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
> + $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
> + -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
> + --rename-section .data=.init.rodata $< $@
Cool, but so our expectation is that the compiler sets this symbol, how
are we sure it will always be set?
> +
> +$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
> + $(call cmd,copy_umh)
> +
> +obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
> +bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
> diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
> new file mode 100644
> index 000000000000..e0a6fdd5842b
> --- /dev/null
> +++ b/net/bpfilter/bpfilter_kern.c
> @@ -0,0 +1,93 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/umh.h>
> +#include <linux/bpfilter.h>
> +#include <linux/sched.h>
> +#include <linux/sched/signal.h>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include "msgfmt.h"
> +
> +#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
> +#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
> +
> +extern char UMH_start;
> +extern char UMH_end;
> +
> +static struct umh_info info;
> +
> +static void shutdown_umh(struct umh_info *info)
> +{
> + struct task_struct *tsk;
> +
> + tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
> + if (tsk)
> + force_sig(SIGKILL, tsk);
> + fput(info->pipe_to_umh);
> + fput(info->pipe_from_umh);
> +}
> +
> +static void stop_umh(void)
> +{
> + if (bpfilter_process_sockopt) {
> + bpfilter_process_sockopt = NULL;
> + shutdown_umh(&info);
> + }
> +}
> +
> +static int __bpfilter_process_sockopt(struct sock *sk, int optname,
> + char __user *optval,
> + unsigned int optlen, bool is_set)
> +{
> + struct mbox_request req;
> + struct mbox_reply reply;
> + loff_t pos;
> + ssize_t n;
> +
> + req.is_set = is_set;
> + req.pid = current->pid;
> + req.cmd = optname;
> + req.addr = (long)optval;
> + req.len = optlen;
> + n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
> + if (n != sizeof(req)) {
> + pr_err("write fail %zd\n", n);
> + stop_umh();
> + return -EFAULT;
> + }
> + pos = 0;
> + n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
> + if (n != sizeof(reply)) {
> + pr_err("read fail %zd\n", n);
> + stop_umh();
> + return -EFAULT;
> + }
> + return reply.status;
> +}
> +
> +static int __init load_umh(void)
> +{
> + int err;
> +
> + err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
> + if (err)
> + return err;
> + pr_info("Loaded umh pid %d\n", info.pid);
> + bpfilter_process_sockopt = &__bpfilter_process_sockopt;
> +
> + if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {
See, here, what if the userspace process gets preempted and we run this
check afterwards? Is that possible?
Luis
> + stop_umh();
> + return -EFAULT;
> + }
> + return 0;
> +}
> +
> +static void __exit fini_umh(void)
> +{
> + stop_umh();
> +}
> +module_init(load_umh);
> +module_exit(fini_umh);
> +MODULE_LICENSE("GPL");
> diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
> new file mode 100644
> index 000000000000..81bbc1684896
> --- /dev/null
> +++ b/net/bpfilter/main.c
> @@ -0,0 +1,63 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define _GNU_SOURCE
> +#include <sys/uio.h>
> +#include <errno.h>
> +#include <stdio.h>
> +#include <sys/socket.h>
> +#include <fcntl.h>
> +#include <unistd.h>
> +#include "include/uapi/linux/bpf.h"
> +#include <asm/unistd.h>
> +#include "msgfmt.h"
> +
> +int debug_fd;
> +
> +static int handle_get_cmd(struct mbox_request *cmd)
> +{
> + switch (cmd->cmd) {
> + case 0:
> + return 0;
> + default:
> + break;
> + }
> + return -ENOPROTOOPT;
> +}
> +
> +static int handle_set_cmd(struct mbox_request *cmd)
> +{
> + return -ENOPROTOOPT;
> +}
> +
> +static void loop(void)
> +{
> + while (1) {
> + struct mbox_request req;
> + struct mbox_reply reply;
> + int n;
> +
> + n = read(0, &req, sizeof(req));
> + if (n != sizeof(req)) {
> + dprintf(debug_fd, "invalid request %d\n", n);
> + return;
> + }
> +
> + reply.status = req.is_set ?
> + handle_set_cmd(&req) :
> + handle_get_cmd(&req);
> +
> + n = write(1, &reply, sizeof(reply));
> + if (n != sizeof(reply)) {
> + dprintf(debug_fd, "reply failed %d\n", n);
> + return;
> + }
> + }
> +}
> +
> +int main(void)
> +{
> + debug_fd = open("/dev/console", 00000002 | 00000100);
> + dprintf(debug_fd, "Started bpfilter\n");
> + loop();
> + close(debug_fd);
> + return 0;
> +}
> diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
> new file mode 100644
> index 000000000000..94b9ac9e5114
> --- /dev/null
> +++ b/net/bpfilter/msgfmt.h
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _NET_BPFILTER_MSGFMT_H
> +#define _NET_BPFTILER_MSGFMT_H
> +
> +struct mbox_request {
> + __u64 addr;
> + __u32 len;
> + __u32 is_set;
> + __u32 cmd;
> + __u32 pid;
> +};
> +
> +struct mbox_reply {
> + __u32 status;
> +};
> +
> +#endif
> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
> index b379520f9133..7018f91c5a39 100644
> --- a/net/ipv4/Makefile
> +++ b/net/ipv4/Makefile
> @@ -16,6 +16,8 @@ obj-y := route.o inetpeer.o protocol.o \
> inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
> metrics.o
>
> +obj-$(CONFIG_BPFILTER) += bpfilter/
> +
> obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
> obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
> obj-$(CONFIG_PROC_FS) += proc.o
> diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
> new file mode 100644
> index 000000000000..ce262d76cc48
> --- /dev/null
> +++ b/net/ipv4/bpfilter/Makefile
> @@ -0,0 +1,2 @@
> +obj-$(CONFIG_BPFILTER) += sockopt.o
> +
> diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
> new file mode 100644
> index 000000000000..42a96d2d8d05
> --- /dev/null
> +++ b/net/ipv4/bpfilter/sockopt.c
> @@ -0,0 +1,42 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/uaccess.h>
> +#include <linux/bpfilter.h>
> +#include <uapi/linux/bpf.h>
> +#include <linux/wait.h>
> +#include <linux/kmod.h>
> +
> +int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
> + char __user *optval,
> + unsigned int optlen, bool is_set);
> +EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
> +
> +int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval,
> + unsigned int optlen, bool is_set)
> +{
> + if (!bpfilter_process_sockopt) {
> + int err = request_module("bpfilter");
> +
> + if (err)
> + return err;
> + if (!bpfilter_process_sockopt)
> + return -ECHILD;
> + }
> + return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
> +}
> +
> +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
> + unsigned int optlen)
> +{
> + return bpfilter_mbox_request(sk, optname, optval, optlen, true);
> +}
> +
> +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
> + int __user *optlen)
> +{
> + int len;
> +
> + if (get_user(len, optlen))
> + return -EFAULT;
> +
> + return bpfilter_mbox_request(sk, optname, optval, len, false);
> +}
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 5ad2d8ed3a3f..e0791faacb24 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -47,6 +47,8 @@
> #include <linux/errqueue.h>
> #include <linux/uaccess.h>
>
> +#include <linux/bpfilter.h>
> +
> /*
> * SOL_IP control messages.
> */
> @@ -1244,6 +1246,11 @@ int ip_setsockopt(struct sock *sk, int level,
> return -ENOPROTOOPT;
>
> err = do_ip_setsockopt(sk, level, optname, optval, optlen);
> +#ifdef CONFIG_BPFILTER
> + if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
> + optname < BPFILTER_IPT_SET_MAX)
> + err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
> +#endif
> #ifdef CONFIG_NETFILTER
> /* we need to exclude all possible ENOPROTOOPTs except default case */
> if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
> @@ -1552,6 +1559,11 @@ int ip_getsockopt(struct sock *sk, int level,
> int err;
>
> err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
> +#ifdef CONFIG_BPFILTER
> + if (optname >= BPFILTER_IPT_SO_GET_INFO &&
> + optname < BPFILTER_IPT_GET_MAX)
> + err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
> +#endif
> #ifdef CONFIG_NETFILTER
> /* we need to exclude all possible ENOPROTOOPTs except default case */
> if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
> @@ -1584,6 +1596,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
> err = do_ip_getsockopt(sk, level, optname, optval, optlen,
> MSG_CMSG_COMPAT);
>
> +#ifdef CONFIG_BPFILTER
> + if (optname >= BPFILTER_IPT_SO_GET_INFO &&
> + optname < BPFILTER_IPT_GET_MAX)
> + err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
> +#endif
> #ifdef CONFIG_NETFILTER
> /* we need to exclude all possible ENOPROTOOPTs except default case */
> if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
> --
> 2.9.5
--
Do not panic
^ permalink raw reply
* Re: [net-next PATCH v3 4/6] udp: Partially unroll handling of first segment and last segment
From: Willem de Bruijn @ 2018-05-07 18:57 UTC (permalink / raw)
To: Alexander Duyck; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <20180507180840.3486.67728.stgit@localhost.localdomain>
On Mon, May 7, 2018 at 2:08 PM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
>
> This patch allows us to take care of unrolling the first segment and the
> last segment of the loop for processing the segmented skb. Part of the
> motivation for this is that it makes it easier to process the fact that the
> first frame and all of the frames in between should be mostly identical
> in terms of header data, and the last frame has differences in the length
> and partial checksum.
>
> In addition I am dropping the header length calculation since we don't
> really need it for anything but the last frame and it can be easily
> obtained by just pulling the data_len and offset of tail from the transport
> header.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
I'm not a fan of the more complicated control flow, as I pointed out
before. It only seems to save one assignment to uh from segs.
Both follow-up patches are now more complex, because they need
to add the same code in two locations.
^ permalink raw reply
* Re: [PATCH v2] net: dsa: drop some VLAs in switch.c
From: Salvatore Mesoraca @ 2018-05-07 19:02 UTC (permalink / raw)
To: Florian Fainelli
Cc: Andrew Lunn, linux-kernel, Kernel Hardening, netdev,
David S. Miller, Kees Cook, Vivien Didelot, David Laight
In-Reply-To: <d7fa9cf7-7c7e-b01c-8925-ce6dafc8721c@gmail.com>
2018-05-07 20:14 GMT+02:00 Florian Fainelli <f.fainelli@gmail.com>:
> On 05/07/2018 08:23 AM, Salvatore Mesoraca wrote:
>> We avoid 2 VLAs by using a pre-allocated field in dsa_switch.
>> We also try to avoid dynamic allocation whenever possible.
>>
>> Link: http://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com
>> Link: http://lkml.kernel.org/r/20180505185145.GB32630@lunn.ch
>>
>> Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
>> ---
>> include/net/dsa.h | 3 +++
>> net/dsa/dsa2.c | 14 ++++++++++++++
>> net/dsa/switch.c | 22 ++++++++++------------
>> 3 files changed, 27 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/net/dsa.h b/include/net/dsa.h
>> index 60fb4ec..576791d 100644
>> --- a/include/net/dsa.h
>> +++ b/include/net/dsa.h
>> @@ -256,6 +256,9 @@ struct dsa_switch {
>> /* Number of switch port queues */
>> unsigned int num_tx_queues;
>>
>> + unsigned long *bitmap;
>> + unsigned long _bitmap;
>> +
>> /* Dynamically allocated ports, keep last */
>> size_t num_ports;
>> struct dsa_port ports[];
>> diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
>> index adf50fb..cebf35f0 100644
>> --- a/net/dsa/dsa2.c
>> +++ b/net/dsa/dsa2.c
>> @@ -748,6 +748,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
>> if (!ds)
>> return NULL;
>>
>> + /* We avoid allocating memory outside dsa_switch
>> + * if it is not needed.
>> + */
>> + if (n <= sizeof(ds->_bitmap) * 8) {
>> + ds->bitmap = &ds->_bitmap;
>
> Should not this be / BITS_PER_BYTE? If the sizeof(unsigned long) is <=
> 8, then you don't need to allocate it, otherwise, you have to.
No.
We need 1 bit per port; of course sizeof() returns the size in bytes,
hence the multiplication to get the number of bits.
I might multiply by BITS_PER_BYTE instead of 8, but I doubt that
Linux supports implementations where a byte is not an octet.
> I would actually just always dynamically allocate the bitmap, optimizing
> for the case where we have fewer than or 8 ports is not worth IMHO.
This optimization will save us an allocation when the number of ports is
at most 32 or 64 (depending on arch).
IMHO it's useful, if you consider that, right now, DSA works only with
12-port switches.
Thank you for your time,
Salvatore
^ permalink raw reply
* [PATCH net] r8169: fix powering up RTL8168h
From: Heiner Kallweit @ 2018-05-07 19:11 UTC (permalink / raw)
To: David Miller, Realtek linux nic maintainers
Cc: Slava Kardakov, netdev@vger.kernel.org
Since commit a92a08499b1f "r8169: improve runtime pm in general and
suspend unused ports" interfaces w/o link are runtime-suspended after
10s. On systems where drivers take longer to load this can lead to the
situation that the interface is runtime-suspended already when it's
initially brought up.
This shouldn't be a problem because rtl_open() resumes MAC/PHY.
However with at least one chip version the interface doesn't properly
come up, as reported here:
https://bugzilla.kernel.org/show_bug.cgi?id=199549
The vendor driver uses a delay to give certain chip versions some
time to resume before starting the PHY configuration. So let's do
the same. I don't know which chip versions may be affected,
therefore apply this delay always.
This patch was reported to fix the issue for RTL8168h.
I was able to reproduce the issue on an Asus H310I-Plus which also
uses a RTL8168h. Also in my case the patch fixed the issue.
Reported-by: Slava Kardakov <ojab@ojab.ru>
Tested-by: Slava Kardakov <ojab@ojab.ru>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
This patch will not apply to net-next as it conflicts with other
changes which have been done in the meantime. So I'll send a
separate patch for net-next.
---
drivers/net/ethernet/realtek/r8169.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 604ae783..c7aac1fc 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4981,6 +4981,9 @@ static void rtl_pll_power_down(struct rtl8169_private *tp)
static void rtl_pll_power_up(struct rtl8169_private *tp)
{
rtl_generic_op(tp, tp->pll_power_ops.up);
+
+ /* give MAC/PHY some time to resume */
+ msleep(20);
}
static void rtl_init_pll_power_ops(struct rtl8169_private *tp)
--
2.17.0
^ permalink raw reply related
* [PATCH net-next] r8169: fix powering up RTL8168h
From: Heiner Kallweit @ 2018-05-07 19:13 UTC (permalink / raw)
To: David Miller, Realtek linux nic maintainers
Cc: Slava Kardakov, netdev@vger.kernel.org
Since commit a92a08499b1f "r8169: improve runtime pm in general and
suspend unused ports" interfaces w/o link are runtime-suspended after
10s. On systems where drivers take longer to load this can lead to the
situation that the interface is runtime-suspended already when it's
initially brought up.
This shouldn't be a problem because rtl_open() resumes MAC/PHY.
However with at least one chip version the interface doesn't properly
come up, as reported here:
https://bugzilla.kernel.org/show_bug.cgi?id=199549
The vendor driver uses a delay to give certain chip versions some
time to resume before starting the PHY configuration. So let's do
the same. I don't know which chip versions may be affected,
therefore apply this delay always.
This patch was reported to fix the issue for RTL8168h.
I was able to reproduce the issue on an Asus H310I-Plus which also
uses a RTL8168h. Also in my case the patch fixed the issue.
Reported-by: Slava Kardakov <ojab@ojab.ru>
Tested-by: Slava Kardakov <ojab@ojab.ru>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
drivers/net/ethernet/realtek/r8169.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 6d99b141..a60207a5 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4662,6 +4662,9 @@ static void r8168_phy_power_up(struct rtl8169_private *tp)
break;
}
rtl_writephy(tp, MII_BMCR, BMCR_ANENABLE);
+
+ /* give PHY some time to resume */
+ msleep(20);
}
static void r8168_phy_power_down(struct rtl8169_private *tp)
--
2.17.0
^ permalink raw reply related
* Re: [PATCH net] r8169: fix powering up RTL8168h
From: ojab // @ 2018-05-07 19:20 UTC (permalink / raw)
To: Heiner Kallweit
Cc: David Miller, Realtek linux nic maintainers,
netdev@vger.kernel.org
In-Reply-To: <ff344a31-4cb8-1e49-7b5e-3a729125444b@gmail.com>
On Mon, May 7, 2018 at 7:11 PM, Heiner Kallweit <hkallweit1@gmail.com> wrote:
> Since commit a92a08499b1f "r8169: improve runtime pm in general and
> suspend unused ports" interfaces w/o link are runtime-suspended after
> 10s. On systems where drivers take longer to load this can lead to the
> situation that the interface is runtime-suspended already when it's
> initially brought up.
> This shouldn't be a problem because rtl_open() resumes MAC/PHY.
> However with at least one chip version the interface doesn't properly
> come up, as reported here:
> https://bugzilla.kernel.org/show_bug.cgi?id=199549
>
> The vendor driver uses a delay to give certain chip versions some
> time to resume before starting the PHY configuration. So let's do
> the same. I don't know which chip versions may be affected,
> therefore apply this delay always.
>
> This patch was reported to fix the issue for RTL8168h.
> I was able to reproduce the issue on an Asus H310I-Plus which also
> uses a RTL8168h. Also in my case the patch fixed the issue.
>
> Reported-by: Slava Kardakov <ojab@ojab.ru>
> Tested-by: Slava Kardakov <ojab@ojab.ru>
> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Maybe also
Cc: stable@vger.kernel.org # v4.16+
?
> ---
> This patch will not apply to net-next as it conflicts with other
> changes which have been done in the meantime. So I'll send a
> separate patch for net-next.
> ---
> drivers/net/ethernet/realtek/r8169.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
> index 604ae783..c7aac1fc 100644
> --- a/drivers/net/ethernet/realtek/r8169.c
> +++ b/drivers/net/ethernet/realtek/r8169.c
> @@ -4981,6 +4981,9 @@ static void rtl_pll_power_down(struct rtl8169_private *tp)
> static void rtl_pll_power_up(struct rtl8169_private *tp)
> {
> rtl_generic_op(tp, tp->pll_power_ops.up);
> +
> + /* give MAC/PHY some time to resume */
> + msleep(20);
> }
>
> static void rtl_init_pll_power_ops(struct rtl8169_private *tp)
> --
> 2.17.0
>
^ permalink raw reply
* Re: [PATCH v2] net: dsa: drop some VLAs in switch.c
From: Andrew Lunn @ 2018-05-07 19:26 UTC (permalink / raw)
To: Salvatore Mesoraca
Cc: Florian Fainelli, linux-kernel, Kernel Hardening, netdev,
David S. Miller, Kees Cook, Vivien Didelot, David Laight
In-Reply-To: <CAJHCu1KsvPEs9vpp5bY04OeVfMtqZzPuO=9c8e2QP-+n+VKUjQ@mail.gmail.com>
> >> +++ b/include/net/dsa.h
> >> @@ -256,6 +256,9 @@ struct dsa_switch {
> >> /* Number of switch port queues */
> >> unsigned int num_tx_queues;
> >>
> >> + unsigned long *bitmap;
> >> + unsigned long _bitmap;
> >> +
> >> /* Dynamically allocated ports, keep last */
> >> size_t num_ports;
> >> struct dsa_port ports[];
> >> diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
> >> index adf50fb..cebf35f0 100644
> >> --- a/net/dsa/dsa2.c
> >> +++ b/net/dsa/dsa2.c
> >> @@ -748,6 +748,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
> >> if (!ds)
> >> return NULL;
> >>
> >> + /* We avoid allocating memory outside dsa_switch
> >> + * if it is not needed.
> >> + */
> >> + if (n <= sizeof(ds->_bitmap) * 8) {
> >> + ds->bitmap = &ds->_bitmap;
> >
> > Should not this be / BITS_PER_BYTE? If the sizeof(unsigned long) is <=
> > 8, then you don't need to allocate it, otherwise, you have to.
> This optimization will save us an allocation when number of ports is
> less than 32 or 64 (depending on arch).
> IMHO it's useful, if you consider that, right now, DSA works only with
> 12-ports switches.
Do you have a feeling for the savings? I don't see it being very
large, and given the extra code, it might actually be negative.
Andrew
^ permalink raw reply
* Re: [PATCH net] r8169: fix powering up RTL8168h
From: Andrew Lunn @ 2018-05-07 19:30 UTC (permalink / raw)
To: ojab //
Cc: Heiner Kallweit, David Miller, Realtek linux nic maintainers,
netdev@vger.kernel.org
In-Reply-To: <CAKzrAgT8ZEx3rr8G1+GpnMxd1y0D8c=at0n12pRTXfSpzxc1ww@mail.gmail.com>
On Mon, May 07, 2018 at 07:20:53PM +0000, ojab // wrote:
> On Mon, May 7, 2018 at 7:11 PM, Heiner Kallweit <hkallweit1@gmail.com> wrote:
> > Since commit a92a08499b1f "r8169: improve runtime pm in general and
> > suspend unused ports" interfaces w/o link are runtime-suspended after
> > 10s. On systems where drivers take longer to load this can lead to the
> > situation that the interface is runtime-suspended already when it's
> > initially brought up.
> > This shouldn't be a problem because rtl_open() resumes MAC/PHY.
> > However with at least one chip version the interface doesn't properly
> > come up, as reported here:
> > https://bugzilla.kernel.org/show_bug.cgi?id=199549
> >
> > The vendor driver uses a delay to give certain chip versions some
> > time to resume before starting the PHY configuration. So let's do
> > the same. I don't know which chip versions may be affected,
> > therefore apply this delay always.
> >
> > This patch was reported to fix the issue for RTL8168h.
> > I was able to reproduce the issue on an Asus H310I-Plus which also
> > uses a RTL8168h. Also in my case the patch fixed the issue.
> >
> > Reported-by: Slava Kardakov <ojab@ojab.ru>
> > Tested-by: Slava Kardakov <ojab@ojab.ru>
> > Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
>
> Maybe also
> Cc: stable@vger.kernel.org # v4.16+
No need. Heiner correctly marked this for net, not net-next, so David
will do what is needed for it to go to stable.
Andrew
^ permalink raw reply
* Re: [PATCH bpf-next v3 00/15] Introducing AF_XDP support
From: Björn Töpel @ 2018-05-07 19:47 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: Magnus Karlsson, Alexei Starovoitov, Daniel Borkmann,
Karlsson, Magnus, Alexander Duyck, Alexander Duyck,
John Fastabend, Alexei Starovoitov, Willem de Bruijn,
Michael S. Tsirkin, Network Development, Björn Töpel,
michael.lundkvist, Brandeburg, Jesse, Singhai, Anjali,
Zhang, Qi Z
In-Reply-To: <20180507150940.2578d6e3@redhat.com>
2018-05-07 15:09 GMT+02:00 Jesper Dangaard Brouer <brouer@redhat.com>:
> On Mon, 7 May 2018 11:13:58 +0200
> Magnus Karlsson <magnus.karlsson@gmail.com> wrote:
>
>> On Sat, May 5, 2018 at 2:34 AM, Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>> > On Fri, May 04, 2018 at 01:22:17PM +0200, Magnus Karlsson wrote:
>> >> On Fri, May 4, 2018 at 1:38 AM, Alexei Starovoitov
>> >> <alexei.starovoitov@gmail.com> wrote:
>> >> > On Fri, May 04, 2018 at 12:49:09AM +0200, Daniel Borkmann wrote:
>> >> >> On 05/02/2018 01:01 PM, Björn Töpel wrote:
>> >> >> > From: Björn Töpel <bjorn.topel@intel.com>
>> >> >> >
>> >> >> > This patch set introduces a new address family called AF_XDP that is
>> >> >> > optimized for high performance packet processing and, in upcoming
>> >> >> > patch sets, zero-copy semantics. In this patch set, we have removed
>> >> >> > all zero-copy related code in order to make it smaller, simpler and
>> >> >> > hopefully more review friendly. This patch set only supports copy-mode
>> >> >> > for the generic XDP path (XDP_SKB) for both RX and TX and copy-mode
>> >> >> > for RX using the XDP_DRV path. Zero-copy support requires XDP and
>> >> >> > driver changes that Jesper Dangaard Brouer is working on. Some of his
>> >> >> > work has already been accepted. We will publish our zero-copy support
>> >> >> > for RX and TX on top of his patch sets at a later point in time.
>> >> >>
>> >> >> +1, would be great to see it land this cycle. Saw few minor nits here
>> >> >> and there but nothing to hold it up, for the series:
>> >> >>
>> >> >> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
>> >> >>
>> >> >> Thanks everyone!
>> >> >
>> >> > Great stuff!
>> >> >
>> >> > Applied to bpf-next, with one condition.
>> >> > Upcoming zero-copy patches for both RX and TX need to be posted
>> >> > and reviewed within this release window.
>> >> > If netdev community as a whole won't be able to agree on the zero-copy
>> >> > bits we'd need to revert this feature before the next merge window.
>> >>
>> >> Thanks everyone for reviewing this. Highly appreciated.
>> >>
>> >> Just so we understand the purpose correctly:
>> >>
>> >> 1: Do you want to see the ZC patches in order to verify that the user
>> >> space API holds? If so, we can produce an additional RFC patch set
>> >> using a big chunk of code that we had in RFC V1. We are not proud of
>> >> this code since it is clunky, but it hopefully proves the point with
>> >> the uapi being the same.
>> >>
>> >> 2: And/Or are you worried about us all (the netdev community) not
>> >> agreeing on a way to implement ZC internally in the drivers and the
>> >> XDP infrastructure? This is not going to be possible to finish during
>> >> this cycle since we do not like the implementation we had in RFC V1.
>> >> Too intrusive and now we also have nicer abstractions from Jesper that
>> >> we can use and extend to provide a (hopefully) much cleaner and less
>> >> intrusive solution.
>> >
>> > short answer: both.
>> >
>> > Cleanliness and performance of the ZC code is not as important as
>> > getting API right. The main concern that during ZC review process
>> > we will find out that existing API has issues, so we have to
>> > do this exercise before the merge window.
>> > And RFC won't fly. Send the patches for real. They have to go
>> > through the proper code review. The hackers of netdev community
>> > can accept a partial, or a bit unclean, or slightly inefficient
>> > implementation, since it can be and will be improved later,
>> > but API we cannot change once it goes into official release.
>> >
>> > Here is the example of API concern:
>> > this patch set added shared umem concept. It sounds good in theory,
>> > but will it perform well with ZC ? Earlier RFCs didn't have that
>> > feature. If it won't perform well then it shouldn't be in the tree.
>> > The key reason to let AF_XDP into the tree is its performance promise.
>> > If it doesn't perform we should rip it out and redesign.
>>
>> That is a fair point. We will try to produce patch sets for zero-copy
>> RX and TX using the latest interfaces within this merge window. Just
>> note that we will focus on this for the next week(s) instead of the
>> review items that you and Daniel Borkmann submitted. If we get those
>> patch sets out in time and we agree that they are a possible way
>> forward, then we produce patches with your fixes. It was mainly small
>> items, so should be quick.
>
> I would like to see that you create a new xdp_mem_type for this new
> zero-copy type. This will allow other XDP redirect methods/types (e.g.
> devmap and cpumap) to react appropriately when receiving a zero-copy
> frame.
>
Yes, that's the plan!
> For devmap, I'm hoping we can allow/support using the ndo_xdp_xmit call
> without (first) copying (into a newly allocated page). By arguing that
> if an xsk-userspace app modifies a frame it's not allowed to, then it is
> simply a bug in the program. (Note, this would also allow using
> ndo_xdp_xmit call for TX from xsk-userspace).
>
Makes sense. I think the ZC rationale for Rx can indeed be extended for
devmap redirects -- i.e. no frame cloning is required.
> For cpumap, it is hard to avoid a copy, but I'm hoping we could delay
> the copy (and alloc of mem dest area) until on the remote CPU. This is
> already the principle of cpumap; of moving the allocation of the SKB to
> the remote CPU.
>
I think for most AF_XDP applications that would like to pass frames to
the kernel, the cpumap would be preferred instead of XDP_PASS (moving
the stack execution to another off-AF_XDP-thread).
> For ZC to interact with XDP redirect-core and return API, the zero-copy
> memory type/allocator needs to provide an area for the xdp_frame data
> to be stored in (as we cannot allow using top-of-frame like
> non-zero-copy variants), and extend xdp_frame with a ZC umem-id.
> I imagine we can avoid any dynamic allocations, as we upfront (at bind
> and XDP_UMEM_REG time) know the number of frames. (e.g. pre-alloc in
> xdp_umem_reg() call, and have xdp_umem_get_xdp_frame lookup func).
>
Yeah, we can allocate a kernel-side-only xdp_frame for each umem frame.
> --
> Best regards,
> Jesper Dangaard Brouer
> MSc.CS, Principal Kernel Engineer at Red Hat
> LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
* Re: [net-next PATCH v3 4/6] udp: Partially unroll handling of first segment and last segment
From: Willem de Bruijn @ 2018-05-07 19:54 UTC (permalink / raw)
To: Alexander Duyck; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <CAF=yD-JD=WCULdj_u0xjj3S9BDcbHhSCkWXCLHMx04U+Jgz55A@mail.gmail.com>
On Mon, May 7, 2018 at 2:57 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Mon, May 7, 2018 at 2:08 PM, Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
>> From: Alexander Duyck <alexander.h.duyck@intel.com>
>>
>> This patch allows us to take care of unrolling the first segment and the
>> last segment of the loop for processing the segmented skb. Part of the
>> motivation for this is that it makes it easier to process the fact that the
>> first frame and all of the frames in between should be mostly identical
>> in terms of header data, and the last frame has differences in the length
>> and partial checksum.
>>
>> In addition I am dropping the header length calculation since we don't
>> really need it for anything but the last frame and it can be easily
>> obtained by just pulling the data_len and offset of tail from the transport
>> header.
>>
>> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
>
> I'm not a fan of the more complicated control flow, as I pointed out
> before. It only seems to save one assignment to uh from segs.
>
> Both follow-up patches are now more complex, because they need
> to add the same code in two locations.
With that said, if you feel strongly, I don't object.
The removal of hdrlen and simplification of arguments is definitely
an improvement.
^ permalink raw reply
* Re: [net-next PATCH v3 4/6] udp: Partially unroll handling of first segment and last segment
From: Alexander Duyck @ 2018-05-07 19:59 UTC (permalink / raw)
To: Willem de Bruijn; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <CAF=yD-KTkmx2jzGAeNVLGWmedCEBx8RQWjJBKZzERGCddEnKWw@mail.gmail.com>
On Mon, May 7, 2018 at 12:54 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Mon, May 7, 2018 at 2:57 PM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Mon, May 7, 2018 at 2:08 PM, Alexander Duyck
>> <alexander.duyck@gmail.com> wrote:
>>> From: Alexander Duyck <alexander.h.duyck@intel.com>
>>>
>>> This patch allows us to take care of unrolling the first segment and the
>>> last segment of the loop for processing the segmented skb. Part of the
>>> motivation for this is that it makes it easier to process the fact that the
>>> first frame and all of the frames in between should be mostly identical
>>> in terms of header data, and the last frame has differences in the length
>>> and partial checksum.
>>>
>>> In addition I am dropping the header length calculation since we don't
>>> really need it for anything but the last frame and it can be easily
>>> obtained by just pulling the data_len and offset of tail from the transport
>>> header.
>>>
>>> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
>>
>> I'm not a fan of the more complicated control flow, as I pointed out
>> before. It only seems to save one assignment to uh from segs.
>>
>> Both follow-up patches are now more complex, because they need
>> to add the same code in two locations.
>
> With that said, if you feel strongly, I don't object.
>
> The removal of hdrlen and simplification of arguments is definitely
> an improvement.
Thanks for being understanding about this.
My preference is to keep the loop unrolled as it is since that way it
is not too different from the way we handle this for TCP so it will
make maintenance of the two easier. Otherwise I have to add a bunch of
conditional checks inside the loop.
The other advantage to unrolling it as I did is that I don't have to
deal with a ton of extra indentation for an if statement inside of a
while loop.
- Alex
^ permalink raw reply
* Re: [PATCH net-next v8 0/3] kernel: add support to collect hardware logs in crash recovery kernel
From: David Miller @ 2018-05-07 20:03 UTC (permalink / raw)
To: rahul.lakkireddy
Cc: netdev, kexec, linux-fsdevel, linux-kernel, viro, ebiederm,
stephen, akpm, torvalds, ganeshgr, nirranjan, indranil
In-Reply-To: <cover.1525253481.git.rahul.lakkireddy@chelsio.com>
From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Date: Wed, 2 May 2018 15:17:16 +0530
> This series of patches add new generic framework that enable device
> drivers to collect device specific snapshot of the hardware/firmware
> state of the underlying device in the crash recovery kernel. In crash
> recovery kernel, the collected logs are added as elf notes to
> /proc/vmcore, which is copied by user space scripts for post-analysis.
Eric B., since you've been giving very useful and active feedback on
this series could you please give it a review?
Thank you.
^ permalink raw reply
* Re: [net-next PATCH v3 0/6] UDP GSO Segmentation clean-ups
From: Alexander Duyck @ 2018-05-07 20:03 UTC (permalink / raw)
To: Netdev, Willem de Bruijn, David Miller
On Mon, May 7, 2018 at 11:08 AM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> This patch set addresses a number of issues I found while sorting out
> enabling UDP GSO Segmentation support for ixgbe/ixgbevf. Specifically there
> were a number of issues related to the checksum and such that seemed to
> cause either minor irregularities or kernel panics in the case of the
> offload request being allowed to traverse between name spaces.
>
> With this set applied I was able to get UDP GSO traffic to pass over
> vxlan tunnels in both offloaded modes and non-offloaded modes for ixgbe and
> ixgbevf.
>
> I submitted the driver specific patches earlier as an RFC:
> https://patchwork.ozlabs.org/project/netdev/list/?series=42477&archive=both&state=*
>
> v2: Updated patches based on feedback from Eric Dumazet
> Split first patch into several patches based on feedback from Eric
> v3: Drop patch that was calling pskb_may_pull as it was redundant.
> Added code to use MANGLED_0 in case of UDP checksum
> Drop patch adding NETIF_F_GSO_UDP_L4 to list of GSO software offloads
> Added Acked-by for patches reviewed by Willem and not changed
Just noticed I forgot to update the subject before sending out the
cover page. I updated it for this reply. If needed I will submit a v4,
but for now I will leave this out here to finish up review.
Thanks.
- Alex
^ permalink raw reply
* Re: [net-next PATCH v3 4/6] udp: Partially unroll handling of first segment and last segment
From: Willem de Bruijn @ 2018-05-07 20:10 UTC (permalink / raw)
To: Alexander Duyck; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <CAKgT0UdBgTnvGBR9Y4KSsKBfebkOiXKJROr8uAMBW2sjFoHadw@mail.gmail.com>
On Mon, May 7, 2018 at 3:59 PM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> On Mon, May 7, 2018 at 12:54 PM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Mon, May 7, 2018 at 2:57 PM, Willem de Bruijn
>> <willemdebruijn.kernel@gmail.com> wrote:
>>> On Mon, May 7, 2018 at 2:08 PM, Alexander Duyck
>>> <alexander.duyck@gmail.com> wrote:
>>>> From: Alexander Duyck <alexander.h.duyck@intel.com>
>>>>
>>>> This patch allows us to take care of unrolling the first segment and the
>>>> last segment of the loop for processing the segmented skb. Part of the
>>>> motivation for this is that it makes it easier to process the fact that the
>>>> first frame and all of the frames in between should be mostly identical
>>>> in terms of header data, and the last frame has differences in the length
>>>> and partial checksum.
>>>>
>>>> In addition I am dropping the header length calculation since we don't
>>>> really need it for anything but the last frame and it can be easily
>>>> obtained by just pulling the data_len and offset of tail from the transport
>>>> header.
>>>>
>>>> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Willem de Bruijn <willemb@google.com>
>>> I'm not a fan of the more complicated control flow, as I pointed out
>>> before. It only seems to save one assignment to uh from segs.
>>>
>>> Both follow-up patches are now more complex, because they need
>>> to add the same code in two locations.
>>
>> With that said, if you feel strongly, I don't object.
>>
>> The removal of hdrlen and simplification of arguments is definitely
>> an improvement.
>
> Thanks for being understanding about this.
>
> My preference is to keep the loop unrolled as it is since that way it
> is not too different from the way we handle this for TCP so it will
> make maintenance of the two easier. Otherwise I have to add a bunch of
> conditional checks inside the loop.
>
> The other advantage to unrolling it as I did is that I don't have to
> deal with a ton of extra indentation for an if statement inside of a
> while loop.
Both good reasons. Thanks a lot for the overall cleanup.
^ permalink raw reply
* Re: [PATCH] openvswitch: fix internal_dev_xmit()'s return type
From: Gregory Rose @ 2018-05-07 20:42 UTC (permalink / raw)
To: Luc Van Oostenryck, linux-kernel-u79uwXL29TY76Z2rM5mHXA
Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
David S. Miller
In-Reply-To: <20180424131747.4711-1-luc.vanoostenryck-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
On 4/24/2018 6:17 AM, Luc Van Oostenryck wrote:
> The method ndo_start_xmit() is defined as returning an 'netdev_tx_t',
> which is a typedef for an enum type, but the implementation in this
> driver returns an 'int'.
>
> Fix this by returning 'netdev_tx_t' in this driver too.
>
> Signed-off-by: Luc Van Oostenryck <luc.vanoostenryck-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> ---
> net/openvswitch/vport-internal_dev.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
> index bb95c43aa..3ea55618e 100644
> --- a/net/openvswitch/vport-internal_dev.c
> +++ b/net/openvswitch/vport-internal_dev.c
> @@ -43,7 +43,7 @@ static struct internal_dev *internal_dev_priv(struct net_device *netdev)
> }
>
> /* Called with rcu_read_lock_bh. */
> -static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
> +static netdev_tx_t internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
> {
> int len, err;
>
LGTM
Reviewed-by: Greg Rose <gvrose8192-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
^ permalink raw reply
* Re: [PATCH] openvswitch: make vport_ops:send()'s return type consistent
From: Gregory Rose @ 2018-05-07 20:44 UTC (permalink / raw)
To: Luc Van Oostenryck, linux-kernel-u79uwXL29TY76Z2rM5mHXA
Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
David S. Miller
In-Reply-To: <20180424131953.6474-1-luc.vanoostenryck-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
On 4/24/2018 6:19 AM, Luc Van Oostenryck wrote:
> The method struct vport_ops:send() is defined as returning an
> 'netdev_tx_t', which is defined as a typedef for a bitwise type
> and otherwise used for the start_xmit() methods.
> However, most openvswitch drivers use for this method dev_queue_xmit()
> which returns an 'int' and the return value of vport_ops:send() is
> in fact never used.
>
> Make things typewise consistent and use 'int' for vport_ops:send()
> as well as for internal_dev_recv() (which is the only proper send method)
> as using 'netdev_tx_t' doesn't offer any advantages and in fact seems,
> if not wrong at least, inadequate.
>
> Signed-off-by: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
> ---
> net/openvswitch/vport-internal_dev.c | 6 +++---
> net/openvswitch/vport.h | 2 +-
> 2 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
> index 3ea55618e..2fd68c2fb 100644
> --- a/net/openvswitch/vport-internal_dev.c
> +++ b/net/openvswitch/vport-internal_dev.c
> @@ -231,7 +231,7 @@ static void internal_dev_destroy(struct vport *vport)
> rtnl_unlock();
> }
>
> -static netdev_tx_t internal_dev_recv(struct sk_buff *skb)
> +static int internal_dev_recv(struct sk_buff *skb)
> {
> struct net_device *netdev = skb->dev;
> struct pcpu_sw_netstats *stats;
> @@ -239,7 +239,7 @@ static netdev_tx_t internal_dev_recv(struct sk_buff *skb)
> if (unlikely(!(netdev->flags & IFF_UP))) {
> kfree_skb(skb);
> netdev->stats.rx_dropped++;
> - return NETDEV_TX_OK;
> + return 0;
> }
>
> skb_dst_drop(skb);
> @@ -257,7 +257,7 @@ static netdev_tx_t internal_dev_recv(struct sk_buff *skb)
> u64_stats_update_end(&stats->syncp);
>
> netif_rx(skb);
> - return NETDEV_TX_OK;
> + return 0;
> }
>
> static struct vport_ops ovs_internal_vport_ops = {
> diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
> index cda66c26a..8dcb48fe8 100644
> --- a/net/openvswitch/vport.h
> +++ b/net/openvswitch/vport.h
> @@ -141,7 +141,7 @@ struct vport_ops {
> int (*set_options)(struct vport *, struct nlattr *);
> int (*get_options)(const struct vport *, struct sk_buff *);
>
> - netdev_tx_t (*send) (struct sk_buff *skb);
> + int (*send) (struct sk_buff *skb);
> struct module *owner;
> struct list_head list;
> };
Yes, it does seem odd to use a tx return type for receive. Nice fixup.
Reviewed-by: Greg Rose <gvrose8192@gmail.com>
_______________________________________________
dev mailing list
dev@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev
^ permalink raw reply
* Re: The SO_BINDTODEVICE was set to the desired interface, but packets are received from all interfaces.
From: David Ahern @ 2018-05-07 21:20 UTC (permalink / raw)
To: Ben Greear, Damir Mansurov, netdev
Cc: Konstantin Ushakov, Alexandra N. Kossovsky, Andrey Dmitrov
In-Reply-To: <5361bef8-bdf9-af3d-12ae-a128b6502d2e@candelatech.com>
On 5/7/18 10:14 AM, Ben Greear wrote:
> On 05/07/2018 03:19 AM, Damir Mansurov wrote:
>>
>> Greetings,
>>
>> After successful call of the setsockopt(SO_BINDTODEVICE) function to
>> set data reception from only one interface, the data is still received
>> from all interfaces.
>> Function setsockopt() returns 0 but then recv() receives data from all
>> available network interfaces.
>>
>> The problem is reproducible on linux kernels 4.14 - 4.16, but it does
>> not on linux kernels 4.4, 4.13.
>>
>> I have written C-code to reproduce this issue (see attached files
>> b2d_send.c and b2d_recv.c). See below explanation of tested
>> configuration.
>
> Hello,
>
> I am not sure if this is your problem or not, but if you are using VRF,
> then you need
> to call SO_BINDTODEVICE before you do the 'normal' bind() call.
>
This is a different problem -- socket lookup is matching when it should not.
^ permalink raw reply
* Re: RTL8723BE performance regression
From: João Paulo Rechi Vita @ 2018-05-07 21:49 UTC (permalink / raw)
To: Pkshih
Cc: Larry.Finger@lwfinger.net, linux-kernel@vger.kernel.org,
jprvita@endlessm.com, Birming Chiu, drake@endlessm.com,
Chaoming_Li, kvalo@codeaurora.org, 莊彥宣,
derosier@gmail.com, Steven Ting, netdev@vger.kernel.org,
linux@endlessm.com, Shaofu, linux-wireless@vger.kernel.org
In-Reply-To: <1525240713.3735.3.camel@realtek.com>
On Tue, May 1, 2018 at 10:58 PM, Pkshih <pkshih@realtek.com> wrote:
> On Wed, 2018-05-02 at 05:44 +0000, Pkshih wrote:
>>
>> > -----Original Message-----
>> > From: João Paulo Rechi Vita [mailto:jprvita@gmail.com]
>> > Sent: Wednesday, May 02, 2018 6:41 AM
>> > To: Larry Finger
>> > Cc: Steve deRosier; 莊彥宣; Pkshih; Birming Chiu; Shaofu; Steven Ting; Chaoming_Li; Kalle Valo;
>> > linux-wireless; Network Development; LKML; Daniel Drake; João Paulo Rechi Vita; linux@endlessm.com
>> > Subject: Re: RTL8723BE performance regression
>> >
>> > On Tue, Apr 3, 2018 at 7:51 PM, Larry Finger <Larry.Finger@lwfinger.net> wrote:
>> > > On 04/03/2018 09:37 PM, João Paulo Rechi Vita wrote:
>> > >>
>> > >> On Tue, Apr 3, 2018 at 7:28 PM, Larry Finger <Larry.Finger@lwfinger.net>
>> > >> wrote:
>> > >>
>> > >> (...)
>> > >>
>> > >>> As the antenna selection code changes affected your first bisection, do
>> > >>> you
>> > >>> have one of those HP laptops with only one antenna and the incorrect
>> > >>> coding
>> > >>> in the FUSE?
>> > >>
>> > >>
>> > >> Yes, that is why I've been passing ant_sel=1 during my tests -- this
>> > >> was needed to achieve a good performance in the past, before this
>> > >> regression. I've also opened the laptop chassis and confirmed the
>> > >> antenna cable is plugged to the connector labeled with "1" on the
>> > >> card.
>> > >>
>> > >>> If so, please make sure that you still have the same signal
>> > >>> strength for good and bad cases. I have tried to keep the driver and the
>> > >>> btcoex code in sync, but there may be some combinations of antenna
>> > >>> configuration and FUSE contents that cause the code to fail.
>> > >>>
>> > >>
>> > >> What is the recommended way to monitor the signal strength?
>> > >
>> > >
>> > > The btcoex code is developed for multiple platforms by a different group
>> > > than the Linux driver. I think they made a change that caused ant_sel to
>> > > switch from 1 to 2. At least numerous comments at
>> > > github.com/lwfinger/rtlwifi_new claimed they needed to make that change.
>> > >
>> > > My recommended method is to verify the wifi device name with "iw dev". Then
>> > > using that device
>> > >
>> > > sudo iw dev <dev_name> scan | egrep "SSID|signal"
>> > >
>> >
>> > I have confirmed that the performance regression is indeed tied to
>> > signal strength: on the good cases signal was between -16 and -8 dBm,
>> > whereas in bad cases signal was always between -50 to - 40 dBm. I've
>> > also switched to testing bandwidth in controlled LAN environment using
>> > iperf3, as suggested by Steve deRosier, with the DUT being the only
>> > machine connected to the 2.4 GHz radio and the machine running the
>> > iperf3 server connected via ethernet.
>> >
>>
>> We have new experimental results in commit af8a41cccf8f46 ("rtlwifi: cleanup
>> 8723be ant_sel definition"). You can use the above commit and do the same
>> experiments (with ant_sel=0, 1 and 2) in your side, and then share your results.
>> Since performance is tied to signal strength, you can only share signal strength.
>>
>
> Please pay attention to cold reboot once ant_sel is changed.
>
I've tested the commit mentioned above and it fixes the problem on top
of v4.16 (in addition to the latest wireless-drivers-next also being
fixed as it already contains such commit). On v4.15, we also need the
following commits before "af8a41cccf8f rtlwifi: cleanup 8723be ant_sel
definition" to have a good performance again:
874e837d67d0 rtlwifi: fill FW version and subversion
a44709bba70f rtlwifi: btcoex: Add power_on_setting routine
40d9dd4f1c5d rtlwifi: btcoex: Remove global variables from btcoex
Surprisingly, it seems forcing ant_sel=1 is not needed anymore on
these machines, as shown by the numbers below (ant_sel=0 means
that actually no parameter was passed to the module). I have powered
off the machine and done a cold boot for every test. It seems
something has changed in the antenna auto-selection code since v4.11,
the latest point where I could confirm we definitely need to force
ant_sel=1. I've been trying to understand what causes this difference,
but haven't made progress on that so far, so any suggestions are
appreciated (we are trying to decide if we can confidently drop the
downstream DMI quirks for these specific machines).
w-d-n ant_sel=0: -14.00 dBm, 69.5 Mbps -> good
w-d-n ant_sel=1: -10.00 dBm, 41.1 Mbps -> good
w-d-n ant_sel=2: -44.00 dBm, 607 kbps -> bad
v4.16 ant_sel=0: -12.00 dBm, 63.0 Mbps -> good
v4.16 ant_sel=1: - 8.00 dBm, 69.0 Mbps -> good
v4.16 ant_sel=2: -50.00 dBm, 224 kbps -> bad
v4.15 ant_sel=0: - 8.00 dBm, 33.0 Mbps -> good
v4.15 ant_sel=1: -10.00 dBm, 38.1 Mbps -> good
v4.15 ant_sel=2: -48.00 dBm, 206 kbps -> bad
--
João Paulo Rechi Vita
http://about.me/jprvita
^ permalink raw reply
* [PATCH net-next v10 1/4] virtio_net: Introduce VIRTIO_NET_F_STANDBY feature bit
From: Sridhar Samudrala @ 2018-05-07 22:10 UTC (permalink / raw)
To: mst, stephen, davem, netdev, virtualization, virtio-dev,
jesse.brandeburg, alexander.h.duyck, kubakici, sridhar.samudrala,
jasowang, loseweigh, jiri, aaron.f.brown
In-Reply-To: <1525731046-10989-1-git-send-email-sridhar.samudrala@intel.com>
This feature bit can be used by hypervisor to indicate virtio_net device to
act as a standby for another device with the same MAC address.
VIRTIO_NET_F_STANDBY is defined as bit 62 as it is a device feature bit.
Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
drivers/net/virtio_net.c | 2 +-
include/uapi/linux/virtio_net.h | 3 +++
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f34794a76c4d..213fddc70fd0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2999,7 +2999,7 @@ static struct virtio_device_id id_table[] = {
VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
VIRTIO_NET_F_CTRL_MAC_ADDR, \
VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
- VIRTIO_NET_F_SPEED_DUPLEX
+ VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY
static unsigned int features[] = {
VIRTNET_FEATURES,
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 5de6ed37695b..a3715a3224c1 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -57,6 +57,9 @@
* Steering */
#define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */
+#define VIRTIO_NET_F_STANDBY 62 /* Act as standby for another device
+ * with the same MAC.
+ */
#define VIRTIO_NET_F_SPEED_DUPLEX 63 /* Device set linkspeed and duplex */
#ifndef VIRTIO_NET_NO_LEGACY
--
2.14.3
^ permalink raw reply related
* [PATCH net-next v10 2/4] net: Introduce generic failover module
From: Sridhar Samudrala @ 2018-05-07 22:10 UTC (permalink / raw)
To: mst, stephen, davem, netdev, virtualization, virtio-dev,
jesse.brandeburg, alexander.h.duyck, kubakici, sridhar.samudrala,
jasowang, loseweigh, jiri, aaron.f.brown
In-Reply-To: <1525731046-10989-1-git-send-email-sridhar.samudrala@intel.com>
This provides a generic interface for paravirtual drivers to listen
for netdev register/unregister/link change events from pci ethernet
devices with the same MAC and take over their datapath. The notifier and
event handling code is based on the existing netvsc implementation.
It exposes 2 sets of interfaces to the paravirtual drivers.
1. For paravirtual drivers like virtio_net that use 3 netdev model, the
failover module provides interfaces to create/destroy additional
master netdev and all the slave events are managed internally.
net_failover_create()
net_failover_destroy()
A failover netdev is created that acts as a master device and controls 2
slave devices. The original virtio_net netdev is registered as 'standby'
netdev and a passthru/vf device with the same MAC gets registered as
'primary' netdev. Both 'standby' and 'failover' netdevs are associated
with the same 'pci' device. The user accesses the network interface via
'failover' netdev. The 'failover' netdev chooses 'primary' netdev as
default for transmits when it is available with link up and running.
2. For existing netvsc driver that uses 2 netdev model, no master netdev
is created. The paravirtual driver registers each instance of netvsc
as a 'failover' netdev along with a set of ops to manage the slave
events. There is no 'standby' netdev in this model. A passthru/vf device
with the same MAC gets registered as 'primary' netdev.
net_failover_register()
net_failover_unregister()
Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
MAINTAINERS | 7 +
include/linux/netdevice.h | 16 +
include/net/net_failover.h | 52 +++
net/Kconfig | 10 +
net/core/Makefile | 1 +
net/core/net_failover.c | 1044 ++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 1130 insertions(+)
create mode 100644 include/net/net_failover.h
create mode 100644 net/core/net_failover.c
diff --git a/MAINTAINERS b/MAINTAINERS
index ebe0b9ed7805..83cbd99d8efa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9638,6 +9638,13 @@ S: Maintained
F: Documentation/hwmon/nct6775
F: drivers/hwmon/nct6775.c
+NET_FAILOVER MODULE
+M: Sridhar Samudrala <sridhar.samudrala@intel.com>
+L: netdev@vger.kernel.org
+S: Supported
+F: net/core/net_failover.c
+F: include/net/net_failover.h
+
NETEFFECT IWARP RNIC DRIVER (IW_NES)
M: Faisal Latif <faisal.latif@intel.com>
L: linux-rdma@vger.kernel.org
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 46dcb5f7522f..4fff9b5d079e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1421,6 +1421,8 @@ struct net_device_ops {
* entity (i.e. the master device for bridged veth)
* @IFF_MACSEC: device is a MACsec device
* @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
+ * @IFF_FAILOVER: device is a failover master device
+ * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
*/
enum netdev_priv_flags {
IFF_802_1Q_VLAN = 1<<0,
@@ -1450,6 +1452,8 @@ enum netdev_priv_flags {
IFF_PHONY_HEADROOM = 1<<24,
IFF_MACSEC = 1<<25,
IFF_NO_RX_HANDLER = 1<<26,
+ IFF_FAILOVER = 1<<27,
+ IFF_FAILOVER_SLAVE = 1<<28,
};
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
@@ -1478,6 +1482,8 @@ enum netdev_priv_flags {
#define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
#define IFF_MACSEC IFF_MACSEC
#define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
+#define IFF_FAILOVER IFF_FAILOVER
+#define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE
/**
* struct net_device - The DEVICE structure.
@@ -4320,6 +4326,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev)
return dev->priv_flags & IFF_RXFH_CONFIGURED;
}
+static inline bool netif_is_failover(const struct net_device *dev)
+{
+ return dev->priv_flags & IFF_FAILOVER;
+}
+
+static inline bool netif_is_failover_slave(const struct net_device *dev)
+{
+ return dev->priv_flags & IFF_FAILOVER_SLAVE;
+}
+
/* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
static inline void netif_keep_dst(struct net_device *dev)
{
diff --git a/include/net/net_failover.h b/include/net/net_failover.h
new file mode 100644
index 000000000000..221c2aff7531
--- /dev/null
+++ b/include/net/net_failover.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, Intel Corporation. */
+
+#ifndef _NET_FAILOVER_H
+#define _NET_FAILOVER_H
+
+#include <linux/netdevice.h>
+
+struct net_failover_ops {
+ int (*slave_register)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+ int (*slave_unregister)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+ int (*slave_link_change)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+};
+
+struct net_failover {
+ struct list_head list;
+ struct net_device __rcu *failover_dev;
+ struct net_failover_ops __rcu *ops;
+};
+
+/* failover state */
+struct net_failover_info {
+ /* primary netdev with same MAC */
+ struct net_device __rcu *primary_dev;
+
+ /* standby netdev */
+ struct net_device __rcu *standby_dev;
+
+ /* primary netdev stats */
+ struct rtnl_link_stats64 primary_stats;
+
+ /* standby netdev stats */
+ struct rtnl_link_stats64 standby_stats;
+
+ /* aggregated stats */
+ struct rtnl_link_stats64 failover_stats;
+
+ /* spinlock while updating stats */
+ spinlock_t stats_lock;
+};
+
+struct net_failover *net_failover_create(struct net_device *standby_dev);
+void net_failover_destroy(struct net_failover *failover);
+struct net_failover *net_failover_register(struct net_device *dev,
+ struct net_failover_ops *ops);
+void net_failover_unregister(struct net_failover *failover);
+int net_failover_slave_unregister(struct net_device *slave_dev);
+
+#endif /* _NET_FAILOVER_H */
diff --git a/net/Kconfig b/net/Kconfig
index b62089fb1332..0540856676de 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -429,6 +429,16 @@ config MAY_USE_DEVLINK
config PAGE_POOL
bool
+config NET_FAILOVER
+ tristate "Failover interface"
+ default m
+ help
+ This provides a generic interface for paravirtual drivers to listen
+ for netdev register/unregister/link change events from pci ethernet
+ devices with the same MAC and takeover their datapath. This also
+ enables live migration of a VM with direct attached VF by failing
+ over to the paravirtual datapath when the VF is unplugged.
+
endif # if NET
# Used by archs to tell that they support BPF JIT compiler plus which flavour.
diff --git a/net/core/Makefile b/net/core/Makefile
index 7080417f8bc8..283ed9b0e581 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -31,3 +31,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
+obj-$(CONFIG_NET_FAILOVER) += net_failover.o
diff --git a/net/core/net_failover.c b/net/core/net_failover.c
new file mode 100644
index 000000000000..8d60e74e3034
--- /dev/null
+++ b/net/core/net_failover.c
@@ -0,0 +1,1044 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* A common module to handle registrations and notifications for paravirtual
+ * drivers to enable accelerated datapath and support VF live migration.
+ *
+ * The notifier and event handling code is based on netvsc driver and failover
+ * netdev management routines are based on bond/team driver.
+ *
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/netpoll.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <linux/pci.h>
+#include <net/sch_generic.h>
+#include <uapi/linux/if_arp.h>
+#include <net/net_failover.h>
+
+static LIST_HEAD(net_failover_list);
+static DEFINE_SPINLOCK(net_failover_lock);
+
+/* A device is usable for transmit only when it is running and has carrier. */
+static bool net_failover_xmit_ready(struct net_device *dev)
+{
+	if (!netif_running(dev))
+		return false;
+
+	return netif_carrier_ok(dev);
+}
+
+/* ndo_open handler for the failover netdev.
+ *
+ * Opens the primary and standby slaves that are currently attached and
+ * switches the failover carrier on when at least one of them is able to
+ * transmit. Slaves are read with rtnl_dereference(), so the caller is
+ * expected to hold RTNL.
+ *
+ * Returns 0 on success or the dev_open() error of the slave that failed.
+ */
+static int net_failover_open(struct net_device *dev)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev, *standby_dev;
+	int err;
+
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	if (primary_dev) {
+		err = dev_open(primary_dev);
+		if (err)
+			goto err_primary_open;
+	}
+
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+	if (standby_dev) {
+		err = dev_open(standby_dev);
+		if (err)
+			goto err_standby_open;
+	}
+
+	if ((primary_dev && net_failover_xmit_ready(primary_dev)) ||
+	    (standby_dev && net_failover_xmit_ready(standby_dev))) {
+		netif_carrier_on(dev);
+		netif_tx_wake_all_queues(dev);
+	}
+
+	return 0;
+
+err_standby_open:
+	/* The primary slave is optional; only undo its open if it exists. */
+	if (primary_dev)
+		dev_close(primary_dev);
+err_primary_open:
+	netif_tx_disable(dev);
+	return err;
+}
+
+/* ndo_stop handler: stop transmitting on the failover netdev, then close
+ * the primary and the standby slave if they are attached. Always returns 0.
+ */
+static int net_failover_close(struct net_device *dev)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev, *standby_dev;
+
+	netif_tx_disable(dev);
+
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	if (primary_dev)
+		dev_close(primary_dev);
+
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+	if (standby_dev)
+		dev_close(standby_dev);
+
+	return 0;
+}
+
+/* Count @skb as a tx drop on @dev and free it. The skb is consumed, so
+ * NETDEV_TX_OK is reported to the caller.
+ */
+static netdev_tx_t net_failover_drop_xmit(struct sk_buff *skb,
+					  struct net_device *dev)
+{
+	atomic_long_inc(&dev->tx_dropped);
+	dev_kfree_skb_any(skb);
+
+	return NETDEV_TX_OK;
+}
+
+/* ndo_start_xmit handler: hand the skb to the primary slave when it can
+ * transmit, otherwise to the standby slave; drop it when neither can.
+ */
+static netdev_tx_t net_failover_start_xmit(struct sk_buff *skb,
+					   struct net_device *dev)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *out_dev;
+
+	out_dev = rcu_dereference_bh(nfo_info->primary_dev);
+	if (out_dev && net_failover_xmit_ready(out_dev))
+		goto xmit;
+
+	out_dev = rcu_dereference_bh(nfo_info->standby_dev);
+	if (out_dev && net_failover_xmit_ready(out_dev))
+		goto xmit;
+
+	return net_failover_drop_xmit(skb, dev);
+
+xmit:
+	/* Redirect to the slave and restore the queue mapping that
+	 * net_failover_select_queue() stashed in the qdisc cb.
+	 */
+	skb->dev = out_dev;
+	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+
+	return dev_queue_xmit(skb);
+}
+
+/* ndo_select_queue handler for the failover netdev.
+ *
+ * When a primary slave is attached, the queue is chosen by the primary's
+ * own ndo_select_queue (or by @fallback when it has none). Otherwise the
+ * recorded rx queue is used, wrapped into the range of the failover
+ * device's real tx queues. In both cases the skb's current queue_mapping
+ * is stashed in the qdisc cb so net_failover_start_xmit() can restore it
+ * before handing the skb to the slave.
+ *
+ * Returns the selected tx queue index.
+ */
+static u16 net_failover_select_queue(struct net_device *dev,
+				     struct sk_buff *skb, void *accel_priv,
+				     select_queue_fallback_t fallback)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev;
+	u16 txq;
+
+	rcu_read_lock();
+	primary_dev = rcu_dereference(nfo_info->primary_dev);
+	if (primary_dev) {
+		const struct net_device_ops *ops = primary_dev->netdev_ops;
+
+		if (ops->ndo_select_queue)
+			txq = ops->ndo_select_queue(primary_dev, skb,
+						    accel_priv, fallback);
+		else
+			txq = fallback(primary_dev, skb);
+
+		qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+
+		/* Leave the read-side section taken above before returning. */
+		rcu_read_unlock();
+
+		return txq;
+	}
+	/* primary_dev is not used past this point. */
+	rcu_read_unlock();
+
+	txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
+
+	/* Save the original txq to restore before passing to the driver */
+	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+
+	if (unlikely(txq >= dev->real_num_tx_queues)) {
+		do {
+			txq -= dev->real_num_tx_queues;
+		} while (txq >= dev->real_num_tx_queues);
+	}
+
+	return txq;
+}
+
+/* Add the per-field growth between @_old and @_new into @_res.
+ *
+ * All three structures are walked as flat arrays of u64 counters. A
+ * counter whose old and new values both fit in 32 bits is treated as a
+ * 32-bit counter, so a wrap of such a counter still yields a positive
+ * delta. Non-positive deltas are ignored.
+ */
+static void net_failover_fold_stats(struct rtnl_link_stats64 *_res,
+				    const struct rtnl_link_stats64 *_new,
+				    const struct rtnl_link_stats64 *_old)
+{
+	const size_t nr_fields = sizeof(*_res) / sizeof(u64);
+	const u64 *new = (const u64 *)_new;
+	const u64 *old = (const u64 *)_old;
+	u64 *res = (u64 *)_res;
+	size_t idx;
+
+	for (idx = 0; idx < nr_fields; idx++) {
+		u64 cur = new[idx];
+		u64 prev = old[idx];
+		s64 diff;
+
+		if ((cur | prev) >> 32)
+			diff = cur - prev;
+		else
+			/* both values are 32bit only: do 32bit arithmetic */
+			diff = (s64)(s32)((u32)cur - (u32)prev);
+
+		/* filter anomalies, some drivers reset their stats
+		 * at down/up events.
+		 */
+		if (diff <= 0)
+			continue;
+
+		res[idx] += diff;
+	}
+}
+
+/* ndo_get_stats64 handler.
+ *
+ * Starts from the aggregate kept in failover_stats, folds in what each
+ * attached slave has counted since its last snapshot, refreshes the
+ * per-slave snapshots and stores the new aggregate. The whole update is
+ * done with stats_lock held.
+ */
+static void net_failover_get_stats(struct net_device *dev,
+				   struct rtnl_link_stats64 *stats)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	const struct rtnl_link_stats64 *latest;
+	struct rtnl_link_stats64 scratch;
+	struct net_device *slave;
+
+	spin_lock(&nfo_info->stats_lock);
+	memcpy(stats, &nfo_info->failover_stats, sizeof(*stats));
+
+	rcu_read_lock();
+
+	slave = rcu_dereference(nfo_info->primary_dev);
+	if (slave) {
+		latest = dev_get_stats(slave, &scratch);
+		net_failover_fold_stats(stats, latest,
+					&nfo_info->primary_stats);
+		memcpy(&nfo_info->primary_stats, latest, sizeof(*latest));
+	}
+
+	slave = rcu_dereference(nfo_info->standby_dev);
+	if (slave) {
+		latest = dev_get_stats(slave, &scratch);
+		net_failover_fold_stats(stats, latest,
+					&nfo_info->standby_stats);
+		memcpy(&nfo_info->standby_stats, latest, sizeof(*latest));
+	}
+
+	rcu_read_unlock();
+
+	memcpy(&nfo_info->failover_stats, stats, sizeof(*stats));
+	spin_unlock(&nfo_info->stats_lock);
+}
+
+/* ndo_change_mtu handler.
+ *
+ * Applies @new_mtu to the primary and then to the standby slave. If the
+ * standby rejects it, the primary is put back to the failover device's
+ * current MTU. dev->mtu is only updated once every attached slave has
+ * accepted the new value.
+ *
+ * Returns 0 on success or the dev_set_mtu() error of the failing slave.
+ */
+static int net_failover_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev, *standby_dev;
+	int ret = 0;
+
+	/* No RCU read-side section is taken here; use the RTNL-checked
+	 * accessor, as net_failover_open()/net_failover_close() do.
+	 */
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	if (primary_dev) {
+		ret = dev_set_mtu(primary_dev, new_mtu);
+		if (ret)
+			return ret;
+	}
+
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+	if (standby_dev) {
+		ret = dev_set_mtu(standby_dev, new_mtu);
+		if (ret) {
+			/* roll the primary back to the old MTU */
+			if (primary_dev)
+				dev_set_mtu(primary_dev, dev->mtu);
+			return ret;
+		}
+	}
+
+	dev->mtu = new_mtu;
+
+	return 0;
+}
+
+/* ndo_set_rx_mode handler: push the failover device's unicast and
+ * multicast address lists down to each attached slave.
+ */
+static void net_failover_set_rx_mode(struct net_device *dev)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev, *standby_dev;
+
+	rcu_read_lock();
+
+	primary_dev = rcu_dereference(nfo_info->primary_dev);
+	if (primary_dev) {
+		dev_uc_sync_multiple(primary_dev, dev);
+		dev_mc_sync_multiple(primary_dev, dev);
+	}
+
+	standby_dev = rcu_dereference(nfo_info->standby_dev);
+	if (standby_dev) {
+		dev_uc_sync_multiple(standby_dev, dev);
+		dev_mc_sync_multiple(standby_dev, dev);
+	}
+
+	rcu_read_unlock();
+}
+
+/* ndo_vlan_rx_add_vid handler: add the VLAN id on the primary and then on
+ * the standby slave. If the standby fails, the id is removed from the
+ * primary again. Returns 0 or the vlan_vid_add() error.
+ */
+static int net_failover_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
+					u16 vid)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev, *standby_dev;
+	int ret;
+
+	primary_dev = rcu_dereference(nfo_info->primary_dev);
+	if (primary_dev) {
+		ret = vlan_vid_add(primary_dev, proto, vid);
+		if (ret)
+			return ret;
+	}
+
+	standby_dev = rcu_dereference(nfo_info->standby_dev);
+	if (!standby_dev)
+		return 0;
+
+	ret = vlan_vid_add(standby_dev, proto, vid);
+	if (ret && primary_dev)
+		vlan_vid_del(primary_dev, proto, vid);
+
+	return ret;
+}
+
+/* ndo_vlan_rx_kill_vid handler: remove the VLAN id from every attached
+ * slave. Always returns 0.
+ */
+static int net_failover_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
+					 u16 vid)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev, *standby_dev;
+
+	primary_dev = rcu_dereference(nfo_info->primary_dev);
+	if (primary_dev)
+		vlan_vid_del(primary_dev, proto, vid);
+
+	standby_dev = rcu_dereference(nfo_info->standby_dev);
+	if (standby_dev)
+		vlan_vid_del(standby_dev, proto, vid);
+
+	return 0;
+}
+
+static const struct net_device_ops failover_dev_ops = { /* net_device_ops table built mostly from the net_failover_* handlers above */
+ .ndo_open = net_failover_open,
+ .ndo_stop = net_failover_close,
+ .ndo_start_xmit = net_failover_start_xmit,
+ .ndo_select_queue = net_failover_select_queue,
+ .ndo_get_stats64 = net_failover_get_stats,
+ .ndo_change_mtu = net_failover_change_mtu,
+ .ndo_set_rx_mode = net_failover_set_rx_mode,
+ .ndo_vlan_rx_add_vid = net_failover_vlan_rx_add_vid,
+ .ndo_vlan_rx_kill_vid = net_failover_vlan_rx_kill_vid,
+ .ndo_validate_addr = eth_validate_addr, /* not defined in this file */
+ .ndo_features_check = passthru_features_check, /* not defined in this file */
+};
+
+#define FAILOVER_NAME "net_failover"
+#define FAILOVER_VERSION "0.1"
+
+/* ethtool get_drvinfo: report the failover driver name and version. */
+static void nfo_ethtool_get_drvinfo(struct net_device *dev,
+				    struct ethtool_drvinfo *drvinfo)
+{
+	strlcpy(drvinfo->driver, FAILOVER_NAME, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, FAILOVER_VERSION,
+		sizeof(drvinfo->version));
+}
+
+/* ethtool get_link_ksettings: report the link settings of the slave that
+ * would be used for transmit (primary first, then standby). When neither
+ * slave is ready, unknown duplex/speed and PORT_OTHER are reported.
+ */
+static int nfo_ethtool_get_link_ksettings(struct net_device *dev,
+					  struct ethtool_link_ksettings *cmd)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *active_dev;
+
+	active_dev = rtnl_dereference(nfo_info->primary_dev);
+	if (active_dev && net_failover_xmit_ready(active_dev))
+		goto query_slave;
+
+	active_dev = rtnl_dereference(nfo_info->standby_dev);
+	if (active_dev && net_failover_xmit_ready(active_dev))
+		goto query_slave;
+
+	cmd->base.duplex = DUPLEX_UNKNOWN;
+	cmd->base.port = PORT_OTHER;
+	cmd->base.speed = SPEED_UNKNOWN;
+
+	return 0;
+
+query_slave:
+	return __ethtool_get_link_ksettings(active_dev, cmd);
+}
+
+static const struct ethtool_ops failover_ethtool_ops = { /* ethtool callbacks: driver info, link state, link settings */
+ .get_drvinfo = nfo_ethtool_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+ .get_link_ksettings = nfo_ethtool_get_link_ksettings,
+};
+
+/* Look up the registered failover instance whose netdev has permanent
+ * address @mac.
+ *
+ * On a match, *@ops is set to that instance's ops (may be NULL) and the
+ * failover netdev is returned. Returns NULL, leaving *@ops untouched,
+ * when no instance matches. The list walk is done under
+ * net_failover_lock.
+ */
+static struct net_device *net_failover_get_bymac(u8 *mac,
+						 struct net_failover_ops **ops)
+{
+	struct net_device *found = NULL;
+	struct net_failover *failover;
+
+	spin_lock(&net_failover_lock);
+	list_for_each_entry(failover, &net_failover_list, list) {
+		struct net_device *candidate;
+
+		candidate = rtnl_dereference(failover->failover_dev);
+		if (!ether_addr_equal(candidate->perm_addr, mac))
+			continue;
+
+		*ops = rtnl_dereference(failover->ops);
+		found = candidate;
+		break;
+	}
+	spin_unlock(&net_failover_lock);
+
+	return found;
+}
+
+/* rx_handler installed on slave devices.
+ *
+ * Re-targets frames received on a slave to the failover netdev stored in
+ * the slave's rx_handler_data. Frames that arrive on the standby while a
+ * primary is attached are not re-targeted; RX_HANDLER_EXACT is returned
+ * for them instead.
+ * note: already called with rcu_read_lock
+ */
+static rx_handler_result_t net_failover_handle_frame(struct sk_buff **pskb)
+{
+	struct sk_buff *skb = *pskb;
+	struct net_device *failover_dev;
+	struct net_device *primary_dev, *standby_dev;
+	struct net_failover_info *nfo_info;
+
+	failover_dev = rcu_dereference(skb->dev->rx_handler_data);
+	nfo_info = netdev_priv(failover_dev);
+
+	primary_dev = rcu_dereference(nfo_info->primary_dev);
+	standby_dev = rcu_dereference(nfo_info->standby_dev);
+
+	if (primary_dev && skb->dev == standby_dev)
+		return RX_HANDLER_EXACT;
+
+	skb->dev = failover_dev;
+
+	return RX_HANDLER_ANOTHER;
+}
+
+#define FAILOVER_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \
+ NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
+ NETIF_F_HIGHDMA | NETIF_F_LRO)
+
+#define FAILOVER_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \
+ NETIF_F_RXCSUM | NETIF_F_ALL_TSO)
+
+/* Recompute the failover netdev's vlan_features, hw_enc_features,
+ * hard_header_len and IFF_XMIT_DST_RELEASE flag from the slaves that are
+ * currently attached, then trigger a feature re-evaluation.
+ *
+ * hard_header_len becomes the largest of ETH_HLEN and the slaves' values.
+ * IFF_XMIT_DST_RELEASE is kept only when every attached slave has both
+ * IFF_XMIT_DST_RELEASE and IFF_XMIT_DST_RELEASE_PERM set.
+ */
+static void net_failover_compute_features(struct net_device *dev)
+{
+	/* Feature masks are netdev_features_t; a narrower type would
+	 * silently drop feature bits above bit 31.
+	 */
+	netdev_features_t vlan_features = FAILOVER_VLAN_FEATURES &
+					  NETIF_F_ALL_FOR_ALL;
+	netdev_features_t enc_features = FAILOVER_ENC_FEATURES;
+	unsigned short max_hard_header_len = ETH_HLEN;
+	unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
+					IFF_XMIT_DST_RELEASE_PERM;
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev, *standby_dev;
+
+	primary_dev = rcu_dereference(nfo_info->primary_dev);
+	if (primary_dev) {
+		vlan_features =
+			netdev_increment_features(vlan_features,
+						  primary_dev->vlan_features,
+						  FAILOVER_VLAN_FEATURES);
+		enc_features =
+			netdev_increment_features(enc_features,
+						  primary_dev->hw_enc_features,
+						  FAILOVER_ENC_FEATURES);
+
+		dst_release_flag &= primary_dev->priv_flags;
+		if (primary_dev->hard_header_len > max_hard_header_len)
+			max_hard_header_len = primary_dev->hard_header_len;
+	}
+
+	standby_dev = rcu_dereference(nfo_info->standby_dev);
+	if (standby_dev) {
+		vlan_features =
+			netdev_increment_features(vlan_features,
+						  standby_dev->vlan_features,
+						  FAILOVER_VLAN_FEATURES);
+		enc_features =
+			netdev_increment_features(enc_features,
+						  standby_dev->hw_enc_features,
+						  FAILOVER_ENC_FEATURES);
+
+		dst_release_flag &= standby_dev->priv_flags;
+		if (standby_dev->hard_header_len > max_hard_header_len)
+			max_hard_header_len = standby_dev->hard_header_len;
+	}
+
+	dev->vlan_features = vlan_features;
+	dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL;
+	dev->hard_header_len = max_hard_header_len;
+
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	if (dst_release_flag == (IFF_XMIT_DST_RELEASE |
+				 IFF_XMIT_DST_RELEASE_PERM))
+		dev->priv_flags |= IFF_XMIT_DST_RELEASE;
+
+	netdev_change_features(dev);
+}
+
+/* Report @slave_dev's lower state through netdev_lower_state_changed().
+ *
+ * link_up mirrors the slave's carrier. tx_enabled is set for the primary
+ * when it is running, and for the standby only when it is running and
+ * there is no running primary.
+ */
+static void net_failover_lower_state_changed(struct net_device *slave_dev,
+					     struct net_device *primary_dev,
+					     struct net_device *standby_dev)
+{
+	struct netdev_lag_lower_state_info info;
+
+	info.link_up = netif_carrier_ok(slave_dev);
+
+	if (slave_dev == primary_dev)
+		info.tx_enabled = netif_running(primary_dev);
+	else
+		info.tx_enabled =
+			!(primary_dev && netif_running(primary_dev)) &&
+			netif_running(standby_dev);
+
+	netdev_lower_state_changed(slave_dev, &info);
+}
+
+/**
+ * net_failover_slave_register - Register a slave netdev
+ *
+ * @slave_dev: slave netdev that is being registered
+ *
+ * Registers a slave device to a failover instance. For a 2 netdev model,
+ * this will be primary netdev. In case of a 3 netdev model, it can be a
+ * standby or a primary netdev.
+ *
+ * A slave that shares its parent device with the failover netdev is
+ * treated as the standby; any other slave must sit on a PCI device and
+ * becomes the primary. Must be called with RTNL held.
+ *
+ * Return: NOTIFY_DONE, or the result of the instance's slave_register op
+ * when one is set.
+ */
+static int net_failover_slave_register(struct net_device *slave_dev)
+{
+	struct net_device *standby_dev, *primary_dev, *failover_dev;
+	struct netdev_lag_upper_info lag_upper_info;
+	struct net_failover_info *nfo_info;
+	struct net_failover_ops *nfo_ops;
+	bool slave_is_standby;
+	u32 orig_mtu;
+	int err;
+
+	ASSERT_RTNL();
+
+	failover_dev = net_failover_get_bymac(slave_dev->perm_addr, &nfo_ops);
+	if (!failover_dev)
+		goto done;
+
+	if (failover_dev->type != slave_dev->type)
+		goto done;
+
+	/* 2-netdev model: the paravirtual driver handles the slave itself */
+	if (nfo_ops && nfo_ops->slave_register)
+		return nfo_ops->slave_register(slave_dev, failover_dev);
+
+	nfo_info = netdev_priv(failover_dev);
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	slave_is_standby = slave_dev->dev.parent == failover_dev->dev.parent;
+	if (slave_is_standby ? standby_dev : primary_dev) {
+		netdev_err(failover_dev, "%s attempting to register as slave dev when %s already present\n",
+			   slave_dev->name,
+			   slave_is_standby ? "standby" : "primary");
+		goto done;
+	}
+
+	/* We want to allow only a direct attached VF device as a primary
+	 * netdev. As there is no easy way to check for a VF device, restrict
+	 * this to a pci device.
+	 */
+	if (!slave_is_standby && (!slave_dev->dev.parent ||
+				  !dev_is_pci(slave_dev->dev.parent)))
+		goto done;
+
+	if (failover_dev->features & NETIF_F_VLAN_CHALLENGED &&
+	    vlan_uses_dev(failover_dev)) {
+		netdev_err(failover_dev, "Device %s is VLAN challenged and failover device has VLAN set up\n",
+			   failover_dev->name);
+		goto done;
+	}
+
+	/* Align MTU of slave with failover dev */
+	orig_mtu = slave_dev->mtu;
+	err = dev_set_mtu(slave_dev, failover_dev->mtu);
+	if (err) {
+		netdev_err(failover_dev, "unable to change mtu of %s to %u register failed\n",
+			   slave_dev->name, failover_dev->mtu);
+		goto done;
+	}
+
+	dev_hold(slave_dev);
+
+	if (netif_running(failover_dev)) {
+		err = dev_open(slave_dev);
+		/* -EBUSY is tolerated; net_failover_slave_change_name()
+		 * retries the open after the rename.
+		 */
+		if (err && (err != -EBUSY)) {
+			netdev_err(failover_dev, "Opening slave %s failed err:%d\n",
+				   slave_dev->name, err);
+			goto err_dev_open;
+		}
+	}
+
+	/* Sync both address lists; the error path below unsyncs both. */
+	netif_addr_lock_bh(failover_dev);
+	dev_uc_sync_multiple(slave_dev, failover_dev);
+	dev_mc_sync_multiple(slave_dev, failover_dev);
+	netif_addr_unlock_bh(failover_dev);
+
+	err = vlan_vids_add_by_dev(slave_dev, failover_dev);
+	if (err) {
+		netdev_err(failover_dev, "Failed to add vlan ids to device %s err:%d\n",
+			   slave_dev->name, err);
+		goto err_vlan_add;
+	}
+
+	err = netdev_rx_handler_register(slave_dev, net_failover_handle_frame,
+					 failover_dev);
+	if (err) {
+		netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
+			   err);
+		goto err_handler_register;
+	}
+
+	lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
+	err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL,
+					   &lag_upper_info, NULL);
+	if (err) {
+		netdev_err(slave_dev, "can not set failover device %s (err = %d)\n",
+			   failover_dev->name, err);
+		goto err_upper_link;
+	}
+
+	slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
+
+	if (slave_is_standby) {
+		rcu_assign_pointer(nfo_info->standby_dev, slave_dev);
+		standby_dev = slave_dev;
+		dev_get_stats(standby_dev, &nfo_info->standby_stats);
+	} else {
+		rcu_assign_pointer(nfo_info->primary_dev, slave_dev);
+		primary_dev = slave_dev;
+		dev_get_stats(primary_dev, &nfo_info->primary_stats);
+		/* the failover netdev inherits the primary's MTU limits */
+		failover_dev->min_mtu = slave_dev->min_mtu;
+		failover_dev->max_mtu = slave_dev->max_mtu;
+	}
+
+	net_failover_lower_state_changed(slave_dev, primary_dev, standby_dev);
+	net_failover_compute_features(failover_dev);
+
+	call_netdevice_notifiers(NETDEV_JOIN, slave_dev);
+
+	netdev_info(failover_dev, "failover %s slave:%s registered\n",
+		    slave_is_standby ? "standby" : "primary", slave_dev->name);
+
+	goto done;
+
+err_upper_link:
+	netdev_rx_handler_unregister(slave_dev);
+err_handler_register:
+	vlan_vids_del_by_dev(slave_dev, failover_dev);
+err_vlan_add:
+	dev_uc_unsync(slave_dev, failover_dev);
+	dev_mc_unsync(slave_dev, failover_dev);
+	dev_close(slave_dev);
+err_dev_open:
+	dev_put(slave_dev);
+	dev_set_mtu(slave_dev, orig_mtu);
+done:
+	return NOTIFY_DONE;
+}
+
+/**
+ * net_failover_slave_unregister - Unregister a slave netdev
+ *
+ * @slave_dev: slave netdev that is being unregistered
+ *
+ * Unregisters a slave device from a failover instance. For a 2 netdev model,
+ * this will be primary netdev. In case of a 3 netdev model, it can be a
+ * standby or a primary netdev.
+ *
+ * Does nothing for a device that does not carry IFF_FAILOVER_SLAVE or that
+ * is neither the primary nor the standby of the matching failover netdev.
+ *
+ * Return: NOTIFY_DONE, or the result of the instance's slave_unregister op
+ * when one is set.
+ */
+int net_failover_slave_unregister(struct net_device *slave_dev)
+{
+ struct net_device *standby_dev, *primary_dev, *failover_dev;
+ struct net_failover_info *nfo_info;
+ struct net_failover_ops *nfo_ops;
+ bool slave_is_standby;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = net_failover_get_bymac(slave_dev->perm_addr, &nfo_ops);
+ if (!failover_dev)
+ goto done;
+
+ if (nfo_ops && nfo_ops->slave_unregister)
+ return nfo_ops->slave_unregister(slave_dev, failover_dev);
+
+ nfo_info = netdev_priv(failover_dev);
+ primary_dev = rtnl_dereference(nfo_info->primary_dev);
+ standby_dev = rtnl_dereference(nfo_info->standby_dev);
+
+ if (slave_dev != primary_dev && slave_dev != standby_dev)
+ goto done;
+
+ slave_is_standby = slave_dev->dev.parent == failover_dev->dev.parent; /* same test as in net_failover_slave_register() */
+
+ netdev_rx_handler_unregister(slave_dev);
+ netdev_upper_dev_unlink(slave_dev, failover_dev);
+ vlan_vids_del_by_dev(slave_dev, failover_dev);
+ dev_uc_unsync(slave_dev, failover_dev);
+ dev_mc_unsync(slave_dev, failover_dev);
+ dev_close(slave_dev);
+ slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+
+ nfo_info = netdev_priv(failover_dev); /* same value as assigned above */
+ net_failover_get_stats(failover_dev, &nfo_info->failover_stats); /* fold the slave's counters in before its pointer is cleared */
+
+ if (slave_is_standby) {
+ RCU_INIT_POINTER(nfo_info->standby_dev, NULL);
+ } else {
+ RCU_INIT_POINTER(nfo_info->primary_dev, NULL);
+ if (standby_dev) {
+ failover_dev->min_mtu = standby_dev->min_mtu; /* MTU limits now follow the standby */
+ failover_dev->max_mtu = standby_dev->max_mtu;
+ }
+ }
+
+ dev_put(slave_dev); /* pairs with dev_hold() in net_failover_slave_register() */
+
+ net_failover_compute_features(failover_dev);
+
+ netdev_info(failover_dev, "failover %s slave:%s unregistered\n",
+ slave_is_standby ? "standby" : "primary", slave_dev->name);
+
+done:
+ return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_GPL(net_failover_slave_unregister);
+
+/* Handle an up/down/carrier change of a slave.
+ *
+ * Turns the failover netdev's carrier and tx queues on when at least one
+ * slave can transmit, and off (after folding the stats) when none can,
+ * then reports the slave's new lower state. Events for devices that are
+ * not slaves of a running failover netdev are ignored.
+ *
+ * Returns NOTIFY_DONE, or the result of the instance's slave_link_change
+ * op when one is set.
+ */
+static int net_failover_slave_link_change(struct net_device *slave_dev)
+{
+	struct net_device *failover_dev, *primary_dev, *standby_dev;
+	struct net_failover_info *nfo_info;
+	struct net_failover_ops *nfo_ops;
+	bool can_xmit;
+
+	if (!netif_is_failover_slave(slave_dev))
+		return NOTIFY_DONE;
+
+	ASSERT_RTNL();
+
+	failover_dev = net_failover_get_bymac(slave_dev->perm_addr, &nfo_ops);
+	if (!failover_dev)
+		return NOTIFY_DONE;
+
+	if (nfo_ops && nfo_ops->slave_link_change)
+		return nfo_ops->slave_link_change(slave_dev, failover_dev);
+
+	if (!netif_running(failover_dev))
+		return NOTIFY_DONE;
+
+	nfo_info = netdev_priv(failover_dev);
+
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+
+	if (slave_dev != primary_dev && slave_dev != standby_dev)
+		return NOTIFY_DONE;
+
+	can_xmit = (primary_dev && net_failover_xmit_ready(primary_dev)) ||
+		   (standby_dev && net_failover_xmit_ready(standby_dev));
+	if (can_xmit) {
+		netif_carrier_on(failover_dev);
+		netif_tx_wake_all_queues(failover_dev);
+	} else {
+		net_failover_get_stats(failover_dev,
+				       &nfo_info->failover_stats);
+		netif_carrier_off(failover_dev);
+		netif_tx_stop_all_queues(failover_dev);
+	}
+
+	net_failover_lower_state_changed(slave_dev, primary_dev, standby_dev);
+
+	return NOTIFY_DONE;
+}
+
+/* Handle a rename of a slave: retry opening it if it belongs to a running
+ * failover netdev. Always returns NOTIFY_DONE.
+ */
+static int net_failover_slave_change_name(struct net_device *slave_dev)
+{
+	struct net_device *failover_dev, *primary_dev, *standby_dev;
+	struct net_failover_info *nfo_info;
+	struct net_failover_ops *nfo_ops;
+
+	if (!netif_is_failover_slave(slave_dev))
+		return NOTIFY_DONE;
+
+	ASSERT_RTNL();
+
+	failover_dev = net_failover_get_bymac(slave_dev->perm_addr, &nfo_ops);
+	if (!failover_dev || !netif_running(failover_dev))
+		return NOTIFY_DONE;
+
+	nfo_info = netdev_priv(failover_dev);
+
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+
+	if (slave_dev != primary_dev && slave_dev != standby_dev)
+		return NOTIFY_DONE;
+
+	/* We need to bring up the slave after the rename by udev in case
+	 * open failed with EBUSY when it was registered.
+	 */
+	dev_open(slave_dev);
+
+	return NOTIFY_DONE;
+}
+
+/* Netdevice notifier callback: route events of non-failover devices to
+ * the matching slave handler. Events raised by a failover netdev itself
+ * and events not listed below yield NOTIFY_DONE.
+ */
+static int
+net_failover_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+	int ret = NOTIFY_DONE;
+
+	/* Skip parent events */
+	if (netif_is_failover(event_dev))
+		return ret;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		ret = net_failover_slave_register(event_dev);
+		break;
+	case NETDEV_UNREGISTER:
+		ret = net_failover_slave_unregister(event_dev);
+		break;
+	case NETDEV_UP:
+	case NETDEV_DOWN:
+	case NETDEV_CHANGE:
+		ret = net_failover_slave_link_change(event_dev);
+		break;
+	case NETDEV_CHANGENAME:
+		ret = net_failover_slave_change_name(event_dev);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+static struct notifier_block net_failover_notifier = { /* notifier block whose callback is net_failover_event() */
+ .notifier_call = net_failover_event,
+};
+
+/* Register, as slaves of @failover_dev, the non-failover netdevs already
+ * present in its namespace whose permanent MAC matches. Takes RTNL for
+ * the duration of the walk.
+ */
+static void
+net_failover_existing_slave_register(struct net_device *failover_dev)
+{
+	struct net *net = dev_net(failover_dev);
+	struct net_device *candidate;
+
+	rtnl_lock();
+	for_each_netdev(net, candidate) {
+		if (!netif_is_failover(candidate) &&
+		    ether_addr_equal(failover_dev->perm_addr,
+				     candidate->perm_addr))
+			net_failover_slave_register(candidate);
+	}
+	rtnl_unlock();
+}
+
+/**
+ * net_failover_register - Register a failover instance
+ *
+ * @dev: failover or standby netdev
+ * @ops: failover ops
+ *
+ * Paravirtual drivers supporting 3-netdev model call this routine indirectly
+ * via net_failover_create(). It passes failover netdev and ops will be NULL
+ * as the slave events are handled internally.
+ * Paravirtual drivers supporting 2-netdev model call this routine by passing
+ * standby netdev and ops that are called to handle slave register/unregister/
+ * link change events.
+ *
+ * Takes a reference on @dev, marks it IFF_FAILOVER, adds the instance to the
+ * global list and registers any already-present netdevs with a matching MAC
+ * as slaves.
+ *
+ * Return: pointer to failover instance, or ERR_PTR(-ENOMEM) if the instance
+ * cannot be allocated.
+ */
+struct net_failover *net_failover_register(struct net_device *dev,
+					   struct net_failover_ops *ops)
+{
+	struct net_failover *failover;
+
+	failover = kzalloc(sizeof(*failover), GFP_KERNEL);
+	if (!failover)
+		return ERR_PTR(-ENOMEM);
+
+	rcu_assign_pointer(failover->ops, ops);
+	dev_hold(dev);
+	dev->priv_flags |= IFF_FAILOVER;
+	rcu_assign_pointer(failover->failover_dev, dev);
+
+	spin_lock(&net_failover_lock);
+	list_add_tail(&failover->list, &net_failover_list);
+	spin_unlock(&net_failover_lock);
+
+	netdev_info(dev, "failover master:%s registered\n", dev->name);
+
+	net_failover_existing_slave_register(dev);
+
+	return failover;
+}
+EXPORT_SYMBOL_GPL(net_failover_register);
+
+/**
+ * net_failover_unregister - Unregister a failover instance
+ *
+ * @failover: pointer to failover instance
+ *
+ * Clears IFF_FAILOVER on the associated netdev, drops the reference held on
+ * it, removes the instance from the failover list and frees the instance.
+ */
+void net_failover_unregister(struct net_failover *failover)
+{
+	struct net_device *master = rcu_dereference(failover->failover_dev);
+
+	netdev_info(master, "failover master:%s unregistered\n", master->name);
+
+	/* Undo the flag and the reference taken by net_failover_register() */
+	master->priv_flags &= ~IFF_FAILOVER;
+	dev_put(master);
+
+	spin_lock(&net_failover_lock);
+	list_del(&failover->list);
+	spin_unlock(&net_failover_lock);
+
+	kfree(failover);
+}
+EXPORT_SYMBOL_GPL(net_failover_unregister);
+
+/**
+ * net_failover_create - Create and register a failover instance
+ *
+ * @dev: standby netdev
+ *
+ * Creates a failover netdev and registers a failover instance for a standby
+ * netdev. Used by paravirtual drivers that use 3-netdev model.
+ * The failover netdev acts as a master device and controls 2 slave devices -
+ * the original standby netdev and a VF netdev with the same MAC gets
+ * registered as primary netdev.
+ *
+ * Return: pointer to failover instance on success, or an ERR_PTR() encoded
+ * negative errno if allocating the netdev, registering the netdev or
+ * registering the failover instance fails
+ */
+struct net_failover *net_failover_create(struct net_device *standby_dev)
+{
+	struct device *dev = standby_dev->dev.parent;
+	struct net_device *failover_dev;
+	struct net_failover *failover;
+	int err;
+
+	/* Alloc at least 2 queues, for now we are going with 16 assuming
+	 * that VF devices being enslaved won't have too many queues.
+	 */
+	failover_dev = alloc_etherdev_mq(sizeof(struct net_failover_info), 16);
+	if (!failover_dev) {
+		dev_err(dev, "Unable to allocate failover_netdev!\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	dev_net_set(failover_dev, dev_net(standby_dev));
+	SET_NETDEV_DEV(failover_dev, dev);
+
+	failover_dev->netdev_ops = &failover_dev_ops;
+	failover_dev->ethtool_ops = &failover_ethtool_ops;
+
+	/* Initialize the device options */
+	failover_dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
+	failover_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
+				      IFF_TX_SKB_SHARING);
+
+	/* don't acquire failover netdev's netif_tx_lock when transmitting */
+	failover_dev->features |= NETIF_F_LLTX;
+
+	/* Don't allow failover devices to change network namespaces. */
+	failover_dev->features |= NETIF_F_NETNS_LOCAL;
+
+	failover_dev->hw_features = FAILOVER_VLAN_FEATURES |
+				    NETIF_F_HW_VLAN_CTAG_TX |
+				    NETIF_F_HW_VLAN_CTAG_RX |
+				    NETIF_F_HW_VLAN_CTAG_FILTER;
+
+	failover_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
+	failover_dev->features |= failover_dev->hw_features;
+
+	/* The failover netdev takes over the standby's MAC and MTU limits */
+	memcpy(failover_dev->dev_addr, standby_dev->dev_addr,
+	       failover_dev->addr_len);
+
+	failover_dev->min_mtu = standby_dev->min_mtu;
+	failover_dev->max_mtu = standby_dev->max_mtu;
+
+	err = register_netdev(failover_dev);
+	if (err) {
+		dev_err(dev, "Unable to register failover_dev!\n");
+		goto err_register_netdev;
+	}
+
+	netif_carrier_off(failover_dev);
+
+	failover = net_failover_register(failover_dev, NULL);
+	if (IS_ERR(failover)) {
+		/* err is still 0 from register_netdev(); without this the
+		 * error path would return ERR_PTR(0), i.e. NULL, which
+		 * callers checking IS_ERR() would treat as success.
+		 */
+		err = PTR_ERR(failover);
+		goto err_failover_register;
+	}
+
+	return failover;
+
+err_failover_register:
+	unregister_netdev(failover_dev);
+err_register_netdev:
+	free_netdev(failover_dev);
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(net_failover_create);
+
+/**
+ * net_failover_destroy - Destroy a failover instance
+ *
+ * @failover: pointer to failover instance, may be NULL
+ *
+ * Unregisters any slave netdevs associated with the failover instance by
+ * calling net_failover_slave_unregister(), unregisters the failover instance
+ * itself and finally frees the failover netdev. Used by paravirtual drivers
+ * that use 3-netdev model. A NULL @failover is a no-op.
+ */
+void net_failover_destroy(struct net_failover *failover)
+{
+	struct net_failover_info *info;
+	struct net_device *master;
+	struct net_device *slave;
+
+	if (failover == NULL)
+		return;
+
+	master = rcu_dereference(failover->failover_dev);
+	info = netdev_priv(master);
+
+	netif_device_detach(master);
+
+	rtnl_lock();
+
+	/* Release the primary slave first, then the standby slave */
+	slave = rtnl_dereference(info->primary_dev);
+	if (slave != NULL)
+		net_failover_slave_unregister(slave);
+
+	slave = rtnl_dereference(info->standby_dev);
+	if (slave != NULL)
+		net_failover_slave_unregister(slave);
+
+	net_failover_unregister(failover);
+
+	/* RTNL is already held, hence the non-locking unregister variant */
+	unregister_netdevice(master);
+
+	rtnl_unlock();
+
+	free_netdev(master);
+}
+EXPORT_SYMBOL_GPL(net_failover_destroy);
+
+/* Module init: hook the failover notifier into netdevice events.
+ * The registration result is returned so that a failure to register the
+ * notifier is reported as a module init failure instead of being ignored.
+ */
+static __init int
+net_failover_init(void)
+{
+	return register_netdevice_notifier(&net_failover_notifier);
+}
+module_init(net_failover_init);
+
+/* Module exit: remove the notifier installed by net_failover_init() */
+static void __exit net_failover_exit(void)
+{
+	unregister_netdevice_notifier(&net_failover_notifier);
+}
+module_exit(net_failover_exit);
+
+MODULE_DESCRIPTION("Failover infrastructure/interface for Paravirtual drivers");
+MODULE_LICENSE("GPL v2");
--
2.14.3
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox