Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-next 1/3] sctp: do reuseport_select_sock in __sctp_rcv_lookup_endpoint
From: Marcelo Ricardo Leitner @ 2018-10-22 14:17 UTC (permalink / raw)
  To: Xin Long; +Cc: network dev, linux-sctp, Neil Horman, davem
In-Reply-To: <08b533092a01a8f7cf2eb4c459fe3570a8df702b.1540095102.git.lucien.xin@gmail.com>

On Sun, Oct 21, 2018 at 12:43:36PM +0800, Xin Long wrote:
> This is a part of sk_reuseport support for sctp, and it selects a
> sock by the hashkey of lport, paddr and dport by default. It will
> work until sk_reuseport support is added in sctp_get_port_local()
> in the next patch.
> 
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> ---
>  net/sctp/input.c | 69 +++++++++++++++++++++++++++++++++-----------------------
>  1 file changed, 41 insertions(+), 28 deletions(-)
> 
> diff --git a/net/sctp/input.c b/net/sctp/input.c
> index 5c36a99..60ede89 100644
> --- a/net/sctp/input.c
> +++ b/net/sctp/input.c
> @@ -57,6 +57,7 @@
>  #include <net/sctp/checksum.h>
>  #include <net/net_namespace.h>
>  #include <linux/rhashtable.h>
> +#include <net/sock_reuseport.h>
>  
>  /* Forward declarations for internal helpers. */
>  static int sctp_rcv_ootb(struct sk_buff *);
> @@ -65,8 +66,10 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
>  				      const union sctp_addr *paddr,
>  				      const union sctp_addr *laddr,
>  				      struct sctp_transport **transportp);
> -static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
> -						const union sctp_addr *laddr);
> +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
> +					struct net *net, struct sk_buff *skb,
> +					const union sctp_addr *laddr,
> +					const union sctp_addr *daddr);
>  static struct sctp_association *__sctp_lookup_association(
>  					struct net *net,
>  					const union sctp_addr *local,
> @@ -171,7 +174,7 @@ int sctp_rcv(struct sk_buff *skb)
>  	asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport);
>  
>  	if (!asoc)
> -		ep = __sctp_rcv_lookup_endpoint(net, &dest);
> +		ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src);
>  
>  	/* Retrieve the common input handling substructure. */
>  	rcvr = asoc ? &asoc->base : &ep->base;
> @@ -770,16 +773,35 @@ void sctp_unhash_endpoint(struct sctp_endpoint *ep)
>  	local_bh_enable();
>  }
>  
> +static inline __u32 sctp_hashfn(const struct net *net, __be16 lport,
> +				const union sctp_addr *paddr, __u32 seed)
> +{
> +	__u32 addr;
> +
> +	if (paddr->sa.sa_family == AF_INET6)
> +		addr = jhash(&paddr->v6.sin6_addr, 16, seed);
> +	else
> +		addr = (__force __u32)paddr->v4.sin_addr.s_addr;
> +
> +	return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
> +			     (__force __u32)lport, net_hash_mix(net), seed);
> +}
> +
>  /* Look up an endpoint. */
> -static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
> -						const union sctp_addr *laddr)
> +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
> +					struct net *net, struct sk_buff *skb,
> +					const union sctp_addr *laddr,
> +					const union sctp_addr *paddr)
>  {
>  	struct sctp_hashbucket *head;
>  	struct sctp_ep_common *epb;
>  	struct sctp_endpoint *ep;
> +	struct sock *sk;
> +	__be32 lport;

This could be a __be16 one.

>  	int hash;
>  
> -	hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port));
> +	lport = laddr->v4.sin_port;
> +	hash = sctp_ep_hashfn(net, ntohs(lport));
>  	head = &sctp_ep_hashtable[hash];
>  	read_lock(&head->lock);
>  	sctp_for_each_hentry(epb, &head->chain) {
> @@ -791,6 +813,15 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
>  	ep = sctp_sk(net->sctp.ctl_sock)->ep;
>  
>  hit:
> +	sk = ep->base.sk;
> +	if (sk->sk_reuseport) {
> +		__u32 phash = sctp_hashfn(net, lport, paddr, 0);
> +
> +		sk = reuseport_select_sock(sk, phash, skb,
> +					   sizeof(struct sctphdr));
> +		if (sk)
> +			ep = sctp_sk(sk)->ep;
> +	}
>  	sctp_endpoint_hold(ep);
>  	read_unlock(&head->lock);
>  	return ep;
> @@ -829,35 +860,17 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
>  static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
>  {
>  	const struct sctp_transport *t = data;
> -	const union sctp_addr *paddr = &t->ipaddr;
> -	const struct net *net = sock_net(t->asoc->base.sk);
> -	__be16 lport = htons(t->asoc->base.bind_addr.port);
> -	__u32 addr;
> -
> -	if (paddr->sa.sa_family == AF_INET6)
> -		addr = jhash(&paddr->v6.sin6_addr, 16, seed);
> -	else
> -		addr = (__force __u32)paddr->v4.sin_addr.s_addr;
>  
> -	return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
> -			     (__force __u32)lport, net_hash_mix(net), seed);
> +	return sctp_hashfn(sock_net(t->asoc->base.sk),
> +			   htons(t->asoc->base.bind_addr.port),
> +			   &t->ipaddr, seed);
>  }
>  
>  static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed)
>  {
>  	const struct sctp_hash_cmp_arg *x = data;
> -	const union sctp_addr *paddr = x->paddr;
> -	const struct net *net = x->net;
> -	__be16 lport = x->lport;
> -	__u32 addr;
> -
> -	if (paddr->sa.sa_family == AF_INET6)
> -		addr = jhash(&paddr->v6.sin6_addr, 16, seed);
> -	else
> -		addr = (__force __u32)paddr->v4.sin_addr.s_addr;
>  
> -	return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
> -			     (__force __u32)lport, net_hash_mix(net), seed);
> +	return sctp_hashfn(x->net, x->lport, x->paddr, seed);
>  }
>  
>  static const struct rhashtable_params sctp_hash_params = {
> -- 
> 2.1.0
> 

^ permalink raw reply

* Re: [PATCH net-next 0/3] sctp: add support for sk_reuseport
From: Marcelo Ricardo Leitner @ 2018-10-22 14:20 UTC (permalink / raw)
  To: Xin Long; +Cc: network dev, linux-sctp, Neil Horman, davem
In-Reply-To: <cover.1540095102.git.lucien.xin@gmail.com>

On Sun, Oct 21, 2018 at 12:43:35PM +0800, Xin Long wrote:
> sctp sk_reuseport allows multiple socks to listen on the same port and
> addresses, as long as these socks have the same uid. This works pretty
> much as TCP/UDP does, the only difference is that sctp is multi-homing
> and all the bind_addrs in these socks will have to completely matched,
> otherwise listen() will return err.
> 

FWIW, I won't be able to review this patchset thoroughly. The 2 small
comments that I sent are all I have.

Thanks,
Marcelo

^ permalink raw reply

* Re: [PATCH 1/1] ptp: ptp_dte: simplify getting .driver_data
From: Richard Cochran @ 2018-10-22 22:43 UTC (permalink / raw)
  To: Wolfram Sang; +Cc: linux-kernel, linux-renesas-soc, netdev
In-Reply-To: <20181021200039.1933-2-wsa+renesas@sang-engineering.com>

On Sun, Oct 21, 2018 at 10:00:39PM +0200, Wolfram Sang wrote:
> We should get 'driver_data' from 'struct device' directly. Going via
> platform_device is an unneeded step back and forth.

Acked-by: Richard Cochran <richardcochran@gmail.com>

^ permalink raw reply

* Re: [PATCH net-next 1/6] qed: Add doorbell overflow recovery mechanism
From: kbuild test robot @ 2018-10-22 14:26 UTC (permalink / raw)
  To: Ariel Elior
  Cc: kbuild-all, davem, netdev, Ariel Elior, Michal Kalderon,
	Tomer Tayar
In-Reply-To: <20181022122743.20384-2-Ariel.Elior@cavium.com>

[-- Attachment #1: Type: text/plain, Size: 2021 bytes --]

Hi Ariel,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Ariel-Elior/qed-Doorbell-overflow-recovery/20181022-212749
config: i386-randconfig-x005-201842 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All error/warnings (new ones prefixed by >>):

   In file included from drivers/net/ethernet/qlogic/qed/qed_dev.c:47:0:
   drivers/net/ethernet/qlogic/qed/qed_dev.c: In function 'qed_db_recovery_ring':
>> include/linux/qed/qed_if.h:466:40: error: implicit declaration of function 'writeq'; did you mean 'writel'? [-Werror=implicit-function-declaration]
    #define DIRECT_REG_WR64(reg_addr, val) writeq((u32)val, \
                                           ^
>> drivers/net/ethernet/qlogic/qed/qed_dev.c:342:4: note: in expansion of macro 'DIRECT_REG_WR64'
       DIRECT_REG_WR64(db_entry->db_addr,
       ^~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors
--
   In file included from drivers/net//ethernet/qlogic/qed/qed_dev.c:47:0:
   drivers/net//ethernet/qlogic/qed/qed_dev.c: In function 'qed_db_recovery_ring':
>> include/linux/qed/qed_if.h:466:40: error: implicit declaration of function 'writeq'; did you mean 'writel'? [-Werror=implicit-function-declaration]
    #define DIRECT_REG_WR64(reg_addr, val) writeq((u32)val, \
                                           ^
   drivers/net//ethernet/qlogic/qed/qed_dev.c:342:4: note: in expansion of macro 'DIRECT_REG_WR64'
       DIRECT_REG_WR64(db_entry->db_addr,
       ^~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +466 include/linux/qed/qed_if.h

   465	
 > 466	#define DIRECT_REG_WR64(reg_addr, val) writeq((u32)val,	\
   467						      (void __iomem *)(reg_addr))
   468	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 29652 bytes --]

^ permalink raw reply

* [PATCH bpf] bpf: add bpf_jit_limit knob to restrict unpriv allocations
From: Daniel Borkmann @ 2018-10-22 23:11 UTC (permalink / raw)
  To: ast
  Cc: rick.p.edgecombe, eric.dumazet, jannh, keescook, linux-kernel,
	netdev, Daniel Borkmann

Rick reported that the BPF JIT could potentially fill the entire module
space with BPF programs from unprivileged users which would prevent later
attempts to load normal kernel modules or privileged BPF programs, for
example. If JIT was enabled but unsuccessful to generate the image, then
before commit 290af86629b2 ("bpf: introduce BPF_JIT_ALWAYS_ON config")
we would always fall back to the BPF interpreter. Nowadays in the case
where the CONFIG_BPF_JIT_ALWAYS_ON could be set, then the load will abort
with a failure since the BPF interpreter was compiled out.

Add a global limit and enforce it for unprivileged users such that in case
of BPF interpreter compiled out we fail once the limit has been reached
or we fall back to BPF interpreter earlier w/o using module mem if latter
was compiled in. In a next step, fair share among unprivileged users can
be resolved in particular for the case where we would fail hard once limit
is reached.

Fixes: 290af86629b2 ("bpf: introduce BPF_JIT_ALWAYS_ON config")
Fixes: 0a14842f5a3c ("net: filter: Just In Time compiler for x86-64")
Co-Developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: LKML <linux-kernel@vger.kernel.org>
---
 Hi Rick, I've reworked the original patch into something much simpler
 which is only focussing on the actual main issue we want to resolve right
 now as a first step to make some forward progress, that is, limiting usage
 on the JIT for unprivileged users. Tested the below on x86 and arm64.
 (Trimmed down massive Cc list as well a bit and Cc'ed people related to
 commits referenced and netdev where BPF patches are usually discussed.)
 Thanks a lot!

 Documentation/sysctl/net.txt |  8 ++++++++
 include/linux/filter.h       |  1 +
 kernel/bpf/core.c            | 49 +++++++++++++++++++++++++++++++++++++++++---
 net/core/sysctl_net_core.c   | 10 +++++++--
 4 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 9ecde51..2793d4e 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -92,6 +92,14 @@ Values :
 	0 - disable JIT kallsyms export (default value)
 	1 - enable JIT kallsyms export for privileged users only
 
+bpf_jit_limit
+-------------
+
+This enforces a global limit for memory allocations to the BPF JIT
+compiler in order to reject unprivileged JIT requests once it has
+been surpassed. bpf_jit_limit contains the value of the global limit
+in bytes.
+
 dev_weight
 --------------
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 91b4c93..de629b7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -854,6 +854,7 @@ bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 extern int bpf_jit_enable;
 extern int bpf_jit_harden;
 extern int bpf_jit_kallsyms;
+extern int bpf_jit_limit;
 
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7c7eeea..6377225 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -365,10 +365,13 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
 }
 
 #ifdef CONFIG_BPF_JIT
+# define BPF_JIT_LIMIT_DEFAULT	(PAGE_SIZE * 40000)
+
 /* All BPF JIT sysctl knobs here. */
 int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
 int bpf_jit_harden   __read_mostly;
 int bpf_jit_kallsyms __read_mostly;
+int bpf_jit_limit    __read_mostly = BPF_JIT_LIMIT_DEFAULT;
 
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
@@ -577,27 +580,64 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 	return ret;
 }
 
+static atomic_long_t bpf_jit_current;
+
+#if defined(MODULES_VADDR)
+static int __init bpf_jit_charge_init(void)
+{
+	/* Only used as heuristic here to derive limit. */
+	bpf_jit_limit = min_t(u64, round_up((MODULES_END - MODULES_VADDR) >> 2,
+					    PAGE_SIZE), INT_MAX);
+	return 0;
+}
+pure_initcall(bpf_jit_charge_init);
+#endif
+
+static int bpf_jit_charge_modmem(u32 pages)
+{
+	if (atomic_long_add_return(pages, &bpf_jit_current) >
+	    (bpf_jit_limit >> PAGE_SHIFT)) {
+		if (!capable(CAP_SYS_ADMIN)) {
+			atomic_long_sub(pages, &bpf_jit_current);
+			return -EPERM;
+		}
+	}
+
+	return 0;
+}
+
+static void bpf_jit_uncharge_modmem(u32 pages)
+{
+	atomic_long_sub(pages, &bpf_jit_current);
+}
+
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 		     unsigned int alignment,
 		     bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	struct bpf_binary_header *hdr;
-	unsigned int size, hole, start;
+	u32 size, hole, start, pages;
 
 	/* Most of BPF filters are really small, but if some of them
 	 * fill a page, allow at least 128 extra bytes to insert a
 	 * random section of illegal instructions.
 	 */
 	size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
+	pages = size / PAGE_SIZE;
+
+	if (bpf_jit_charge_modmem(pages))
+		return NULL;
 	hdr = module_alloc(size);
-	if (hdr == NULL)
+	if (!hdr) {
+		bpf_jit_uncharge_modmem(pages);
 		return NULL;
+	}
 
 	/* Fill space with illegal/arch-dep instructions. */
 	bpf_fill_ill_insns(hdr, size);
 
-	hdr->pages = size / PAGE_SIZE;
+	hdr->pages = pages;
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
 		     PAGE_SIZE - sizeof(*hdr));
 	start = (get_random_int() % hole) & ~(alignment - 1);
@@ -610,7 +650,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 
 void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
+	u32 pages = hdr->pages;
+
 	module_memfree(hdr);
+	bpf_jit_uncharge_modmem(pages);
 }
 
 /* This symbol is only overridden by archs that have different
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index b1a2c5e..37b4667 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -279,7 +279,6 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
 	return ret;
 }
 
-# ifdef CONFIG_HAVE_EBPF_JIT
 static int
 proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
 				    void __user *buffer, size_t *lenp,
@@ -290,7 +289,6 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
 
 	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 }
-# endif
 #endif
 
 static struct ctl_table net_core_table[] = {
@@ -397,6 +395,14 @@ static struct ctl_table net_core_table[] = {
 		.extra2		= &one,
 	},
 # endif
+	{
+		.procname	= "bpf_jit_limit",
+		.data		= &bpf_jit_limit,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
+		.extra1		= &one,
+	},
 #endif
 	{
 		.procname	= "netdev_tstamp_prequeue",
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH 0/2] Mark expected switch fall-throughs and fix missing break
From: Gustavo A. R. Silva @ 2018-10-22 14:53 UTC (permalink / raw)
  To: Jes Sorensen, linux-kernel
  Cc: Kalle Valo, linux-wireless, David S. Miller, netdev
In-Reply-To: <a1b9b68a-ca94-f02f-4a87-bbd46782c41c@gmail.com>


On 10/22/18 4:36 PM, Jes Sorensen wrote:
> On 10/22/18 7:49 AM, Gustavo A. R. Silva wrote:
>> In preparation to enabling -Wimplicit-fallthrough, this patchset aims
>> to mark multiple switch cases where we are expecting to fall through.
>>
>> Also, the second patch in this series fixes a missing break in switch.
> 
> Enabling that sounds like a great way to inflict pain and suffering.
> 

Not really. The -Wimplicit-fallthrough will be enabled until after all the
current warnings have been addressed.

There are 600 of these issues left. So, hopefully I will complete this task
during the next development cycle.

Thanks

^ permalink raw reply

* Re: CRC errors between mvneta and macb
From: Richard Genoud @ 2018-10-22 15:15 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: linux-kernel, Thomas Petazzoni, Antoine Tenart, Gregory CLEMENT,
	Yelena Krivosheev, Maxime Chevallier, Nicolas Ferre, netdev
In-Reply-To: <c24489f6-3b9a-f26b-bc7f-b564d5567294@sorico.fr>

Le 22/10/2018 à 08:51, Richard Genoud a écrit :
> Le 19/10/2018 à 17:44, Willy Tarreau a écrit :
>> On Fri, Oct 19, 2018 at 05:15:03PM +0200, Richard Genoud wrote:
>>> When there's a CRC error, the TXCLK has its polarity inverted...
>>> That's a clue !
>>>
>>> But this TXCLK (25MHz) is not used on the g35-ek.
>>> Only the REFCLK/XT2 (50MHz) is used to synchronise the PHY and the macb.
>>> So I guess that the TXCLK has a role to play to generate TX+/TX-
>>
>> Well, just a stupid idea, maybe when this signal is inverted, the TX+/TX-
>> are desynchronized by half a clock and are not always properly interpreted
>> on the other side ?
>>
>> Willy
>>
> 
> I must admit that I'm not familiar with the PHY internals, I'll have to
> dig into that.
> 
> Richard.
> 

I dug more on the subject, and I think I found what Marvell's PHY/MAC
doesn't like.

First of all, I forced the liaison at 10Mbits full duplex on both sides,
as the Manchester code is "easier" to decode than the 4B5B-MLT3 used for
fast ethernet.

Fortunately, the FCS errors are still present on 10Mbits/s.

After analyzing the ethernet frame on the Davicom PHY's output (pin
TX+), I find out that the FCS errors occurs when the ethernet preamble
is longer than 56bits. (something like 58 or 60 bits)

To say this in another way, instead of having 28 times 1-0 followed by
the SFD (10101011), I see 29 or 30 times 1-0 followed by the SFD.
(sometimes 29, sometimes 30)


Should a longer preamble be considered as an FCS error ? It seems a
little harsh since the point of the preamble is to synchronize the frame.

I don't know what the 802.3 standard says about that.


Regards,
Richard.

^ permalink raw reply

* Re: [RFC PATCH v2 02/10] udp: implement GRO for plain UDP sockets.
From: Willem de Bruijn @ 2018-10-22 15:15 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: Network Development, Willem de Bruijn, steffen.klassert
In-Reply-To: <c046191abff49ddaef72b870182d4e922b8d80a2.camel@redhat.com>

On Mon, Oct 22, 2018 at 6:13 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Sun, 2018-10-21 at 16:06 -0400, Willem de Bruijn wrote:
> > On Fri, Oct 19, 2018 at 10:30 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > >
> > > This is the RX counterpart of commit bec1f6f69736 ("udp: generate gso
> > > with UDP_SEGMENT"). When UDP_GRO is enabled, such socket is also
> > > eligible for GRO in the rx path: UDP segments directed to such socket
> > > are assembled into a larger GSO_UDP_L4 packet.
> > >
> > > The core UDP GRO support is enabled with setsockopt(UDP_GRO).
> > >
> > > Initial benchmark numbers:
> > >
> > > Before:
> > > udp rx:   1079 MB/s   769065 calls/s
> > >
> > > After:
> > > udp rx:   1466 MB/s    24877 calls/s
> > >
> > >
> > > This change introduces a side effect in respect to UDP tunnels:
> > > after a UDP tunnel creation, now the kernel performs a lookup per ingress
> > > UDP packet, while before such lookup happened only if the ingress packet
> > > carried a valid internal header csum.
> > >
> > > v1 -> v2:
> > >  - use a new option to enable UDP GRO
> > >  - use static keys to protect the UDP GRO socket lookup
> > >
> > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > ---
> > >  include/linux/udp.h      |   3 +-
> > >  include/uapi/linux/udp.h |   1 +
> > >  net/ipv4/udp.c           |   7 +++
> > >  net/ipv4/udp_offload.c   | 109 +++++++++++++++++++++++++++++++--------
> > >  net/ipv6/udp_offload.c   |   6 +--
> > >  5 files changed, 98 insertions(+), 28 deletions(-)
> > >
> > > diff --git a/include/linux/udp.h b/include/linux/udp.h
> > > index a4dafff407fb..f613b329852e 100644
> > > --- a/include/linux/udp.h
> > > +++ b/include/linux/udp.h
> > > @@ -50,11 +50,12 @@ struct udp_sock {
> > >         __u8             encap_type;    /* Is this an Encapsulation socket? */
> > >         unsigned char    no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
> > >                          no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
> > > -                        encap_enabled:1; /* This socket enabled encap
> > > +                        encap_enabled:1, /* This socket enabled encap
> > >                                            * processing; UDP tunnels and
> > >                                            * different encapsulation layer set
> > >                                            * this
> > >                                            */
> > > +                        gro_enabled:1; /* Can accept GRO packets */
> > >
> > >         /*
> > >          * Following member retains the information to create a UDP header
> > >          * when the socket is uncorked.
> > > diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
> > > index 09502de447f5..30baccb6c9c4 100644
> > > --- a/include/uapi/linux/udp.h
> > > +++ b/include/uapi/linux/udp.h
> > > @@ -33,6 +33,7 @@ struct udphdr {
> > >  #define UDP_NO_CHECK6_TX 101   /* Disable sending checksum for UDP6X */
> > >  #define UDP_NO_CHECK6_RX 102   /* Disable accpeting checksum for UDP6 */
> > >  #define UDP_SEGMENT    103     /* Set GSO segmentation size */
> > > +#define UDP_GRO                104     /* This socket can receive UDP GRO packets */
> > >
> > >  /* UDP encapsulation types */
> > >  #define UDP_ENCAP_ESPINUDP_NON_IKE     1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
> > > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> > > index 9fcb5374e166..3c277378814f 100644
> > > --- a/net/ipv4/udp.c
> > > +++ b/net/ipv4/udp.c
> > > @@ -115,6 +115,7 @@
> > >  #include "udp_impl.h"
> > >  #include <net/sock_reuseport.h>
> > >  #include <net/addrconf.h>
> > > +#include <net/udp_tunnel.h>
> > >
> > >  struct udp_table udp_table __read_mostly;
> > >  EXPORT_SYMBOL(udp_table);
> > > @@ -2459,6 +2460,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
> > >                 up->gso_size = val;
> > >                 break;
> > >
> > > +       case UDP_GRO:
> > > +               if (valbool)
> > > +                       udp_tunnel_encap_enable(sk->sk_socket);
> > > +               up->gro_enabled = valbool;
> >
> > The socket lock is not held here, so multiple updates to
> > up->gro_enabled and the up->encap_enabled and the static branch can
> > race. Syzkaller is adept at generating those.
>
> Good catch. I was fooled by the current existing code. I think there
> are potentially similar issues for UDP_ENCAP, UDPLITE_SEND_CSCOV, ...
>
> Since the rx path don't take it anymore and we don't risk starving, I
> think we should could/always acquire the socket lock on setsockopt,
> wdyt?

Agreed. We had to add a lot of those in packet_setsockopt for the same reason.

^ permalink raw reply

* [PATCH RFC net-next 2/3] tcp: BPF_TCP_INFO_NOTIFY support
From: Sowmini Varadhan @ 2018-10-22 15:23 UTC (permalink / raw)
  To: sowmini.varadhan, netdev; +Cc: edumazet, brakmo
In-Reply-To: <cover.1540220847.git.sowmini.varadhan@oracle.com>

We want to be able to set up the monitoring application so that it can
be aysnchronously notified when "interesting" events happen, e.g., when
application-determined thresholds on parameters like RTT estimate, number
of retransmissions, RTO are reached.

The bpf_sock_ops infrastructure provided as part of Commit 40304b2a1567
("bpf: BPF support for sock_ops") provides an elegant way to trigger
this asynchronous notification. The BPF program can examine the
current TCP state reported in the bpf_sock_ops and conditionally
return a (new) status BPF_TCP_INFO_NOTIFY. The return status is used
by the caller to queue up a tcp_info notification for the application.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
 include/net/tcp.h        |   15 +++++++++++++--
 include/uapi/linux/bpf.h |    4 ++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0d29292..df06a9f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -47,6 +47,7 @@
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
 #include <linux/bpf-cgroup.h>
+#include <linux/sock_diag.h>
 
 extern struct inet_hashinfo tcp_hashinfo;
 
@@ -2065,6 +2066,12 @@ struct tcp_ulp_ops {
 	__MODULE_INFO(alias, alias_userspace, name);		\
 	__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
 
+#define	TCPDIAG_CB(sk)							\
+do {									\
+	if (unlikely(sk->sk_net_refcnt && sock_diag_has_listeners(sk)))	\
+		sock_diag_broadcast(sk);				\
+} while (0)
+
 /* Call BPF_SOCK_OPS program that returns an int. If the return value
  * is < 0, then the BPF op failed (for example if the loaded BPF
  * program does not support the chosen operation or there is no BPF
@@ -2088,9 +2095,13 @@ static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 		memcpy(sock_ops.args, args, nargs * sizeof(*args));
 
 	ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
-	if (ret == 0)
+	if (ret == 0) {
 		ret = sock_ops.reply;
-	else
+
+		/* XXX would be nice if we could use replylong[1] here */
+		if (ret == BPF_TCP_INFO_NOTIFY)
+			TCPDIAG_CB(sk);
+	} else
 		ret = -1;
 	return ret;
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa5ccd2..bc45e5e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2678,6 +2678,10 @@ enum {
 	BPF_TCP_MAX_STATES	/* Leave at the end! */
 };
 
+enum {
+	BPF_TCP_INFO_NOTIFY = 2
+};
+
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
 #define TCP_BPF_SNDCWND_CLAMP	1002	/* Set sndcwnd_clamp */
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH RFC net-next 1/3] sock_diag: Refactor inet_sock_diag_destroy code
From: Sowmini Varadhan @ 2018-10-22 15:23 UTC (permalink / raw)
  To: sowmini.varadhan, netdev; +Cc: edumazet, brakmo
In-Reply-To: <cover.1540220847.git.sowmini.varadhan@oracle.com>

We want to use the inet_sock_diag_destroy code to send notifications
for more types of TCP events than just socket_close(), so refactor
the code to allow this.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
 include/linux/sock_diag.h      |   18 +++++++++++++-----
 include/uapi/linux/sock_diag.h |    2 ++
 net/core/sock.c                |    4 ++--
 net/core/sock_diag.c           |   11 ++++++-----
 4 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 15fe980..df85767 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -34,7 +34,7 @@ int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
 			     struct sk_buff *skb, int attrtype);
 
 static inline
-enum sknetlink_groups sock_diag_destroy_group(const struct sock *sk)
+enum sknetlink_groups sock_diag_group(const struct sock *sk)
 {
 	switch (sk->sk_family) {
 	case AF_INET:
@@ -43,7 +43,15 @@ enum sknetlink_groups sock_diag_destroy_group(const struct sock *sk)
 
 		switch (sk->sk_protocol) {
 		case IPPROTO_TCP:
-			return SKNLGRP_INET_TCP_DESTROY;
+			switch (sk->sk_state) {
+			case TCP_ESTABLISHED:
+				return SKNLGRP_INET_TCP_CONNECTED;
+			case TCP_SYN_SENT:
+			case TCP_SYN_RECV:
+				return SKNLGRP_INET_TCP_3WH;
+			default:
+				return SKNLGRP_INET_TCP_DESTROY;
+			}
 		case IPPROTO_UDP:
 			return SKNLGRP_INET_UDP_DESTROY;
 		default:
@@ -67,15 +75,15 @@ enum sknetlink_groups sock_diag_destroy_group(const struct sock *sk)
 }
 
 static inline
-bool sock_diag_has_destroy_listeners(const struct sock *sk)
+bool sock_diag_has_listeners(const struct sock *sk)
 {
 	const struct net *n = sock_net(sk);
-	const enum sknetlink_groups group = sock_diag_destroy_group(sk);
+	const enum sknetlink_groups group = sock_diag_group(sk);
 
 	return group != SKNLGRP_NONE && n->diag_nlsk &&
 		netlink_has_listeners(n->diag_nlsk, group);
 }
-void sock_diag_broadcast_destroy(struct sock *sk);
+void sock_diag_broadcast(struct sock *sk);
 
 int sock_diag_destroy(struct sock *sk, int err);
 #endif
diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h
index e592500..4252674 100644
--- a/include/uapi/linux/sock_diag.h
+++ b/include/uapi/linux/sock_diag.h
@@ -32,6 +32,8 @@ enum sknetlink_groups {
 	SKNLGRP_INET_UDP_DESTROY,
 	SKNLGRP_INET6_TCP_DESTROY,
 	SKNLGRP_INET6_UDP_DESTROY,
+	SKNLGRP_INET_TCP_3WH,
+	SKNLGRP_INET_TCP_CONNECTED,
 	__SKNLGRP_MAX,
 };
 #define SKNLGRP_MAX	(__SKNLGRP_MAX - 1)
diff --git a/net/core/sock.c b/net/core/sock.c
index 7e8796a..6684840 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1600,8 +1600,8 @@ static void __sk_free(struct sock *sk)
 	if (likely(sk->sk_net_refcnt))
 		sock_inuse_add(sock_net(sk), -1);
 
-	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
-		sock_diag_broadcast_destroy(sk);
+	if (unlikely(sk->sk_net_refcnt && sock_diag_has_listeners(sk)))
+		sock_diag_broadcast(sk);
 	else
 		sk_destruct(sk);
 }
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 3312a58..dbd9e65 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -116,14 +116,14 @@ static size_t sock_diag_nlmsg_size(void)
 	       + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
 }
 
-static void sock_diag_broadcast_destroy_work(struct work_struct *work)
+static void sock_diag_broadcast_work(struct work_struct *work)
 {
 	struct broadcast_sk *bsk =
 		container_of(work, struct broadcast_sk, work);
 	struct sock *sk = bsk->sk;
 	const struct sock_diag_handler *hndl;
 	struct sk_buff *skb;
-	const enum sknetlink_groups group = sock_diag_destroy_group(sk);
+	const enum sknetlink_groups group = sock_diag_group(sk);
 	int err = -1;
 
 	WARN_ON(group == SKNLGRP_NONE);
@@ -144,11 +144,12 @@ static void sock_diag_broadcast_destroy_work(struct work_struct *work)
 	else
 		kfree_skb(skb);
 out:
-	sk_destruct(sk);
+	if (group <= SKNLGRP_INET6_UDP_DESTROY)
+		sk_destruct(sk);
 	kfree(bsk);
 }
 
-void sock_diag_broadcast_destroy(struct sock *sk)
+void sock_diag_broadcast(struct sock *sk)
 {
 	/* Note, this function is often called from an interrupt context. */
 	struct broadcast_sk *bsk =
@@ -156,7 +157,7 @@ void sock_diag_broadcast_destroy(struct sock *sk)
 	if (!bsk)
 		return sk_destruct(sk);
 	bsk->sk = sk;
-	INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work);
+	INIT_WORK(&bsk->work, sock_diag_broadcast_work);
 	queue_work(broadcast_wq, &bsk->work);
 }
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH RFC net-next 3/3] bpf: Added a sample for tcp_info_notify callback
From: Sowmini Varadhan @ 2018-10-22 15:24 UTC (permalink / raw)
  To: sowmini.varadhan, netdev; +Cc: edumazet, brakmo
In-Reply-To: <cover.1540220847.git.sowmini.varadhan@oracle.com>

Simple Proof-Of-Concept test program for BPF_TCP_INFO_NOTIFY
(will move this to testing/selftests/net later)

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
 samples/bpf/Makefile          |    1 +
 samples/bpf/tcp_notify_kern.c |   73 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 0 deletions(-)
 create mode 100644 samples/bpf/tcp_notify_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index be0a961..d937bd2 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -152,6 +152,7 @@ always += tcp_bufs_kern.o
 always += tcp_cong_kern.o
 always += tcp_iw_kern.o
 always += tcp_clamp_kern.o
+always += tcp_notify_kern.o
 always += tcp_basertt_kern.o
 always += tcp_tos_reflect_kern.o
 always += xdp_redirect_kern.o
diff --git a/samples/bpf/tcp_notify_kern.c b/samples/bpf/tcp_notify_kern.c
new file mode 100644
index 0000000..bc4efd8
--- /dev/null
+++ b/samples/bpf/tcp_notify_kern.c
@@ -0,0 +1,73 @@
+/* Sample BPF program to demonstrate how to triger async tcp_info
+ * notification based on thresholds determeined in the filter.
+ * The example here will trigger notification if  skops->total_retrans > 16
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 0
+
+#define bpf_printk(fmt, ...)					\
+({								\
+	       char ____fmt[] = fmt;				\
+	       bpf_trace_printk(____fmt, sizeof(____fmt),	\
+				##__VA_ARGS__);			\
+})
+
+SEC("sockops")
+int bpf_tcp_info_notify(struct bpf_sock_ops *skops)
+{
+	int bufsize = 150000;
+	int to_init = 10;
+	int clamp = 100;
+	int rv = 0;
+	int op;
+
+	/* For testing purposes, only execute rest of BPF program
+	 * if neither port numberis 5001 (default iperf port)
+	 */
+	if (bpf_ntohl(skops->remote_port) != 5001 &&
+	    skops->local_port != 5001) {
+		skops->reply = -1;
+		return 0;
+	}
+
+	op = (int) skops->op;
+
+#ifdef DEBUG
+	bpf_printk("BPF command: %d\n", op);
+#endif
+
+	switch (op) {
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		bpf_sock_ops_cb_flags_set(skops,
+			(BPF_SOCK_OPS_RETRANS_CB_FLAG|
+			BPF_SOCK_OPS_RTO_CB_FLAG));
+		rv = 1;
+		break;
+	case BPF_SOCK_OPS_RETRANS_CB:
+	case BPF_SOCK_OPS_RTO_CB:
+		if (skops->total_retrans < 16)
+			rv = 1; /* skip */
+		else
+			rv = BPF_TCP_INFO_NOTIFY;
+		break;
+	default:
+		rv = -1;
+	}
+#ifdef DEBUG
+	bpf_printk("Returning %d\n", rv);
+#endif
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
-- 
1.7.1

^ permalink raw reply related

* [PATCH RFC net-next 0/3] Extensions to allow asynchronous TCP_INFO notifications based on congestion parameters
From: Sowmini Varadhan @ 2018-10-22 15:23 UTC (permalink / raw)
  To: sowmini.varadhan, netdev; +Cc: edumazet, brakmo

Problem statement:
  We would like to monitor some subset of TCP sockets in user-space,
  (the monitoring application would define 4-tuples it wants to monitor)
  using TCP_INFO stats to analyze reported problems. The idea is to
  use those stats to see where the bottlenecks are likely to be ("is it
  application-limited?" or "is there evidence of BufferBloat in the
  path?" etc)

  Today we can do this by periodically polling for tcp_info, but this
  could be made more efficient if the kernel would asynchronously
  notify the application via tcp_info when some "interesting"
  thresholds (e.g., "RTT variance > X", or "total_retrans > Y" etc)
  are reached. And to make this effective, it is better if
  we could apply the threshold check *before* constructing the
  tcp_info netlink notification, so that we don't waste resources
  constructing notifications that will be discarded by the filter.

One idea, implemented in this patchset, is to extend the tcp_call_bpf()
infra so that the BPF kernel module (the sock_ops filter/callback)
can examine the values in the sock_ops, apply any thresholds it wants,
and return some new status ("BPF_TCP_INFO_NOTIFY"). Use this status in
the tcp stack to queue up a tcp_info notification (similar to
sock_diag_broadcast_destroy() today..)

Patch 1 in this set refactors the existing sock_diag code so that
the functions can be reused for notifications from other states than CLOSE.

Patch 2 provides a minor extension to tcp_call_bpf() so that it
will queue a tcp_info_notification if the BPF callout returns 
BPF_TCP_INFO_NOTIFY

Patch 3, provided strictly as a demonstration/PoC to aid in reviewing
this proposal, shows a simple sample/bpf example where we trigger the
tcp_info notification for an iperf connection if the number of 
retransmits exceeds 16.


Sowmini Varadhan (3):
  sock_diag: Refactor inet_sock_diag_destroy code
  tcp: BPF_TCP_INFO_NOTIFY support
  bpf: Added a sample for tcp_info_notify callback

 include/linux/sock_diag.h      |   18 +++++++---
 include/net/tcp.h              |   15 +++++++-
 include/uapi/linux/bpf.h       |    4 ++
 include/uapi/linux/sock_diag.h |    2 +
 net/core/sock.c                |    4 +-
 net/core/sock_diag.c           |   11 +++---
 samples/bpf/Makefile           |    1 +
 samples/bpf/tcp_notify_kern.c  |   73 ++++++++++++++++++++++++++++++++++++++++
 8 files changed, 114 insertions(+), 14 deletions(-)
 create mode 100644 samples/bpf/tcp_notify_kern.c

^ permalink raw reply

* [PATCH bpf] bpf: devmap: fix wrong interface selection in notifier_call
From: Taehee Yoo @ 2018-10-22 15:41 UTC (permalink / raw)
  To: daniel, ast; +Cc: netdev, ap420073

The dev_map_notification() removes interface in devmap if
unregistering interface's ifindex is same.
But only checking ifindex is not enough because other netns can have
same ifindex. so that wrong interface selection could occurred.
Hence the net_eq() is needed.

Fixes: 2ddf71e23cc2 ("net: add notifier hooks for devmap bpf map")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 kernel/bpf/devmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 141710b82a6c..0d9211e49a4a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -496,6 +496,7 @@ static int dev_map_notification(struct notifier_block *notifier,
 				ulong event, void *ptr)
 {
 	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+	struct net *net = dev_net(netdev);
 	struct bpf_dtab *dtab;
 	int i;
 
@@ -512,7 +513,7 @@ static int dev_map_notification(struct notifier_block *notifier,
 				struct bpf_dtab_netdev *dev, *odev;
 
 				dev = READ_ONCE(dtab->netdev_map[i]);
-				if (!dev ||
+				if (!dev || !net_eq(net, dev_net(dev->dev)) ||
 				    dev->dev->ifindex != netdev->ifindex)
 					continue;
 				odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
-- 
2.17.1

^ permalink raw reply related

* Re: [RFC PATCH v2 03/10] udp: add support for UDP_GRO cmsg
From: Paolo Abeni @ 2018-10-22 15:44 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Willem de Bruijn, steffen.klassert
In-Reply-To: <CAF=yD-+JOJJX+vOGuV3z3aYNEdjNQ=U0+OaZdeuVvOTrRa-oRA@mail.gmail.com>

On Sun, 2018-10-21 at 16:07 -0400, Willem de Bruijn wrote:
> On Fri, Oct 19, 2018 at 10:30 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > 
> > When UDP GRO is enabled, the UDP_GRO cmsg will carry the ingress
> > datagram size. User-space can use such info to compute the original
> > packets layout.
> > 
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > CHECK: should we use a separate setsockopt to explicitly enable
> > gso_size cmsg reception? So that user space can enable UDP_GRO and
> > fetch cmsg without forcefully receiving GRO related info.
> 
> A user can avoid the message by not passing control data. Though in
> most practical cases it seems unsafe to do so, anyway, as the path MTU
> can be lower than the expected device MTU.
> 
> > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> > index 3c277378814f..2331ac9de954 100644
> > --- a/net/ipv4/udp.c
> > +++ b/net/ipv4/udp.c
> > @@ -1714,6 +1714,10 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
> >                 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
> >                 *addr_len = sizeof(*sin);
> >         }
> > +
> > +       if (udp_sk(sk)->gro_enabled)
> > +               udp_cmsg_recv(msg, sk, skb);
> > +
> 
> Perhaps we can avoid adding a branch by setting a bit in
> inet->cmsg_flags for gso_size to let the below branch handle the cmsg
> processing.

Uhmm... I think that for ipv6 sockets we need to set a bit in rxopt
instead (and we already have some conditionals we could for ipv6 socket
recv cmsg processing).

> I'd still set that as part of the UDP_GRO setsockopt. Though if you
> insist it could be a value 2 instead of 1, effectively allowing the
> above mentioned opt-out.

I'm ok with the current impl (no additional value to opt-in the UDP GRO
cmsg).

Cheers,

Paolo

^ permalink raw reply

* Re: [PATCH] wireless: mark expected switch fall-throughs
From: Gustavo A. R. Silva @ 2018-10-23  0:07 UTC (permalink / raw)
  To: Johannes Berg, David S. Miller
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1530880164.3197.38.camel-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>



On 7/6/18 2:29 PM, Johannes Berg wrote:
> Hi Gustavo,
> 
>> In preparation to enabling -Wimplicit-fallthrough, mark switch cases
>> where we are expecting to fall through.
> 
> You dropped the remark saying you didn't review them, but did you?
> 

I'll add it in v2.

>>  	case NL80211_CHAN_WIDTH_20:
>>  		if (!ht_cap->ht_supported)
>>  			return false;
>> +		/* else: fall through */
> 
> What's the point in else:?
> 
> We also don't necessarily write
> 
> if (!...)
>   return false;
> else
>   do_something();
> 
> but rather
> 
> if (!...)
>   return false;
> do_something().
> 
> I think I'd prefer without the "else:"
> 

Sure thing. I'll change this in v2.

I'll send v2 shortly.

Thanks for the feedback.
--
Gustavo

^ permalink raw reply

* Re: [PATCH net-next 3/4] net: phy-c45: Implement reset/suspend/resume callbacks
From: Russell King - ARM Linux @ 2018-10-22 15:48 UTC (permalink / raw)
  To: Jose Abreu
  Cc: Andrew Lunn, netdev, Florian Fainelli, David S. Miller,
	Joao Pinto
In-Reply-To: <8114f345-0efb-9a64-b867-2cfe2fba09ab@synopsys.com>

On Mon, Oct 22, 2018 at 01:47:48PM +0100, Jose Abreu wrote:
> Hello,
> 
> On 22-10-2018 13:28, Andrew Lunn wrote:
> >>  EXPORT_SYMBOL_GPL(gen10g_resume);
> >> @@ -327,7 +381,7 @@ struct phy_driver genphy_10g_driver = {
> >>  	.phy_id         = 0xffffffff,
> >>  	.phy_id_mask    = 0xffffffff,
> >>  	.name           = "Generic 10G PHY",
> >> -	.soft_reset	= gen10g_no_soft_reset,
> >> +	.soft_reset	= gen10g_soft_reset,
> >>  	.config_init    = gen10g_config_init,
> >>  	.features       = 0,
> >>  	.aneg_done	= genphy_c45_aneg_done,
> > Hi Jose
> >
> > You need to be careful here. There is a reason this is called
> > gen10g_no_soft_reset, rather than having an empty
> > gen10g_soft_reset. Some PHYs break when you do a reset.  So adding a
> > gen10g_soft_reset is fine, but don't change this here, without first
> > understanding the history, and talking to Russell King.
> 
> Hmm, the reset function only interacts with standard PCS
> registers, which should always be available ...
> 
> >From my tests I need to do at least 1 reset during power-up so in
> ultimate case I can add a feature quirk or similar.
> 
> Russell, can you please comment ?

Setting the reset bit on 88x3310 causes the entire device to become
completely inaccessible until hardware reset.  Therefore, this bit
must _never_ be set for these devices.  That said, we have a separate
driver for these PHYs, but that will only be used for them if it's
present in the kernel.  If we accidentally fall back to the generic
driver, then we'll screw the 88x3310 until a full hardware reset.

We also have a bunch of net devices that make use of this crippled
"generic" 10G support - we don't know whether resetting the PHY
for those systems will cause a regression - maybe board firmware
already configured the PHY?  I can't say either way on that, except
that we've had crippled 10G support in PHYLIB for a number of years
now _with_ users, and adding reset support drastically changes the
subsystem's behaviour for these users.

I would recommend not touching the generic 10G driver, but instead
implement your own driver for your PHY to avoid causing regressions.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 12.1Mbps down 622kbps up
According to speedtest.net: 11.9Mbps down 500kbps up

^ permalink raw reply

* Re: [RFC PATCH v2 02/10] udp: implement GRO for plain UDP sockets.
From: Willem de Bruijn @ 2018-10-22 15:51 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: steffen.klassert, Network Development, Willem de Bruijn
In-Reply-To: <b69077c93ec047845d4b22a57fa6f89b63c0639c.camel@redhat.com>

> >
> > > +static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
> > > +                                          struct sk_buff *skb)
> > > +{
> > > +   struct udphdr *uh = udp_hdr(skb);
> > > +   struct sk_buff *pp = NULL;
> > > +   struct udphdr *uh2;
> > > +   struct sk_buff *p;
> > > +
> > > +   /* requires non zero csum, for simmetry with GSO */
> > > +   if (!uh->check) {
> > > +           NAPI_GRO_CB(skb)->flush = 1;
> > > +           return NULL;
> > > +   }
> >
> > Why is the requirement of checksums different than in
> > udp_gro_receive? It's not that I care much about UDP
> > packets without a checksum, but you would not need
> > to implement your own loop if the requirement could
> > be the same as in udp_gro_receive.

It would be nice if we could deduplicate the loops, but even without
the checksum difference they look to me a bit too different for it to be
practical, also with the constraints on segment length and max aggregation.

> uhm....
> AFAIU, we need to generated aggregated packets that UDP GSO is able to
> process/segment. I was unable to get a nocsum packet segment (possibly
> PEBKAC) so I enforced that condition on the rx path.
>
> @Willem: did I see ghost here? is UDP_SEGMENT fine with no checksum
> segment?

udp_send_skb fails with EIO if ip_summed is anything but CHECKSUM_PARTIAL.

but that's not in the forwarding path. Still, __udp_gso_segment as is
depends on that invariant and will not handle packets with zero
checksum correctly. It unconditionally adjusts uh->check. That could
be changed, of course.

^ permalink raw reply

* [PATCH v2] wireless: mark expected switch fall-throughs
From: Gustavo A. R. Silva @ 2018-10-23  0:13 UTC (permalink / raw)
  To: Johannes Berg, David S. Miller
  Cc: linux-wireless, netdev, linux-kernel, Gustavo A. R. Silva,
	Kees Cook

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Warning level 3 was used: -Wimplicit-fallthrough=3

This code was not tested and GCC 7.2.0 was used to compile it.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
Changes in v2:
 - Explicity mention in the commit log that this the code was not
   tested.
 - Use warning level 3 instead of 2.
 - Change the form "else: fall through" to "fall through" instead.
 - Address -Wimplicit-fallthrough warnings in net/wireless/scan.c

 net/wireless/chan.c        | 2 ++
 net/wireless/nl80211.c     | 9 +++++++++
 net/wireless/scan.c        | 2 +-
 net/wireless/wext-compat.c | 2 ++
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 2db713d..0a45265 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -747,6 +747,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
 	case NL80211_CHAN_WIDTH_20:
 		if (!ht_cap->ht_supported)
 			return false;
+		/* fall through */
 	case NL80211_CHAN_WIDTH_20_NOHT:
 		prohibited_flags |= IEEE80211_CHAN_NO_20MHZ;
 		width = 20;
@@ -769,6 +770,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
 		cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
 		if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ)
 			return false;
+		/* fall through */
 	case NL80211_CHAN_WIDTH_80:
 		if (!vht_cap->vht_supported)
 			return false;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 744b585..0276370 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1706,6 +1706,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		state->split_start++;
 		if (state->split)
 			break;
+		/* fall through */
 	case 1:
 		if (nla_put(msg, NL80211_ATTR_CIPHER_SUITES,
 			    sizeof(u32) * rdev->wiphy.n_cipher_suites,
@@ -1752,6 +1753,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		state->split_start++;
 		if (state->split)
 			break;
+		/* fall through */
 	case 2:
 		if (nl80211_put_iftypes(msg, NL80211_ATTR_SUPPORTED_IFTYPES,
 					rdev->wiphy.interface_modes))
@@ -1759,6 +1761,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		state->split_start++;
 		if (state->split)
 			break;
+		/* fall through */
 	case 3:
 		nl_bands = nla_nest_start(msg, NL80211_ATTR_WIPHY_BANDS);
 		if (!nl_bands)
@@ -1784,6 +1787,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 				state->chan_start++;
 				if (state->split)
 					break;
+				/* fall through */
 			default:
 				/* add frequencies */
 				nl_freqs = nla_nest_start(
@@ -1837,6 +1841,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 			state->split_start++;
 		if (state->split)
 			break;
+		/* fall through */
 	case 4:
 		nl_cmds = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_COMMANDS);
 		if (!nl_cmds)
@@ -1863,6 +1868,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		state->split_start++;
 		if (state->split)
 			break;
+		/* fall through */
 	case 5:
 		if (rdev->ops->remain_on_channel &&
 		    (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) &&
@@ -1880,6 +1886,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		state->split_start++;
 		if (state->split)
 			break;
+		/* fall through */
 	case 6:
 #ifdef CONFIG_PM
 		if (nl80211_send_wowlan(msg, rdev, state->split))
@@ -1890,6 +1897,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 #else
 		state->split_start++;
 #endif
+		/* fall through */
 	case 7:
 		if (nl80211_put_iftypes(msg, NL80211_ATTR_SOFTWARE_IFTYPES,
 					rdev->wiphy.software_iftypes))
@@ -1902,6 +1910,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		state->split_start++;
 		if (state->split)
 			break;
+		/* fall through */
 	case 8:
 		if ((rdev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME) &&
 		    nla_put_u32(msg, NL80211_ATTR_DEVICE_AP_SME,
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index d0e7472..0f0b519 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1183,7 +1183,7 @@ cfg80211_inform_bss_data(struct wiphy *wiphy,
 	switch (ftype) {
 	case CFG80211_BSS_FTYPE_BEACON:
 		ies->from_beacon = true;
-		/* fall through to assign */
+		/* fall through - to assign */
 	case CFG80211_BSS_FTYPE_UNKNOWN:
 		rcu_assign_pointer(tmp.pub.beacon_ies, ies);
 		break;
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 06943d9..d522787 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -1337,6 +1337,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
 			wstats.qual.qual = sig + 110;
 			break;
 		}
+		/* fall through */
 	case CFG80211_SIGNAL_TYPE_UNSPEC:
 		if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) {
 			wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED;
@@ -1345,6 +1346,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
 			wstats.qual.qual = sinfo.signal;
 			break;
 		}
+		/* fall through */
 	default:
 		wstats.qual.updated |= IW_QUAL_LEVEL_INVALID;
 		wstats.qual.updated |= IW_QUAL_QUAL_INVALID;
-- 
2.7.4

^ permalink raw reply related

* Re: [RFC PATCH v2 06/10] udp: cope with UDP GRO packet misdirection
From: Willem de Bruijn @ 2018-10-22 16:00 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: Network Development, Willem de Bruijn, steffen.klassert
In-Reply-To: <bde22308f9107b25e3e21f87090be02e0367330b.camel@redhat.com>

On Mon, Oct 22, 2018 at 6:29 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Sun, 2018-10-21 at 16:08 -0400, Willem de Bruijn wrote:
> > On Fri, Oct 19, 2018 at 10:31 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > >
> > > In some scenarios, the GRO engine can assemble an UDP GRO packet
> > > that ultimately lands on a non GRO-enabled socket.
> > > This patch tries to address the issue explicitly checking for the UDP
> > > socket features before enqueuing the packet, and eventually segmenting
> > > the unexpected GRO packet, as needed.
> > >
> > > We must also cope with re-insertion requests: after segmentation the
> > > UDP code calls the helper introduced by the previous patches, as needed.
> > >
> > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > ---
> > > +static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
> > > +                                             struct sk_buff *skb)
> > > +{
> > > +       struct sk_buff *segs;
> > > +
> > > +       /* the GSO CB lays after the UDP one, no need to save and restore any
> > > +        * CB fragment, just initialize it
> > > +        */
> > > +       segs = __skb_gso_segment(skb, NETIF_F_SG, false);
> > > +       if (unlikely(IS_ERR(segs)))
> > > +               kfree_skb(skb);
> > > +       else if (segs)
> > > +               consume_skb(skb);
> > > +       return segs;
> > > +}
> > > +
> > > +
> > > +void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto);
> > > +
> > > +static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
> > > +{
> > > +       struct sk_buff *next, *segs;
> > > +       int ret;
> > > +
> > > +       if (likely(!udp_unexpected_gso(sk, skb)))
> > > +               return udp_queue_rcv_one_skb(sk, skb);
> > > +
> > > +       BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
> > > +       __skb_push(skb, -skb_mac_offset(skb));
> > > +       segs = udp_rcv_segment(sk, skb);
> > > +       for (skb = segs; skb; skb = next) {
> >
> > need to check IS_ERR(segs) again?
>
> whooops ... yes, I think so, thanks for catching it.
>
> Since the error code is always discarded, perhpas udp_rcv_segment() can
> simply return 0 when IS_ERR(segs) is true, so we can save a conditional
> here. This is currently a slower/exceptional path, but if we will
> enable UDP GRO for forwaded packets, it will be hit often.

That sounds fine.

In udp_rcv_segment, we should probably account the dropped segments
to UDP_MIB_INERRORS and sk_drops.

^ permalink raw reply

* Re: [RFC PATCH v2 01/10] udp: implement complete book-keeping for encap_needed
From: Willem de Bruijn @ 2018-10-22 16:06 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: Network Development, Willem de Bruijn, steffen.klassert
In-Reply-To: <a9c6ce40a0848775c7490dcd2722c33c691732ea.1539957909.git.pabeni@redhat.com>

On Fri, Oct 19, 2018 at 10:30 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> The *encap_needed static keys are enabled by UDP tunnels
> and several UDP encapsulations type, but they are never
> turned off. This can cause unneeded overall performance
> degradation for systems where such features are used
> transiently.
>
> This patch introduces complete book-keeping for such keys,
> decreasing the usage at socket destruction time, if needed,
> and avoiding that the same socket could increase the key
> usage multiple times.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
>  include/linux/udp.h      |  7 ++++++-
>  include/net/udp_tunnel.h |  6 ++++++
>  net/ipv4/udp.c           | 18 ++++++++++++------
>  net/ipv6/udp.c           | 14 +++++++++-----
>  4 files changed, 33 insertions(+), 12 deletions(-)
>
> diff --git a/include/linux/udp.h b/include/linux/udp.h
> index 320d49d85484..a4dafff407fb 100644
> --- a/include/linux/udp.h
> +++ b/include/linux/udp.h
> @@ -49,7 +49,12 @@ struct udp_sock {
>         unsigned int     corkflag;      /* Cork is required */
>         __u8             encap_type;    /* Is this an Encapsulation socket? */
>         unsigned char    no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
> -                        no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */
> +                        no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
> +                        encap_enabled:1; /* This socket enabled encap
> +                                          * processing; UDP tunnels and
> +                                          * different encapsulation layer set
> +                                          * this
> +                                          */
>         /*
>          * Following member retains the information to create a UDP header
>          * when the socket is uncorked.
> diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
> index fe680ab6b15a..3fbe56430e3b 100644
> --- a/include/net/udp_tunnel.h
> +++ b/include/net/udp_tunnel.h
> @@ -165,6 +165,12 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
>
>  static inline void udp_tunnel_encap_enable(struct socket *sock)
>  {
> +       struct udp_sock *up = udp_sk(sock->sk);
> +
> +       if (up->encap_enabled)
> +               return;
> +
> +       up->encap_enabled = 1;
>  #if IS_ENABLED(CONFIG_IPV6)
>         if (sock->sk->sk_family == PF_INET6)
>                 ipv6_stub->udpv6_encap_enable();
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index cf8252d05a01..9fcb5374e166 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -2382,11 +2382,15 @@ void udp_destroy_sock(struct sock *sk)
>         bool slow = lock_sock_fast(sk);
>         udp_flush_pending_frames(sk);
>         unlock_sock_fast(sk, slow);
> -       if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
> -               void (*encap_destroy)(struct sock *sk);
> -               encap_destroy = READ_ONCE(up->encap_destroy);
> -               if (encap_destroy)
> -                       encap_destroy(sk);
> +       if (static_branch_unlikely(&udp_encap_needed_key)) {
> +               if (up->encap_type) {
> +                       void (*encap_destroy)(struct sock *sk);
> +                       encap_destroy = READ_ONCE(up->encap_destroy);
> +                       if (encap_destroy)
> +                               encap_destroy(sk);
> +               }
> +               if (up->encap_enabled)
> +                       static_branch_disable(&udp_encap_needed_key);
>         }
>  }
>
> @@ -2431,7 +2435,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
>                         /* FALLTHROUGH */
>                 case UDP_ENCAP_L2TPINUDP:
>                         up->encap_type = val;
> -                       udp_encap_enable();
> +                       if (!up->encap_enabled)
> +                               udp_encap_enable();
> +                       up->encap_enabled = 1;

nit: no need for the branch: udp_encap_enable already has a branch and
is static inline.

Perhaps it makes sense to convert that to take the udp_sock and handle
the state change within, to avoid having to open code at multiple
locations.

^ permalink raw reply

* Re: [PATCH 13/17] octeontx2-af: Install ucast and bcast pkt forwarding rules
From: kbuild test robot @ 2018-10-22 16:18 UTC (permalink / raw)
  To: sunil.kovvuri; +Cc: kbuild-all, netdev, davem, arnd, linux-soc, Sunil Goutham
In-Reply-To: <1539956258-29377-14-git-send-email-sunil.kovvuri@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2712 bytes --]

Hi Sunil,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]
[also build test ERROR on next-20181019]
[cannot apply to v4.19]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/sunil-kovvuri-gmail-com/octeontx2-af-NPC-parser-and-NIX-blocks-initialization/20181021-094953
config: arm64-allmodconfig (attached as .config)
compiler: aarch64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=arm64 

All errors (new ones prefixed by >>):

   In file included from drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c:17:0:
>> drivers/net/ethernet/marvell/octeontx2/af/npc.h:251:2: error: expected ',', ';' or '}' before 'u64'
     u64 op  :4;
     ^~~
--
   In file included from drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c:17:0:
>> drivers/net/ethernet/marvell/octeontx2/af/npc.h:251:2: error: expected ',', ';' or '}' before 'u64'
     u64 op  :4;
     ^~~
   drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c: In function 'rvu_npc_install_ucast_entry':
>> drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c:304:9: error: 'struct nix_rx_action' has no member named 'op'
      action.op = NIX_RX_ACTIONOP_UCAST;
            ^
   drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c: In function 'rvu_npc_install_bcast_match_entry':
   drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c:348:8: error: 'struct nix_rx_action' has no member named 'op'
     action.op = NIX_RX_ACTIONOP_UCAST;
           ^
   drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c: In function 'rvu_npc_disable_mcam_entries':
   drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c:385:13: error: 'struct nix_rx_action' has no member named 'op'
      if (action.op != NIX_RX_ACTIONOP_MCAST)
                ^

vim +251 drivers/net/ethernet/marvell/octeontx2/af/npc.h

   243	
   244	struct nix_rx_action {
   245	#if defined(__BIG_ENDIAN_BITFIELD)
   246		u64	rsvd_63_61	:3;
   247		u64	flow_key_alg	:5;
   248		u64	match_id	:16;
   249		u64	index		:20;
   250		u64	pf_func		:16
 > 251		u64	op		:4;
   252	#else
   253		u64	op		:4;
   254		u64	pf_func		:16;
   255		u64	index		:20;
   256		u64	match_id	:16;
   257		u64	flow_key_alg	:5;
   258		u64	rsvd_63_61	:3;
   259	#endif
   260	};
   261	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 61155 bytes --]

^ permalink raw reply

* Re: [PATCH net 1/4] net/sched: act_gact: disallow 'goto chain' on fallback control action
From: Cong Wang @ 2018-10-22 16:22 UTC (permalink / raw)
  To: Davide Caratti
  Cc: Jiri Pirko, Jamal Hadi Salim, David Miller,
	Linux Kernel Network Developers
In-Reply-To: <02f04ade8a0087781778d02fbb645b1d72f9d777.1540070509.git.dcaratti@redhat.com>

On Sat, Oct 20, 2018 at 2:33 PM Davide Caratti <dcaratti@redhat.com> wrote:
>
> in the following command:
>
>  # tc action add action <c1> random <rand_type> <c2> <rand_param>
>
> 'goto chain x' is allowed only for c1: setting it for c2 makes the kernel
> crash with NULL pointer dereference, since TC core doesn't initialize the
> chain handle.
>
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>

Acked-by: Cong Wang <xiyou.wangcong@gmail.com>

^ permalink raw reply

* Re: [PATCH net 2/4] net/sched: act_police: disallow 'goto chain' on fallback control action
From: Cong Wang @ 2018-10-22 16:23 UTC (permalink / raw)
  To: Davide Caratti
  Cc: Jiri Pirko, Jamal Hadi Salim, David Miller,
	Linux Kernel Network Developers
In-Reply-To: <c2f076b0758b8cd997574b45c2abe064e28aca74.1540070509.git.dcaratti@redhat.com>

On Sat, Oct 20, 2018 at 2:33 PM Davide Caratti <dcaratti@redhat.com> wrote:
>
> in the following command:
>
>  # tc action add action police rate <r> burst <b> conform-exceed <c1>/<c2>
>
> 'goto chain x' is allowed only for c1: setting it for c2 makes the kernel
> crash with NULL pointer dereference, since TC core doesn't initialize the
> chain handle.
>
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>

Acked-by: Cong Wang <xiyou.wangcong@gmail.com>

^ permalink raw reply

* [PATCH net-next] llc: do not use sk_eat_skb()
From: Eric Dumazet @ 2018-10-22 16:24 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet

syzkaller triggered a use-after-free [1], caused by a combination of
skb_get() in llc_conn_state_process() and usage of sk_eat_skb()

sk_eat_skb() is assuming the skb about to be freed is only used by
the current thread. TCP/DCCP stacks enforce this because current
thread holds the socket lock.

llc_conn_state_process() wants to make sure skb does not disappear,
and holds a reference on the skb it manipulates. But as soon as this
skb is added to socket receive queue, another thread can consume it.

This means that llc must use regular skb_unlink() and kfree_skb()
so that both producer and consumer can safely work on the same skb.

[1]
BUG: KASAN: use-after-free in atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
BUG: KASAN: use-after-free in refcount_read include/linux/refcount.h:43 [inline]
BUG: KASAN: use-after-free in skb_unref include/linux/skbuff.h:967 [inline]
BUG: KASAN: use-after-free in kfree_skb+0xb7/0x580 net/core/skbuff.c:655
Read of size 4 at addr ffff8801d1f6fba4 by task ksoftirqd/1/18

CPU: 1 PID: 18 Comm: ksoftirqd/1 Not tainted 4.19.0-rc8+ #295
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c4/0x2b6 lib/dump_stack.c:113
 print_address_description.cold.8+0x9/0x1ff mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412
 check_memory_region_inline mm/kasan/kasan.c:260 [inline]
 check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
 kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272
 atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
 refcount_read include/linux/refcount.h:43 [inline]
 skb_unref include/linux/skbuff.h:967 [inline]
 kfree_skb+0xb7/0x580 net/core/skbuff.c:655
 llc_sap_state_process+0x9b/0x550 net/llc/llc_sap.c:224
 llc_sap_rcv+0x156/0x1f0 net/llc/llc_sap.c:297
 llc_sap_handler+0x65e/0xf80 net/llc/llc_sap.c:438
 llc_rcv+0x79e/0xe20 net/llc/llc_input.c:208
 __netif_receive_skb_one_core+0x14d/0x200 net/core/dev.c:4913
 __netif_receive_skb+0x2c/0x1e0 net/core/dev.c:5023
 process_backlog+0x218/0x6f0 net/core/dev.c:5829
 napi_poll net/core/dev.c:6249 [inline]
 net_rx_action+0x7c5/0x1950 net/core/dev.c:6315
 __do_softirq+0x30c/0xb03 kernel/softirq.c:292
 run_ksoftirqd+0x94/0x100 kernel/softirq.c:653
 smpboot_thread_fn+0x68b/0xa00 kernel/smpboot.c:164
 kthread+0x35a/0x420 kernel/kthread.c:246
 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:413

Allocated by task 18:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
 kmem_cache_alloc_node+0x144/0x730 mm/slab.c:3644
 __alloc_skb+0x119/0x770 net/core/skbuff.c:193
 alloc_skb include/linux/skbuff.h:995 [inline]
 llc_alloc_frame+0xbc/0x370 net/llc/llc_sap.c:54
 llc_station_ac_send_xid_r net/llc/llc_station.c:52 [inline]
 llc_station_rcv+0x1dc/0x1420 net/llc/llc_station.c:111
 llc_rcv+0xc32/0xe20 net/llc/llc_input.c:220
 __netif_receive_skb_one_core+0x14d/0x200 net/core/dev.c:4913
 __netif_receive_skb+0x2c/0x1e0 net/core/dev.c:5023
 process_backlog+0x218/0x6f0 net/core/dev.c:5829
 napi_poll net/core/dev.c:6249 [inline]
 net_rx_action+0x7c5/0x1950 net/core/dev.c:6315
 __do_softirq+0x30c/0xb03 kernel/softirq.c:292

Freed by task 16383:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521
 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
 __cache_free mm/slab.c:3498 [inline]
 kmem_cache_free+0x83/0x290 mm/slab.c:3756
 kfree_skbmem+0x154/0x230 net/core/skbuff.c:582
 __kfree_skb+0x1d/0x20 net/core/skbuff.c:642
 sk_eat_skb include/net/sock.h:2366 [inline]
 llc_ui_recvmsg+0xec2/0x1610 net/llc/af_llc.c:882
 sock_recvmsg_nosec net/socket.c:794 [inline]
 sock_recvmsg+0xd0/0x110 net/socket.c:801
 ___sys_recvmsg+0x2b6/0x680 net/socket.c:2278
 __sys_recvmmsg+0x303/0xb90 net/socket.c:2390
 do_sys_recvmmsg+0x181/0x1a0 net/socket.c:2466
 __do_sys_recvmmsg net/socket.c:2484 [inline]
 __se_sys_recvmmsg net/socket.c:2480 [inline]
 __x64_sys_recvmmsg+0xbe/0x150 net/socket.c:2480
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at ffff8801d1f6fac0
 which belongs to the cache skbuff_head_cache of size 232
The buggy address is located 228 bytes inside of
 232-byte region [ffff8801d1f6fac0, ffff8801d1f6fba8)
The buggy address belongs to the page:
page:ffffea000747dbc0 count:1 mapcount:0 mapping:ffff8801d9be7680 index:0xffff8801d1f6fe80
flags: 0x2fffc0000000100(slab)
raw: 02fffc0000000100 ffffea0007346e88 ffffea000705b108 ffff8801d9be7680
raw: ffff8801d1f6fe80 ffff8801d1f6f0c0 000000010000000b 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801d1f6fa80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
 ffff8801d1f6fb00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8801d1f6fb80: fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc
                               ^
 ffff8801d1f6fc00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff8801d1f6fc80: fb fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
 net/llc/af_llc.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 1beeea9549fa6ec1f7b0e5f9af8ff3250a316f59..b99e73a7e7e0f2b4959b279e3aecbadf29667d55 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -730,7 +730,6 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	struct sk_buff *skb = NULL;
 	struct sock *sk = sock->sk;
 	struct llc_sock *llc = llc_sk(sk);
-	unsigned long cpu_flags;
 	size_t copied = 0;
 	u32 peek_seq = 0;
 	u32 *seq, skb_len;
@@ -855,9 +854,8 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			goto copy_uaddr;
 
 		if (!(flags & MSG_PEEK)) {
-			spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
-			sk_eat_skb(sk, skb);
-			spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
+			skb_unlink(skb, &sk->sk_receive_queue);
+			kfree_skb(skb);
 			*seq = 0;
 		}
 
@@ -878,9 +876,8 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		llc_cmsg_rcv(msg, skb);
 
 	if (!(flags & MSG_PEEK)) {
-		spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
-		sk_eat_skb(sk, skb);
-		spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
+		skb_unlink(skb, &sk->sk_receive_queue);
+		kfree_skb(skb);
 		*seq = 0;
 	}
 
-- 
2.19.1.568.g152ad8e336-goog

^ permalink raw reply related

* Re: [PATCH iproute2-next] Tree wide: Drop sockaddr_nl arg
From: Stephen Hemminger @ 2018-10-22 16:29 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, David Ahern
In-Reply-To: <20181019204418.14218-1-dsahern@kernel.org>

On Fri, 19 Oct 2018 13:44:18 -0700
David Ahern <dsahern@kernel.org> wrote:

> From: David Ahern <dsahern@gmail.com>
> 
> No command, filter, or print function uses the sockaddr_nl arg,
> so just drop it.
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>

Acked-by: Stephen Hemminger <stephen@networkplumber.org>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox