Netdev List
 help / color / mirror / Atom feed
* Re: KMSAN: uninit-value in pppoe_rcv
From: Guillaume Nault @ 2018-09-13 16:19 UTC (permalink / raw)
  To: Alexander Potapenko
  Cc: Eric Dumazet, syzbot+f5f6080811c849739212, LKML, mostrows,
	Networking, syzkaller-bugs
In-Reply-To: <CAG_fn=WXPsQBWR1GDsprY3aL4oW0v6EGhNB5wrvPJCOMh_U7wQ@mail.gmail.com>

On Thu, Sep 13, 2018 at 04:12:38PM +0200, Alexander Potapenko wrote:
> On Thu, Sep 13, 2018 at 3:57 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> >
> >
> > On 09/12/2018 03:38 AM, Alexander Potapenko wrote:
> > > On Wed, Sep 12, 2018 at 12:24 PM syzbot
> > > <syzbot+f5f6080811c849739212@syzkaller.appspotmail.com> wrote:
> > >>
> > >> Hello,
> > >>
> > >> syzbot found the following crash on:
> > >>
> > >> HEAD commit:    d2d741e5d189 kmsan: add initialization for shmem pages
> > >> git tree:       https://github.com/google/kmsan.git/master
> > >> console output: https://syzkaller.appspot.com/x/log.txt?x=1465fc37800000
> > >> kernel config:  https://syzkaller.appspot.com/x/.config?x=48f9de3384bcd0f
> > >> dashboard link: https://syzkaller.appspot.com/bug?extid=f5f6080811c849739212
> > >> compiler:       clang version 7.0.0 (trunk 329391)
> > >> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=14d6e607800000
> > >> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=10a15b5b800000
> > >>
> > >> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> > >> Reported-by: syzbot+f5f6080811c849739212@syzkaller.appspotmail.com
> > >>
> > >> IPVS: ftp: loaded support on port[0] = 21
> > >> ==================================================================
> > >> BUG: KMSAN: uninit-value in __get_item drivers/net/ppp/pppoe.c:172 [inline]
> > >> BUG: KMSAN: uninit-value in get_item drivers/net/ppp/pppoe.c:236 [inline]
> > >> BUG: KMSAN: uninit-value in pppoe_rcv+0xcef/0x10e0
> > >> drivers/net/ppp/pppoe.c:450
> > >> CPU: 0 PID: 4543 Comm: syz-executor355 Not tainted 4.16.0+ #87
> > >> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> > >> Google 01/01/2011
> > >> Call Trace:
> > >>   __dump_stack lib/dump_stack.c:17 [inline]
> > >>   dump_stack+0x185/0x1d0 lib/dump_stack.c:53
> > >>   kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
> > >>   __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
> > >>   __get_item drivers/net/ppp/pppoe.c:172 [inline]
> > >>   get_item drivers/net/ppp/pppoe.c:236 [inline]
> > >>   pppoe_rcv+0xcef/0x10e0 drivers/net/ppp/pppoe.c:450
> > >>   __netif_receive_skb_core+0x47df/0x4a90 net/core/dev.c:4562
> > >>   __netif_receive_skb net/core/dev.c:4627 [inline]
> > >>   netif_receive_skb_internal+0x49d/0x630 net/core/dev.c:4701
> > >>   netif_receive_skb+0x230/0x240 net/core/dev.c:4725
> > >>   tun_rx_batched drivers/net/tun.c:1555 [inline]
> > >>   tun_get_user+0x740f/0x7c60 drivers/net/tun.c:1962
> > >>   tun_chr_write_iter+0x1d4/0x330 drivers/net/tun.c:1990
> > >>   call_write_iter include/linux/fs.h:1782 [inline]
> > >>   new_sync_write fs/read_write.c:469 [inline]
> > >>   __vfs_write+0x7fb/0x9f0 fs/read_write.c:482
> > >>   vfs_write+0x463/0x8d0 fs/read_write.c:544
> > >>   SYSC_write+0x172/0x360 fs/read_write.c:589
> > >>   SyS_write+0x55/0x80 fs/read_write.c:581
> > >>   do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
> > >>   entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> > >> RIP: 0033:0x4447c9
> > >> RSP: 002b:00007fff64c8fc28 EFLAGS: 00000297 ORIG_RAX: 0000000000000001
> > >> RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00000000004447c9
> > >> RDX: 000000000000fd87 RSI: 0000000020000600 RDI: 0000000000000004
> > >> RBP: 00000000006cf018 R08: 00007fff64c8fda8 R09: 00007fff00006bda
> > >> R10: 0000000000005fe7 R11: 0000000000000297 R12: 00000000004020d0
> > >> R13: 0000000000402160 R14: 0000000000000000 R15: 0000000000000000
> > >>
> > >> Uninit was created at:
> > >>   kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
> > >>   kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188
> > >>   kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314
> > >>   kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321
> > >>   slab_post_alloc_hook mm/slab.h:445 [inline]
> > >>   slab_alloc_node mm/slub.c:2737 [inline]
> > >>   __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369
> > >>   __kmalloc_reserve net/core/skbuff.c:138 [inline]
> > >>   __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206
> > >>   alloc_skb include/linux/skbuff.h:984 [inline]
> > >>   alloc_skb_with_frags+0x1d4/0xb20 net/core/skbuff.c:5234
> > >>   sock_alloc_send_pskb+0xb56/0x1190 net/core/sock.c:2085
> > >>   tun_alloc_skb drivers/net/tun.c:1532 [inline]
> > >>   tun_get_user+0x2242/0x7c60 drivers/net/tun.c:1829
> > >>   tun_chr_write_iter+0x1d4/0x330 drivers/net/tun.c:1990
> > >>   call_write_iter include/linux/fs.h:1782 [inline]
> > >>   new_sync_write fs/read_write.c:469 [inline]
> > >>   __vfs_write+0x7fb/0x9f0 fs/read_write.c:482
> > >>   vfs_write+0x463/0x8d0 fs/read_write.c:544
> > >>   SYSC_write+0x172/0x360 fs/read_write.c:589
> > >>   SyS_write+0x55/0x80 fs/read_write.c:581
> > >>   do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
> > >>   entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> > >> ==================================================================
> > > I did a little digging before sending the bug upstream.
> > > If I add memset(obj, 0xfe, size) to __kmalloc_reserve(), these 0xfe
> > > bytes are visible in __get_item() at the place where KMSAN reports an
> > > error.
> > >
> > > The problem is somehow related to tun_get_user() creating a fragmented
> > > sk_buff - when I change the call to tun_alloc_skb() so that it
> > > allocates a single buffer the bug goes away.
> > >
> >
> > I guess the following patch would fix the issue
> >
> > (I will submit it more formally)
> No, as far as I can see it doesn't.
> Saving sid before __skb_pull() is still a good idea, but in this
> particular case |ph| doesn't change.

Yes, we probably need to save sid.

But I think the problem found by syzbot is related to
eth_hdr(skb)->h_source. PPPoE expects that Ethernet header has already
been parsed and is accessible at skb_mac_header(skb).
But here skb_mac_header(skb) == skb->data, and we may pull only 6 bytes
(sizeof(truct pppoe_hdr)). Therefore eth_hdr(skb)->h_source points past
skb's head length.

Not sure if something needs to be changed in tun.c for properly setting
skb_mac_header. But PPPoE has no reason to consider packets from
non-Ethernet devices anyway.

^ permalink raw reply

* Re: [PATCH] knav: qmss: Introduce queue descriptors monitor
From: David Miller @ 2018-09-13 16:07 UTC (permalink / raw)
  To: gomonovych
  Cc: w-kwok2, m-karicheri2, grygorii.strashko, vasyl.gomonovych,
	ssantosh, linux-kernel, netdev, linux-arm-kernel
In-Reply-To: <20180911211549.12092-1-gomonovych@gmail.com>

From: Vasyl Gomonovych <gomonovych@gmail.com>
Date: Tue, 11 Sep 2018 23:15:47 +0200

> Monitor and record available descriptors in knav_qmss queues
> Get amount of available descriptors in free-descriptor queue
> base on event-triggered RX traffic.
> Also monitor free-descriptor queue base on periodic time interval
> in kernel thread.
> To start monitoring available descriptors in queue earlyi,
> module parameters, enable start monitoring in boottime
> 
> This queue descriptor monitor helps debugging starvation issue.
> The monitor should help debug queue under traffic pressure
> and can describe the shape of this pressure when a queue
> faced descriptors starvation.
> Monitor helpful for IP blocks which do not have dedicated
> descriptor starvation interrupt like RapidIO IP.
> 
> Registration and enable file in debugfs hierarchy
> 
> |-/sys/kernel/debug
> |-- knav_qmssm_soc:hwqueue@2a40000
> |   |-- 8710
> |   |   |-- buffer_size
> |   |   |-- enable
> |   |   |-- monitor_stats
> |   |   -- unregister
> 
> ---
> 
> The current implementation is the first iteration
> and require additional work.
> By this patch I would like to know does this could be
> helpful for other components and continue my work in a right way.
> 
> Signed-off-by: Vasyl Gomonovych <vasyl.gomonovych@nokia.com>

This is way over engineered for just keeping some statistics around.

Just have a periodic timer or a workqueue that does the necessary
sampling.

^ permalink raw reply

* Re: [PATCH v2] neighbour: confirm neigh entries when ARP packet is received
From: David Miller @ 2018-09-13 16:04 UTC (permalink / raw)
  To: vasilykh
  Cc: roopa, adobriyan, edumazet, stephen, jwestfall, w.bumiller,
	anarsoul, keescook, ihrachys, netdev, linux-kernel
In-Reply-To: <20180911180406.31283-1-vasilykh@arista.com>

From: Vasily Khoruzhick <vasilykh@arista.com>
Date: Tue, 11 Sep 2018 11:04:06 -0700

> Update 'confirmed' timestamp when ARP packet is received. It shouldn't
> affect locktime logic and anyway entry can be confirmed by any higher-layer
> protocol. Thus it makes to sense not to confirm it when ARP packet is
> received.
> 
> Fixes: 77d7123342 ("neighbour: update neigh timestamps iff update is
> effective")
> 
> Signed-off-by: Vasily Khoruzhick <vasilykh@arista.com>
> ---
> v2: - update comment to match new code.

Please fix the wording in this commit message, as per Sergei's
feedback.

Also, the Fixes: tag should be all one line (people grep for these
strings in the repository) and with no empty lines between it and the
rest of the tags such as Signed-off-by:

Thanks.

^ permalink raw reply

* Re: [PATCH 1/2] netlink: add NLA_REJECT policy type
From: Michal Kubecek @ 2018-09-13 10:49 UTC (permalink / raw)
  To: Johannes Berg; +Cc: linux-wireless, netdev, Johannes Berg
In-Reply-To: <20180913084603.7979-1-johannes@sipsolutions.net>

On Thu, Sep 13, 2018 at 10:46:02AM +0200, Johannes Berg wrote:
> From: Johannes Berg <johannes.berg@intel.com>
> 
> In some situations some netlink attributes may be used for output
> only (kernel->userspace) or may be reserved for future use. It's
> then helpful to be able to prevent userspace from using them in
> messages sent to the kernel, since they'd otherwise be ignored and
> any future will become impossible if this happens.
> 
> Add NLA_REJECT to the policy which does nothing but reject (with
> EINVAL) validation of any messages containing this attribute.
> Allow for returning a specific extended ACK error message in the
> validation_data pointer.
> 
> While at it fix the indentation of NLA_BITFIELD32 and describe the
> validation_data pointer for it.
> 
> The specific case I have in mind now is a shared nested attribute
> containing request/response data, and it would be pointless and
> potentially confusing to have userspace include response data in
> the messages that actually contain a request.
> 
> Signed-off-by: Johannes Berg <johannes.berg@intel.com>
> ---
> ...
> @@ -251,10 +256,13 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
>  
>  		if (type > 0 && type <= maxtype) {
>  			if (policy) {
> -				err = validate_nla(nla, maxtype, policy);
> +				err = validate_nla(nla, maxtype, policy,
> +						   extack);
>  				if (err < 0) {
> -					NL_SET_ERR_MSG_ATTR(extack, nla,
> -							    "Attribute failed policy validation");
> +					NL_SET_BAD_ATTR(extack, nla);
> +					if (extack && !extack->_msg)
> +						NL_SET_ERR_MSG(extack,
> +							       "Attribute failed policy validation");
>  					goto errout;
>  				}
>  			}
> -- 

Technically, this would change the outcome when nla_parse() is called
with extack->_msg already set nad validate_nla() fails on something else
than NLA_REJECT; it will preserve the previous message in such case.
But I don't think this is a serious problem.

Reviewed-by: Michal Kubecek <mkubecek@suse.cz>

^ permalink raw reply

* Re: [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-09-13 15:58 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Andrew Lutomirski, LKML, Netdev, David Miller, Greg Kroah-Hartman,
	Samuel Neves, Jean-Philippe Aumasson, Linux Crypto Mailing List
In-Reply-To: <CAKv+Gu9GVbSETjp01tANMwJgA6O9aexhnH+47836KjZg+71q2A@mail.gmail.com>

On Thu, Sep 13, 2018 at 5:43 PM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> I'd prefer it if all the accelerated software implementations live in
> the same place. But I do strongly prefer arch code to live in
> arch/$arch

Zinc follows the scheme of the raid6 code, as well as of most other
crypto libraries: code is grouped by cipher, making it easy for people
to work with and understand differing implementations. It also allows
us to trivially link these together at compile time rather than at
link time, which makes cipher selection much more efficient. It's
really much more maintainable this way.

> I think AES-GCM is a useful example here. I really like the SIMD token
> abstraction a lot, but I would like to understand how this would work
> in Zinc if you have
> a) a generic implementation
> b) perhaps an arch specific scalar implementation
> c) a pure NEON implementation
> d) an implementation using AES instructions but not the PMULL instructions
> e) an implementation that uses AES and PMULL instructions.

The same way that Zinc currently chooses between the five different
implementations for, say, x86_64 ChaCha20:

- Generic C scalar
- SSSE3
- AVX2
- AVX512F
- AVX512VL

We make a decision based on CPU capabilities, SIMD context, and input
length, and then choose the right function.

> You know what? If you're up for it, let's not wait until Plumbers, but
> instead, let's collaborate off list to get this into shape.

Sure, sounds good.

Jason

^ permalink raw reply

* Re: Regression: kernel 4.14 an later very slow with many ipsec tunnels
From: Wolfgang Walter @ 2018-09-13 15:46 UTC (permalink / raw)
  To: Florian Westphal
  Cc: netdev, Steffen Klassert, David Miller, Linux Kernel Mailing List,
	Linus Torvalds
In-Reply-To: <20180913135844.3ut6fxgx67t6ndtu@breakpoint.cc>

Am Donnerstag, 13. September 2018, 15:58:44 schrieb Florian Westphal:
> Wolfgang Walter <linux@stwm.de> wrote:
> > thanks to the fix from Steffen Klassert I could now run 4.14.69 + his
> > patch
> > and 4.18.7 + his patch without oopsing immediately.
> > 
> > But I found that those kernels perform very bad. They perform so bad that
> > they are unusable for our router with about 3000 ipsec tunnels (tunnel
> > mode network <-> network).
> 
> Can you do a 'perf record -a -g sleep 5' with 4.18 and provide 'perf
> report' result?
> 
> It would be good to see where those cycles are spent.

I'll try that but this isn't that easy as the router image does not contain 
perf. I also have to do that on our production router. I try to do that 
tomorrow evening.

What I can say is that it depends mainly on number of policy rules and SA.

Regards
-- 
Wolfgang Walter
Studentenwerk München
Anstalt des öffentlichen Rechts

^ permalink raw reply

* Re: [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-09-13 15:45 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: LKML, Netdev, David Miller, Greg Kroah-Hartman, Andrew Lutomirski,
	Samuel Neves, Jean-Philippe Aumasson, Linux Crypto Mailing List
In-Reply-To: <CAKv+Gu9VamLBv5Ji5AtvugUrAYQYaotLFZZGA=Rt18JUDEaAMQ@mail.gmail.com>

On Thu, Sep 13, 2018 at 5:04 PM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> > The code still benefits from the review that's gone into OpenSSL. It's
> > not modified in ways that would affect the cryptographic operations
> > being done. It's modified to be suitable for kernel space.
>
> So could we please at least have those changes as a separate patch then?

I'll experiment with a couple ways of trying to communicate with
precision what's been changed from OpenSSL for the next round of
patches.

> > That's interesting. I'll bring this up with AndyP. FWIW, if you think
> > you have a real and compelling claim here, I'd be much more likely to
> > accept a different ChaCha20 implementation than I would be to accept a
> > different Poly1305 implementation. (It's a *lot* harder to screw up
> > ChaCha20 than it is to screw up Poly1305.)
>
> The question is really whether we want different implementations in
> the crypto API and in zinc.

Per earlier in this discussion, I've already authored patches that
replaces the crypto API's implementations with simple calls to Zinc,
so that code isn't duplicated. These will be in v4 and you can comment
on the approach then.

> You are completely missing my point. I am not particularly invested in
> the crypto API, and I share the concerns about its usability. That is
> why I want to make sure that your solution actually results in a net
> improvement for everybody, not just for WireGuard, in a maintainable
> way.

Right, likewise. I've put quite a bit of effort into separating Zinc
into Zinc and not into something part of WireGuard. The motivation for
doing so is a decent amount of call sites all around the kernel that
I'd like to gradually fix up.

^ permalink raw reply

* Re: [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Ard Biesheuvel @ 2018-09-13 15:42 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Andrew Lutomirski, LKML, Netdev, David Miller, Greg Kroah-Hartman,
	Samuel Neves, Jean-Philippe Aumasson, Linux Crypto Mailing List
In-Reply-To: <CAHmME9pcODNc2m26DH1cQY6R+grxYW5dvFpNDpz_GiugM3qufw@mail.gmail.com>

On 13 September 2018 at 16:32, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> On Thu, Sep 13, 2018 at 7:41 AM Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
>> But one of the supposed selling points of this crypto library is that
>> it gives engineers who are frightened of crypto in general and the
>> crypto API in particular simple and easy to use crypto primitives
>> rather than having to jump through the crypto API's hoops.
>
> The goal is for engineers who want to specifically use algorithm X
> from within the kernel in a non-dynamic way to be able to then use
> algorithm X with a simple function call. The goal is not to open it up
> to people who have no idea what they're doing; for that a NaCL-like
> library with functions like "crypto_box_open" or something would fit
> the bill; but that's also not what we're trying to do here. Please
> don't confuse the design goals. The rest of your email is therefore a
> bit of a straw man; cut the rhetoric out.
>
>> A crypto library whose only encryption algorithm is a stream cipher
>> does *not* deliver on that promise, since it is only suitable for
>> cases where IVs are guaranteed not to be reused.
>
> False. We also offer XChaCha20Poly1305, which takes a massive nonce,
> suitable for random generation.
>
> If there became a useful case for AES-PMAC-SIV or even AES-GCM or
> something to that extent, then Zinc would add that as required. But
> we're not going to start adding random ciphers unless they're needed.
>

I'd prefer it if all the accelerated software implementations live in
the same place. But I do strongly prefer arch code to live in
arch/$arch, simply because of the maintenance scope for arch
developers and maintainers.

I think AES-GCM is a useful example here. I really like the SIMD token
abstraction a lot, but I would like to understand how this would work
in Zinc if you have
a) a generic implementation
b) perhaps an arch specific scalar implementation
c) a pure NEON implementation
d) an implementation using AES instructions but not the PMULL instructions
e) an implementation that uses AES and PMULL instructions.

On arch/arm64 we support code patching, on other arches it may be
different. We also support udev loading of modules depending on CPU
capabilities, which means only the implementation you are likely to
use will be loaded, and other modules are disregarded.

I am not asking you to implement this now. But I do want to understand
how these use cases fit into Zinc. And note that this has nothing to
do with the crypto API: this could simply be a synchronous
aes_gcm_encrypt() library function that maps to any of the above at
runtime depending on the platform's capabilities.

>> You yourself were
>> bitten by the clunkiness of the crypto API when attempting to use the
>> SHA26 code, right? So shouldn't we move that into this crypto library
>> as well?
>
> As stated in the initial commit, and in numerous other emails
> stretching back a year, yes, sha256 and other things in lib/ are going
> to be put into Zinc following the initial merge of Zinc. These changes
> will happen incrementally, like everything else that happens in the
> kernel. Sha256, in particular, is probably the first thing I'll port
> post-merge.
>

Excellent.

>> I think it is reasonable for WireGuard to standardize on
>> ChaCha20/Poly1305 only, although I have my concerns about the flag day
>> that will be required if this 'one true cipher' ever does turn out to
>> be compromised (either that, or we will have to go back in time and
>> add some kind of protocol versioning to existing deployments of
>> WireGuard)
>
> Those concerns are not valid and have already been addressed (to you,
> I believe) on this mailing list and elsewhere. WireGuard is versioned,
> hence there's no need to "add" versioning, and it is prepared to roll
> out new cryptography in a subsequent version should there be any
> issues. In other words, your concern is based on a misunderstanding of
> the protocol. If you have issues, however, with the design decisions
> of WireGuard, something that's been heavily discussed with members of
> the linux kernel community, networking community, cryptography
> community, and so forth, for the last 3 years, I invite you to bring
> them up on <wireguard@lists.zx2c4.com>.
>

I'd prefer to have the discussion here, if you don't mind.

IIUC clients and servers need to run the same version of the protocol.
So does it require a flag day to switch the world over to the new
version when the old one is deprecated?

>> And frankly, if the code were as good as the prose, we wouldn't be
>> having this discussion.
>
> Please cut out this rhetoric. That's an obviously unprovable
> statement, but it probably isn't true anyway. I wish you'd stick to
> technical concerns only, rather than what appears to be a desire to
> derail this by any means necessary.
>

I apologize if I hit a nerve there, that was not my intention.

I am not an 'entrenched crypto API guy that is out to get you'. I am a
core ARM developer that takes an interest in crypto, shares your
concern about the usability of the crypto API, and tries to ensure
that what we end up is maintainable and usable for everybody.

>> Zinc adds its own clunky ways to mix arch and
>> generic code, involving GCC -include command line arguments and
>> #ifdefs everywhere. My review comments on this were completely ignored
>> by Jason.
>
> No, they were not ignored. v2 cleaned up the #ifdefs. v4 has already
> cleaned up the makefile stuff and will be even cleaner. Good things
> await, don't worry.
>

You know what? If you're up for it, let's not wait until Plumbers, but
instead, let's collaborate off list to get this into shape.

^ permalink raw reply

* Re: [RFC PATCH iproute2-next] man: Add devlink health man page
From: Tobin C. Harding @ 2018-09-13 10:27 UTC (permalink / raw)
  To: Eran Ben Elisha
  Cc: netdev, Jiri Pirko, Andy Gospodarek, Michael Chan, Jakub Kicinski,
	Simon Horman, Alexander Duyck, Andrew Lunn, Florian Fainelli,
	Tal Alon, Ariel Almog
In-Reply-To: <1536826696-9413-2-git-send-email-eranbe@mellanox.com>

On Thu, Sep 13, 2018 at 11:18:16AM +0300, Eran Ben Elisha wrote:
> Add devlink-health man page. Devlink-health tool will control device
> health attributes, sensors, actions and logging.
> 
> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
> 
> -------------------------------------------------------
> Copy paste man output to here for easier review process of the RFC.
> 
> DEVLINK-HEALTH(8)                                                                                               Linux                                                                                              DEVLINK-HEALTH(8)
> 
> NAME
>        devlink-health - devlink health configuration
> 
> SYNOPSIS
>        devlink [ OPTIONS ] health  { COMMAND | help }
> 
>        OPTIONS := { -V[ersion] | -n[no-nice-names] }
> 
>        devlink health show [ DEV ] [ sensor NAME ]
> 
>        devlink health sensor set DEV name NAME [ action NAME { active | inactive } ]"
> 
>        devlink health action set DEV name NAME period PERIOD count COUNT fail { ignore | down }
> 
>        devlink health action reinit DEV name NAME
> 
>        devlink health help
> 
> DESCRIPTION
>        devlink-health tool allows user to configure the way driver treats unexpected status. The tool allows configuration of the sensors that can trigger health activity. Set for each sensor the follow up operations, such as,
>        reset and dump of info. In addition, set the health activity termination action.
> 
>    devlink health show - Display devlink health sensors and actions attributes
>        DEV - Specifies the devlink device to show.  If this argument is omitted, all devices are listed.
> 
>            Format is:
>              BUS_NAME/BUS_ADDRESS
> 
>        sensor NAME - Specifies the devlink sensor to show.
> 

Perhaps the commands should include the optional arguments so when
reading the description one doesn't have to scroll to the top of the
page all the time

e.g
     devlink health show [ DEV ] [ sensor NAME ] - Display devlink health sensors and actions attributes

>    devlink health sensor set - sets devlink health sensor attributes
>        DEV    Specifies the devlink device to show.

	 		      	      	     	set

>        name NAME
>               Name of the sensor to set.
> 
>        action NAME { active | inactive }
>                   Specify which actions to activate and which to deactivate once a sensor was triggered. actions can be dump, reset, etc.
> 
>    devlink health action set - sets devlink action attributes
>        DEV    Specifies the devlink device to set.
> 
>        name NAME
>               Specifies the devlink action to set.

This is a little unclear to me?

>        period PERIOD
>               The period on which we limit the amount of performed actions, measured in seconds.
> 
>        count COUNT
>               The maximum amount of actions performed in a limit time frame.

Perhaps		    	    	      	      
                The maximum number of actions performed in a limited time frame.

>        fail   { ignore | down }
>                   Specify the behavior once count limit was reached.
> 
>                   ignore - Ignore errors without execution of any action.
> 
>                   down - Driver will remain in nonoperational state.
> 
>    devlink health action reinit - reset devlink action attributes (period, count, fail, etc)
>        DEV    Specifies the devlink device to set.
> 
>        name NAME
>               Specifies the devlink action to set.

Perhaps s/set/reinitialise/g for the above two descriptions.

Hope this helps,
Tobin.

^ permalink raw reply

* Re: [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Andy Lutomirski @ 2018-09-13 15:26 UTC (permalink / raw)
  To: Milan Broz
  Cc: Andy Lutomirski, Ard Biesheuvel, Jason A. Donenfeld, LKML, Netdev,
	David Miller, Greg Kroah-Hartman, Samuel Neves,
	Jean-Philippe Aumasson, Linux Crypto Mailing List
In-Reply-To: <c5855aa4-1561-8387-aebb-0a7587ac6d4f@gmail.com>


> On Sep 12, 2018, at 11:39 PM, Milan Broz <gmazyland@gmail.com> wrote:
> 
>> On 13/09/18 01:45, Andy Lutomirski wrote:
>> On Wed, Sep 12, 2018 at 3:56 PM, Ard Biesheuvel
> ... 
>> b) Crypto that is used dynamically.  This includes dm-crypt
>> (aes-xts-plain64, aes-cbc-essiv, etc), all the ALG_IF interfaces, a
>> lot of IPSEC stuff, possibly KCM, and probably many more.  These will
>> get comparatively little benefit from being converted to a zinc-like
>> interface.  For some of these cases, it wouldn't make any sense at all
>> to convert them.  Certainly the ones that do async hardware crypto
>> using DMA engines will never look at all like zinc, even under the
>> hood.
> 
> Please note, that dm-crypt now uses not only block ciphers and modes,
> but also authenticated encryption and hashes (for ESSIV and HMAC
> in authenticated composed modes) and RNG (for random IV).
> We use crypto API, including async variants (I hope correctly :)

Right. And all this is why I don’t think dm-crypt should use zinc, at least not any time soon.

^ permalink raw reply

* Re: [PATCH] xen/netfront: don't bug in case of too many frags
From: David Miller @ 2018-09-13 15:22 UTC (permalink / raw)
  To: jgross; +Cc: linux-kernel, netdev, xen-devel, boris.ostrovsky, stable
In-Reply-To: <20180911070448.3958-1-jgross@suse.com>

From: Juergen Gross <jgross@suse.com>
Date: Tue, 11 Sep 2018 09:04:48 +0200

> Commit 57f230ab04d291 ("xen/netfront: raise max number of slots in
> xennet_get_responses()") raised the max number of allowed slots by one.
> This seems to be problematic in some configurations with netback using
> a larger MAX_SKB_FRAGS value (e.g. old Linux kernel with MAX_SKB_FRAGS
> defined as 18 instead of nowadays 17).
> 
> Instead of BUG_ON() in this case just fall back to retransmission.
> 
> Fixes: 57f230ab04d291 ("xen/netfront: raise max number of slots in xennet_get_responses()")
> Cc: stable@vger.kernel.org
> Signed-off-by: Juergen Gross <jgross@suse.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next V2] virtio_net: ethtool tx napi configuration
From: Willem de Bruijn @ 2018-09-13 15:20 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, David Miller, virtualization,
	Network Development, LKML, Willem de Bruijn
In-Reply-To: <20180913053545.18585-1-jasowang@redhat.com>

On Thu, Sep 13, 2018 at 1:40 AM Jason Wang <jasowang@redhat.com> wrote:
>
> Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
> Interrupt moderation is currently not supported, so these accept and
> display the default settings of 0 usec and 1 frame.
>
> Toggle tx napi through a bit in tx-frames. So as to not interfere
> with possible future interrupt moderation, value 1 means tx napi while
> value 0 means not.
>
> To properly synchronize with the data path, tx napi is disabled and
> tx lock is held when changing the value of napi weight. And two more
> places that can access tx napi weight:
>
> - speculative tx polling in rx napi, we can leave it as is since it
>   not a must for correctness.
> - skb_xmit_done(), one more check of napi weight is added before
>   trying to enable tx to avoid tx to be disabled forever if napi is
>   disabled after skb_xmit_done() but before the napi
>
> Link: https://patchwork.ozlabs.org/patch/948149/
> Suggested-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> Changes from V1:
> - try to synchronize with datapath to allow changing mode when
>   interface is up.
> - use tx-frames 0 as to disable tx napi while tx-frames 1 to enable tx napi
> ---
>  drivers/net/virtio_net.c | 64 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 63 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 765920905226..6e70864f5899 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -66,6 +66,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
>
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>
> +static const u32 ethtool_coalesce_napi_mask = (1UL << 10);
> +

This is no longer needed

>  static const unsigned long guest_offloads[] = {
>         VIRTIO_NET_F_GUEST_TSO4,
>         VIRTIO_NET_F_GUEST_TSO6,
> @@ -1444,7 +1446,10 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>
>         virtqueue_napi_complete(napi, sq->vq, 0);
>
> -       if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> +       /* Check napi.weight to avoid tx stall since it could be set
> +        * to zero by ethtool after skb_xmit_done().
> +        */
> +       if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS || !sq->napi.weight)
>                 netif_tx_wake_queue(txq);

I see. This assumes that the napi handler will always be called on
conversion from napi to no-napi mode.

That is safe to assume because if it isn't called (and will not call
netif_tx_wake_queue) that implies that napi was not scheduled, and
thus the tx interrupt was not suppressed and thus there was no tx
completion work to be scheduled?

>
>         return 0;
> @@ -2181,6 +2186,61 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
>         return 0;
>  }
>
> +static int virtnet_set_coalesce(struct net_device *dev,
> +                               struct ethtool_coalesce *ec)
> +{
> +       struct ethtool_coalesce ec_default = {
> +               .cmd = ETHTOOL_SCOALESCE,
> +               .rx_max_coalesced_frames = 1,
> +       };
> +       struct virtnet_info *vi = netdev_priv(dev);
> +       int i, napi_weight;
> +
> +       if (ec->tx_max_coalesced_frames > 1)
> +               return -EINVAL;
> +
> +       ec_default.tx_max_coalesced_frames = ec->tx_max_coalesced_frames;
> +       napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
> +
> +       /* disallow changes to fields not explicitly tested above */
> +       if (memcmp(ec, &ec_default, sizeof(ec_default)))
> +               return -EINVAL;
> +
> +       if (napi_weight ^ vi->sq[0].napi.weight) {
> +               for (i = 0; i < vi->max_queue_pairs; i++) {
> +                       struct netdev_queue *txq =
> +                              netdev_get_tx_queue(vi->dev, i);
> +
> +                       virtnet_napi_tx_disable(&vi->sq[i].napi);
> +                       __netif_tx_lock_bh(txq);
> +                       vi->sq[i].napi.weight = napi_weight;
> +                       __netif_tx_unlock_bh(txq);
> +                       virtnet_napi_tx_enable(vi, vi->sq[i].vq,
> +                                              &vi->sq[i].napi);
> +               }
> +       }
> +
> +       return 0;
> +}
> +
> +static int virtnet_get_coalesce(struct net_device *dev,
> +                               struct ethtool_coalesce *ec)
> +{
> +       struct ethtool_coalesce ec_default = {
> +               .cmd = ETHTOOL_GCOALESCE,
> +               .rx_max_coalesced_frames = 1,
> +               .tx_max_coalesced_frames = 0,

no need to explicitly initialize to 0 (unless you did this for
documentation purposes, which is fine).

> +       };
> +       struct virtnet_info *vi = netdev_priv(dev);
> +
> +       memcpy(ec, &ec_default, sizeof(ec_default));
> +
> +       if (vi->sq[0].napi.weight)
> +               ec->tx_max_coalesced_frames = 1;
> +
> +       return 0;
> +}
> +
>  static void virtnet_init_settings(struct net_device *dev)
>  {
>         struct virtnet_info *vi = netdev_priv(dev);
> @@ -2219,6 +2279,8 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
>         .get_ts_info = ethtool_op_get_ts_info,
>         .get_link_ksettings = virtnet_get_link_ksettings,
>         .set_link_ksettings = virtnet_set_link_ksettings,
> +       .set_coalesce = virtnet_set_coalesce,
> +       .get_coalesce = virtnet_get_coalesce,
>  };
>
>  static void virtnet_freeze_down(struct virtio_device *vdev)
> --
> 2.17.1
>

^ permalink raw reply

* Re: [PATCH net-next 5/5] ebpf: Add sample ebpf program for SOCKET_SG_FILTER
From: Sowmini Varadhan @ 2018-09-13 10:10 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tushar Dave, ast, daniel, davem, santosh.shilimkar,
	jakub.kicinski, quentin.monnet, jiong.wang, sandipan,
	john.fastabend, kafai, rdna, yhs, netdev, rds-devel
In-Reply-To: <20180913020757.geqe55gnsexv6icp@ast-mbp>

On (09/12/18 19:07), Alexei Starovoitov wrote:
> 
> I didn't know that. The way I understand your statement that
> this new program type, new sg logic, and all the complexity
> are only applicable to RDMA capable hw and RDS.

I dont know if you have been following the RFC series at all 
(and DanielB/JohnF feedback to it) but that is not what the patch 
set is about.

To repeat a summary of the original problem statement:

RDS (hardly a "niche" driver, let's please not get carried away 
with strong assertions based on incomplete understanding), 
is an example of a driver that happens to pass up packets
as both scatterlist and sk_buffs to the ULPs. 

The scatterlist comes from IB, the sk_buffs come from the ethernet
drivers. At the moment, the only way to build firewalls for
this is to convert scatterlist to skb and use  either netfilter
or eBPF on the skb. What Tushar is adding is support to use eBPF
on the scatterlist itself, so that you dont have to do this
inefficient scatterlist->skb conversion.

> In such case, I think, such BPF extensions do not belong
> in the upstream kernel. I don't want BPF to support niche technologies,

> since the maintenance overhead makes it prohibitive long term.

After I sent the message, I noticed that selftests/bpf has
some tests using veth/netns. I think Tushar should be able to write
tests for the rds-tcp path (and thus test the eBPF infrastructure, 
which seems to be what you are worried about)

Does that address your concern?

--Sowmini

^ permalink raw reply

* Re: [PATCH 1/1] ipv6: Add sockopt IPV6_MULTICAST_ALL analogue to IP_MULTICAST_ALL
From: David Miller @ 2018-09-13 15:17 UTC (permalink / raw)
  To: nautsch2
  Cc: netdev, linux-kernel, kuznet, yoshfuji, gregkh, ek, tglx, maze,
	shli, kstewart, pombredanne
In-Reply-To: <20180910082715.11506-2-nautsch2@gmail.com>

From: Andre Naujoks <nautsch2@gmail.com>
Date: Mon, 10 Sep 2018 10:27:15 +0200

> The socket option will be enabled by default to ensure current behaviour
> is not changed. This is the same for the IPv4 version.
> 
> A socket bound to in6addr_any and a specific port will receive all traffic
> on that port. Analogue to IP_MULTICAST_ALL, disable this behaviour, if
> one or more multicast groups were joined (using said socket) and only
> pass on multicast traffic from groups, which were explicitly joined via
> this socket.
> 
> Without this option disabled a socket (system even) joined to multiple
> multicast groups is very hard to get right. Filtering by destination
> address has to take place in user space to avoid receiving multicast
> traffic from other multicast groups, which might have traffic on the same
> port.
> 
> The extension of the IP_MULTICAST_ALL socketoption to just apply to ipv6,
> too, is not done to avoid changing the behaviour of current applications.
> 
> Signed-off-by: Andre Naujoks <nautsch2@gmail.com>
> Acked-By: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>

Applied to net-next, thank you.

^ permalink raw reply

* Re: [PATCH net-next RFC] virtio_net: ethtool tx napi configuration
From: Tobin C. Harding @ 2018-09-13 10:04 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: netdev, jasowang, mst, f.fainelli, Willem de Bruijn
In-Reply-To: <20180912232911.218610-1-willemdebruijn.kernel@gmail.com>

On Wed, Sep 12, 2018 at 07:29:11PM -0400, Willem de Bruijn wrote:
> From: Willem de Bruijn <willemb@google.com>
> 
> Implement ethtool .set_priv_flags and .get_priv_flags handlers
> and use ethtool private flags to toggle transmit napi:
> 
>   ethtool --set-priv-flags eth0 tx-napi on
>   ethtool --show-priv-flags eth0
> 
> Link: https://patchwork.ozlabs.org/patch/948149/
> Suggested-by: Jason Wang <jasowang@redhat.com>
> Suggested-by: Florian Fainelli <f.fainelli@gmail.com>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
>  drivers/net/virtio_net.c | 49 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 49 insertions(+)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 765920905226..9ca7e0a0f0d9 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -73,6 +73,10 @@ static const unsigned long guest_offloads[] = {
>  	VIRTIO_NET_F_GUEST_UFO
>  };
>  
> +static const char virtnet_ethtool_priv_flags[][ETH_GSTRING_LEN] = {
> +	"tx-napi",
> +};
> +
>  struct virtnet_stat_desc {
>  	char desc[ETH_GSTRING_LEN];
>  	size_t offset;
> @@ -2059,6 +2063,9 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
>  			}
>  		}
>  		break;
> +	case ETH_SS_PRIV_FLAGS:
> +		memcpy(data, virtnet_ethtool_priv_flags,
> +		       sizeof(virtnet_ethtool_priv_flags));
>  	}
>  }
>  
> @@ -2070,6 +2077,9 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
>  	case ETH_SS_STATS:
>  		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
>  					       VIRTNET_SQ_STATS_LEN);
> +	case ETH_SS_PRIV_FLAGS:
> +		return ARRAY_SIZE(virtnet_ethtool_priv_flags);
> +
>  	default:
>  		return -EOPNOTSUPP;
>  	}
> @@ -2181,6 +2191,43 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
>  	return 0;
>  }
>  
> +static int virtnet_set_priv_flags(struct net_device *dev, u32 priv_flags)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	int i, napi_weight;
> +
> +	napi_weight = priv_flags & 0x1 ? NAPI_POLL_WEIGHT : 0;
> +
> +	if (napi_weight ^ vi->sq[0].napi.weight) {
> +		for (i = 0; i < vi->max_queue_pairs; i++) {
> +			struct netdev_queue *txq =
> +				netdev_get_tx_queue(vi->dev, i);
> +
> +			virtnet_napi_tx_disable(&vi->sq[i].napi);
> +			__netif_tx_lock_bh(txq);
> +			vi->sq[i].napi.weight = napi_weight;
> +			if (!napi_weight)
> +				virtqueue_enable_cb(vi->sq[i].vq);
> +			__netif_tx_unlock_bh(txq);
> +			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
> +					       &vi->sq[i].napi);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static u32 virtnet_get_priv_flags(struct net_device *dev)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	int priv_flags = 0;
> +
> +	if (vi->sq[0].napi.weight)
> +		priv_flags |= 0x1;
> +
> +	return priv_flags;
> +}

Why the use of priv_flags here?  Is there some reason that we don't want
to use the more simple

    static u32 virtnet_get_priv_flags(struct net_device *dev)
    {
            struct virtnet_info *vi = netdev_priv(dev);
    
    	    if (vi->sq[0].napi.weight)
       	            return 1;
    
    	    return 0;
    }


thanks,
Tobin.

^ permalink raw reply

* [PATCH v3 27/30] ipv4: frags: precedence bug in ip_expire()
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet, Dan Carpenter
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Dan Carpenter <dan.carpenter@oracle.com>

We accidentally removed the parentheses here, but they are required
because '!' has higher precedence than '&'.

Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 70837ffe3085c9a91488b52ca13ac84424da1042)
---
 net/ipv4/ip_fragment.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 0e8f8de77e71..7cb7ed761d8c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -154,7 +154,7 @@ static void ip_expire(struct timer_list *t)
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-	if (!qp->q.flags & INET_FRAG_FIRST_IN)
+	if (!(qp->q.flags & INET_FRAG_FIRST_IN))
 		goto out;
 
 	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 25/30] net: add rb_to_skb() and other rb tree helpers
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

Geeralize private netem_rb_to_skb()

TCP rtx queue will soon be converted to rb-tree,
so we will need skb_rbtree_walk() helpers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 18a4c0eab2623cc95be98a1e6af1ad18e7695977)
---
 include/linux/skbuff.h  | 18 ++++++++++++++++++
 net/ipv4/tcp_fastopen.c |  8 +++-----
 net/ipv4/tcp_input.c    | 33 ++++++++++++---------------------
 net/sched/sch_netem.c   | 14 ++++----------
 4 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 758084b434c8..2837e55df03e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3169,6 +3169,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
 
 #define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
 
+#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
+#define skb_rb_first(root) rb_to_skb(rb_first(root))
+#define skb_rb_last(root)  rb_to_skb(rb_last(root))
+#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
+#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))
+
 #define skb_queue_walk(queue, skb) \
 		for (skb = (queue)->next;					\
 		     skb != (struct sk_buff *)(queue);				\
@@ -3183,6 +3189,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
 		for (; skb != (struct sk_buff *)(queue);			\
 		     skb = skb->next)
 
+#define skb_rbtree_walk(skb, root)						\
+		for (skb = skb_rb_first(root); skb != NULL;			\
+		     skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from(skb)						\
+		for (; skb != NULL;						\
+		     skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from_safe(skb, tmp)					\
+		for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);	\
+		     skb = tmp)
+
 #define skb_queue_walk_from_safe(queue, skb, tmp)				\
 		for (tmp = skb->next;						\
 		     skb != (struct sk_buff *)(queue);				\
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index fbbeda647774..0567edb76522 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -458,17 +458,15 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
 void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct rb_node *p;
-	struct sk_buff *skb;
 	struct dst_entry *dst;
+	struct sk_buff *skb;
 
 	if (!tp->syn_fastopen)
 		return;
 
 	if (!tp->data_segs_in) {
-		p = rb_first(&tp->out_of_order_queue);
-		if (p && !rb_next(p)) {
-			skb = rb_entry(p, struct sk_buff, rbnode);
+		skb = skb_rb_first(&tp->out_of_order_queue);
+		if (skb && !skb_rb_next(skb)) {
 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
 				tcp_fastopen_active_disable(sk);
 				return;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bdabd748f4bc..991f382afc1b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4372,7 +4372,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
 	p = rb_first(&tp->out_of_order_queue);
 	while (p) {
-		skb = rb_entry(p, struct sk_buff, rbnode);
+		skb = rb_to_skb(p);
 		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
 			break;
 
@@ -4440,7 +4440,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct rb_node **p, *q, *parent;
+	struct rb_node **p, *parent;
 	struct sk_buff *skb1;
 	u32 seq, end_seq;
 	bool fragstolen;
@@ -4503,7 +4503,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	parent = NULL;
 	while (*p) {
 		parent = *p;
-		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		skb1 = rb_to_skb(parent);
 		if (before(seq, TCP_SKB_CB(skb1)->seq)) {
 			p = &parent->rb_left;
 			continue;
@@ -4548,9 +4548,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 
 merge_right:
 	/* Remove other segments covered by skb. */
-	while ((q = rb_next(&skb->rbnode)) != NULL) {
-		skb1 = rb_entry(q, struct sk_buff, rbnode);
-
+	while ((skb1 = skb_rb_next(skb)) != NULL) {
 		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
 			break;
 		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
@@ -4565,7 +4563,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		tcp_drop(sk, skb1);
 	}
 	/* If there is no skb after us, we are the last_skb ! */
-	if (!q)
+	if (!skb1)
 		tp->ooo_last_skb = skb;
 
 add_sack:
@@ -4749,7 +4747,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li
 	if (list)
 		return !skb_queue_is_last(list, skb) ? skb->next : NULL;
 
-	return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+	return skb_rb_next(skb);
 }
 
 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4778,7 +4776,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
 
 	while (*p) {
 		parent = *p;
-		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		skb1 = rb_to_skb(parent);
 		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
 			p = &parent->rb_left;
 		else
@@ -4898,19 +4896,12 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 range_truesize, sum_tiny = 0;
 	struct sk_buff *skb, *head;
-	struct rb_node *p;
 	u32 start, end;
 
-	p = rb_first(&tp->out_of_order_queue);
-	skb = rb_entry_safe(p, struct sk_buff, rbnode);
+	skb = skb_rb_first(&tp->out_of_order_queue);
 new_range:
 	if (!skb) {
-		p = rb_last(&tp->out_of_order_queue);
-		/* Note: This is possible p is NULL here. We do not
-		 * use rb_entry_safe(), as ooo_last_skb is valid only
-		 * if rbtree is not empty.
-		 */
-		tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
+		tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
 		return;
 	}
 	start = TCP_SKB_CB(skb)->seq;
@@ -4918,7 +4909,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 	range_truesize = skb->truesize;
 
 	for (head = skb;;) {
-		skb = tcp_skb_next(skb, NULL);
+		skb = skb_rb_next(skb);
 
 		/* Range is terminated when we see a gap or when
 		 * we are at the queue end.
@@ -4974,7 +4965,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 		prev = rb_prev(node);
 		rb_erase(node, &tp->out_of_order_queue);
 		goal -= rb_to_skb(node)->truesize;
-		tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
+		tcp_drop(sk, rb_to_skb(node));
 		if (!prev || goal <= 0) {
 			sk_mem_reclaim(sk);
 			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
@@ -4984,7 +4975,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 		}
 		node = prev;
 	} while (node);
-	tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
+	tp->ooo_last_skb = rb_to_skb(prev);
 
 	/* Reset SACK state.  A conforming SACK implementation will
 	 * do the same at a timeout based retransmit.  When a connection
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 8c8df75dbead..2a2ab6bfe5d8 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -149,12 +149,6 @@ struct netem_skb_cb {
 	ktime_t		tstamp_save;
 };
 
-
-static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
-{
-	return rb_entry(rb, struct sk_buff, rbnode);
-}
-
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 {
 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
@@ -365,7 +359,7 @@ static void tfifo_reset(struct Qdisc *sch)
 	struct rb_node *p;
 
 	while ((p = rb_first(&q->t_root))) {
-		struct sk_buff *skb = netem_rb_to_skb(p);
+		struct sk_buff *skb = rb_to_skb(p);
 
 		rb_erase(p, &q->t_root);
 		rtnl_kfree_skbs(skb, skb);
@@ -382,7 +376,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 		struct sk_buff *skb;
 
 		parent = *p;
-		skb = netem_rb_to_skb(parent);
+		skb = rb_to_skb(parent);
 		if (tnext >= netem_skb_cb(skb)->time_to_send)
 			p = &parent->rb_right;
 		else
@@ -538,7 +532,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 				struct sk_buff *t_skb;
 				struct netem_skb_cb *t_last;
 
-				t_skb = netem_rb_to_skb(rb_last(&q->t_root));
+				t_skb = skb_rb_last(&q->t_root);
 				t_last = netem_skb_cb(t_skb);
 				if (!last ||
 				    t_last->time_to_send > last->time_to_send) {
@@ -618,7 +612,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 	if (p) {
 		psched_time_t time_to_send;
 
-		skb = netem_rb_to_skb(p);
+		skb = rb_to_skb(p);
 
 		/* if more time remaining? */
 		time_to_send = netem_skb_cb(skb)->time_to_send;
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 22/30] net: modify skb_rbtree_purge to return the truesize of all purged skbs.
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet, Peter Oskolkov, Florian Westphal
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Peter Oskolkov <posk@google.com>

Tested: see the next patch is the series.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 385114dec8a49b5e5945e77ba7de6356106713f4)
---
 include/linux/skbuff.h | 2 +-
 net/core/skbuff.c      | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f4749678b7ee..9c8457375aee 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2581,7 +2581,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
 		kfree_skb(skb);
 }
 
-void skb_rbtree_purge(struct rb_root *root);
+unsigned int skb_rbtree_purge(struct rb_root *root);
 
 void *netdev_alloc_frag(unsigned int fragsz);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c7c5f05f2af1..8fd690def5c1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2842,23 +2842,27 @@ EXPORT_SYMBOL(skb_queue_purge);
 /**
  *	skb_rbtree_purge - empty a skb rbtree
  *	@root: root of the rbtree to empty
+ *	Return value: the sum of truesizes of all purged skbs.
  *
  *	Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
  *	the list and one reference dropped. This function does not take
  *	any lock. Synchronization should be handled by the caller (e.g., TCP
  *	out-of-order queue is protected by the socket lock).
  */
-void skb_rbtree_purge(struct rb_root *root)
+unsigned int skb_rbtree_purge(struct rb_root *root)
 {
 	struct rb_node *p = rb_first(root);
+	unsigned int sum = 0;
 
 	while (p) {
 		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
 
 		p = rb_next(p);
 		rb_erase(&skb->rbnode, root);
+		sum += skb->truesize;
 		kfree_skb(skb);
 	}
+	return sum;
 }
 
 /**
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 21/30] net: speed up skb_rbtree_purge()
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

As measured in my prior patch ("sch_netem: faster rb tree removal"),
rbtree_postorder_for_each_entry_safe() is nice looking but much slower
than using rb_next() directly, except when tree is small enough
to fit in CPU caches (then the cost is the same)

Also note that there is not even an increase of text size :
$ size net/core/skbuff.o.before net/core/skbuff.o
   text	   data	    bss	    dec	    hex	filename
  40711	   1298	      0	  42009	   a419	net/core/skbuff.o.before
  40711	   1298	      0	  42009	   a419	net/core/skbuff.o

From: Eric Dumazet <edumazet@google.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 7c90584c66cc4b033a3b684b0e0950f79e7b7166)
---
 net/core/skbuff.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2e5eeba97de9..c7c5f05f2af1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2850,12 +2850,15 @@ EXPORT_SYMBOL(skb_queue_purge);
  */
 void skb_rbtree_purge(struct rb_root *root)
 {
-	struct sk_buff *skb, *next;
+	struct rb_node *p = rb_first(root);
 
-	rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode)
-		kfree_skb(skb);
+	while (p) {
+		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
 
-	*root = RB_ROOT;
+		p = rb_next(p);
+		rb_erase(&skb->rbnode, root);
+		kfree_skb(skb);
+	}
 }
 
 /**
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 16/30] rhashtable: reorganize struct rhashtable layout
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

While under frags DDOS I noticed unfortunate false sharing between
@nelems and @params.automatic_shrinking

Move @nelems at the end of struct rhashtable so that first cache line
is shared between all cpus, because almost never dirtied.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit e5d672a0780d9e7118caad4c171ec88b8299398d)
---
 include/linux/rhashtable.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 7fd514f36e74..a4be6388a980 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -152,25 +152,25 @@ struct rhashtable_params {
 /**
  * struct rhashtable - Hash table handle
  * @tbl: Bucket table
- * @nelems: Number of elements in table
  * @key_len: Key length for hashfn
- * @p: Configuration parameters
  * @max_elems: Maximum number of elements in table
+ * @p: Configuration parameters
  * @rhlist: True if this is an rhltable
  * @run_work: Deferred worker to expand/shrink asynchronously
  * @mutex: Mutex to protect current/future table swapping
  * @lock: Spin lock to protect walker list
+ * @nelems: Number of elements in table
  */
 struct rhashtable {
 	struct bucket_table __rcu	*tbl;
-	atomic_t			nelems;
 	unsigned int			key_len;
-	struct rhashtable_params	p;
 	unsigned int			max_elems;
+	struct rhashtable_params	p;
 	bool				rhlist;
 	struct work_struct		run_work;
 	struct mutex                    mutex;
 	spinlock_t			lock;
+	atomic_t			nelems;
 };
 
 /**
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 12/30] inet: frags: remove inet_frag_maybe_warn_overflow()
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

This function is obsolete, after rhashtable addition to inet defrag.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 2d44ed22e607f9a285b049de2263e3840673a260)
---
 include/net/inet_frag.h                 |  2 --
 net/ieee802154/6lowpan/reassembly.c     |  5 ++---
 net/ipv4/inet_fragment.c                | 11 -----------
 net/ipv4/ip_fragment.c                  |  5 ++---
 net/ipv6/netfilter/nf_conntrack_reasm.c |  5 ++---
 net/ipv6/reassembly.c                   |  5 ++---
 6 files changed, 8 insertions(+), 25 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 0e8e159d88f7..95e353e3305b 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -110,8 +110,6 @@ void inet_frags_exit_net(struct netns_frags *nf);
 void inet_frag_kill(struct inet_frag_queue *q);
 void inet_frag_destroy(struct inet_frag_queue *q);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-				   const char *prefix);
 
 static inline void inet_frag_put(struct inet_frag_queue *q)
 {
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 0fa0121f85d4..1aec71a3f904 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -84,10 +84,9 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
 	struct inet_frag_queue *q;
 
 	q = inet_frag_find(&ieee802154_lowpan->frags, &key);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct lowpan_frag_queue, q);
 }
 
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index ebb8f411e0db..c9e35b81d093 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -218,14 +218,3 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
 	return inet_frag_create(nf, key);
 }
 EXPORT_SYMBOL(inet_frag_find);
-
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-				   const char *prefix)
-{
-	static const char msg[] = "inet_frag_find: Fragment hash bucket"
-		" list length grew over limit. Dropping fragment.\n";
-
-	if (PTR_ERR(q) == -ENOBUFS)
-		net_dbg_ratelimited("%s%s", prefix, msg);
-}
-EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1222aee3e5ee..38cbf56bb48e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -221,10 +221,9 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
 	struct inet_frag_queue *q;
 
 	q = inet_frag_find(&net->ipv4.frags, &key);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct ipq, q);
 }
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 8b12431ae296..54ce1d2a9a9d 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -178,10 +178,9 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
 	struct inet_frag_queue *q;
 
 	q = inet_frag_find(&net->nf_frag.frags, &key);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct frag_queue, q);
 }
 
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 70acad126d04..2a77fda5e3bc 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -155,10 +155,9 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
 		key.iif = 0;
 
 	q = inet_frag_find(&net->ipv6.frags, &key);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct frag_queue, q);
 }
 
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 11/30] inet: frags: get rif of inet_frag_evicting()
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

This refactors ip_expire() since one indentation level is removed.

Note: in the future, we should try hard to avoid the skb_clone()
since this is a serious performance cost.
Under DDOS, the ICMP message wont be sent because of rate limits.

Fact that ip6_expire_frag_queue() does not use skb_clone() is
disturbing too. Presumably IPv6 should have the same
issue than the one we fixed in commit ec4fbd64751d
("inet: frag: release spinlock before calling icmp_send()")

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 399d1404be660d355192ff4df5ccc3f4159ec1e4)
---
 include/net/inet_frag.h |  5 ----
 net/ipv4/ip_fragment.c  | 65 ++++++++++++++++++++---------------------
 net/ipv6/reassembly.c   |  4 ---
 3 files changed, 32 insertions(+), 42 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 4b5449df0aad..0e8e159d88f7 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -119,11 +119,6 @@ static inline void inet_frag_put(struct inet_frag_queue *q)
 		inet_frag_destroy(q);
 }
 
-static inline bool inet_frag_evicting(struct inet_frag_queue *q)
-{
-	return false;
-}
-
 /* Memory Tracking Functions. */
 
 static inline int frag_mem_limit(struct netns_frags *nf)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 94451fad9994..1222aee3e5ee 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -143,8 +143,11 @@ static bool frag_expire_skip_icmp(u32 user)
 static void ip_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
-	struct ipq *qp;
+	struct sk_buff *clone, *head;
+	const struct iphdr *iph;
 	struct net *net;
+	struct ipq *qp;
+	int err;
 
 	qp = container_of(frag, struct ipq, q);
 	net = container_of(qp->q.net, struct net, ipv4.frags);
@@ -158,45 +161,41 @@ static void ip_expire(struct timer_list *t)
 	ipq_kill(qp);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 
-	if (!inet_frag_evicting(&qp->q)) {
-		struct sk_buff *clone, *head = qp->q.fragments;
-		const struct iphdr *iph;
-		int err;
+	head = qp->q.fragments;
 
-		__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
+	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-		if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
-			goto out;
+	if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+		goto out;
 
-		head->dev = dev_get_by_index_rcu(net, qp->iif);
-		if (!head->dev)
-			goto out;
+	head->dev = dev_get_by_index_rcu(net, qp->iif);
+	if (!head->dev)
+		goto out;
 
 
-		/* skb has no dst, perform route lookup again */
-		iph = ip_hdr(head);
-		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+	/* skb has no dst, perform route lookup again */
+	iph = ip_hdr(head);
+	err = ip_route_input_noref(head, iph->daddr, iph->saddr,
 					   iph->tos, head->dev);
-		if (err)
-			goto out;
+	if (err)
+		goto out;
 
-		/* Only an end host needs to send an ICMP
-		 * "Fragment Reassembly Timeout" message, per RFC792.
-		 */
-		if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
-		    (skb_rtable(head)->rt_type != RTN_LOCAL))
-			goto out;
-
-		clone = skb_clone(head, GFP_ATOMIC);
-
-		/* Send an ICMP "Fragment Reassembly Timeout" message. */
-		if (clone) {
-			spin_unlock(&qp->q.lock);
-			icmp_send(clone, ICMP_TIME_EXCEEDED,
-				  ICMP_EXC_FRAGTIME, 0);
-			consume_skb(clone);
-			goto out_rcu_unlock;
-		}
+	/* Only an end host needs to send an ICMP
+	 * "Fragment Reassembly Timeout" message, per RFC792.
+	 */
+	if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
+	    (skb_rtable(head)->rt_type != RTN_LOCAL))
+		goto out;
+
+	clone = skb_clone(head, GFP_ATOMIC);
+
+	/* Send an ICMP "Fragment Reassembly Timeout" message. */
+	if (clone) {
+		spin_unlock(&qp->q.lock);
+		icmp_send(clone, ICMP_TIME_EXCEEDED,
+			  ICMP_EXC_FRAGTIME, 0);
+		consume_skb(clone);
+		goto out_rcu_unlock;
 	}
 out:
 	spin_unlock(&qp->q.lock);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 3fc853e4492a..70acad126d04 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -106,10 +106,6 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
 		goto out_rcu_unlock;
 
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
-
-	if (inet_frag_evicting(&fq->q))
-		goto out_rcu_unlock;
-
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
 
 	/* Don't send error if the first segment did not arrive. */
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 08/30] rhashtable: add schedule points
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

Rehashing and destroying large hash table takes a lot of time,
and happens in process context. It is safe to add cond_resched()
in rhashtable_rehash_table() and rhashtable_free_and_destroy()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit ae6da1f503abb5a5081f9f6c4a6881de97830f3e)
---
 lib/rhashtable.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 39215c724fc7..cebbcec877d7 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -364,6 +364,7 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 		err = rhashtable_rehash_chain(ht, old_hash);
 		if (err)
 			return err;
+		cond_resched();
 	}
 
 	/* Publish the new table pointer. */
@@ -1073,6 +1074,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 		for (i = 0; i < tbl->size; i++) {
 			struct rhash_head *pos, *next;
 
+			cond_resched();
 			for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
 			     next = !rht_is_a_nulls(pos) ?
 					rht_dereference(pos->next, ht) : NULL;
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 07/30] ipv6: export ip6 fragments sysctl to unprivileged users
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet, Nikolay Borisov
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

IPv4 was changed in commit 52a773d645e9 ("net: Export ip fragment
sysctl to unprivileged users")

The only sysctl that is not per-netns is not used :
ip6frag_secret_interval

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 18dcbe12fe9fca0ab825f7eff993060525ac2503)
---
 net/ipv6/reassembly.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 42b6b2ba447a..f0071b113a92 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -649,10 +649,6 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
 		table[1].data = &net->ipv6.frags.low_thresh;
 		table[1].extra2 = &net->ipv6.frags.high_thresh;
 		table[2].data = &net->ipv6.frags.timeout;
-
-		/* Don't export sysctls to unprivileged users */
-		if (net->user_ns != &init_user_ns)
-			table[0].procname = NULL;
 	}
 
 	hdr = register_net_sysctl(net, "net/ipv6", table);
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 02/30] inet: frags: add a pointer to struct netns_frags
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

In order to simplify the API, add a pointer to struct inet_frags.
This will allow us to make things less complex.

These functions no longer have a struct inet_frags parameter :

inet_frag_destroy(struct inet_frag_queue *q  /*, struct inet_frags *f */)
inet_frag_put(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frag_kill(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frags_exit_net(struct netns_frags *nf /*, struct inet_frags *f */)
ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 093ba72914b696521e4885756a68a3332782c8de)
---
 include/net/inet_frag.h                 | 11 ++++++-----
 include/net/ipv6.h                      |  3 +--
 net/ieee802154/6lowpan/reassembly.c     | 13 +++++++------
 net/ipv4/inet_fragment.c                | 17 ++++++++++-------
 net/ipv4/ip_fragment.c                  |  9 +++++----
 net/ipv6/netfilter/nf_conntrack_reasm.c | 16 +++++++++-------
 net/ipv6/reassembly.c                   | 20 ++++++++++----------
 7 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 2ad894e446ac..fd338293a095 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -10,6 +10,7 @@ struct netns_frags {
 	int			high_thresh;
 	int			low_thresh;
 	int			max_dist;
+	struct inet_frags	*f;
 };
 
 /**
@@ -109,20 +110,20 @@ static inline int inet_frags_init_net(struct netns_frags *nf)
 	atomic_set(&nf->mem, 0);
 	return 0;
 }
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
+void inet_frags_exit_net(struct netns_frags *nf);
 
-void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
+void inet_frag_kill(struct inet_frag_queue *q);
+void inet_frag_destroy(struct inet_frag_queue *q);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 		struct inet_frags *f, void *key, unsigned int hash);
 
 void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
 				   const char *prefix);
 
-static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
+static inline void inet_frag_put(struct inet_frag_queue *q)
 {
 	if (refcount_dec_and_test(&q->refcnt))
-		inet_frag_destroy(q, f);
+		inet_frag_destroy(q);
 }
 
 static inline bool inet_frag_evicting(struct inet_frag_queue *q)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f280c61e019a..ff8407b19d05 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -560,8 +560,7 @@ struct frag_queue {
 	u8			ecn;
 };
 
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
-			   struct inet_frags *frags);
+void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
 
 static inline bool ipv6_addr_any(const struct in6_addr *a)
 {
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 9757ce6c077a..9ccb8458b5c3 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -93,10 +93,10 @@ static void lowpan_frag_expire(unsigned long data)
 	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
-	inet_frag_kill(&fq->q, &lowpan_frags);
+	inet_frag_kill(&fq->q);
 out:
 	spin_unlock(&fq->q.lock);
-	inet_frag_put(&fq->q, &lowpan_frags);
+	inet_frag_put(&fq->q);
 }
 
 static inline struct lowpan_frag_queue *
@@ -229,7 +229,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
 	struct sk_buff *fp, *head = fq->q.fragments;
 	int sum_truesize;
 
-	inet_frag_kill(&fq->q, &lowpan_frags);
+	inet_frag_kill(&fq->q);
 
 	/* Make the one we just received the head. */
 	if (prev) {
@@ -437,7 +437,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
 		ret = lowpan_frag_queue(fq, skb, frag_type);
 		spin_unlock(&fq->q.lock);
 
-		inet_frag_put(&fq->q, &lowpan_frags);
+		inet_frag_put(&fq->q);
 		return ret;
 	}
 
@@ -585,13 +585,14 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 	ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
+	ieee802154_lowpan->frags.f = &lowpan_frags;
 
 	res = inet_frags_init_net(&ieee802154_lowpan->frags);
 	if (res < 0)
 		return res;
 	res = lowpan_frags_ns_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+		inet_frags_exit_net(&ieee802154_lowpan->frags);
 	return res;
 }
 
@@ -601,7 +602,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
 		net_ieee802154_lowpan(net);
 
 	lowpan_frags_ns_sysctl_unregister(net);
-	inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+	inet_frags_exit_net(&ieee802154_lowpan->frags);
 }
 
 static struct pernet_operations lowpan_frags_ops = {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index ba4454ecdf0f..4b44f973c37f 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -219,8 +219,9 @@ void inet_frags_fini(struct inet_frags *f)
 }
 EXPORT_SYMBOL(inet_frags_fini);
 
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
+void inet_frags_exit_net(struct netns_frags *nf)
 {
+	struct inet_frags *f =nf->f;
 	unsigned int seq;
 	int i;
 
@@ -264,33 +265,34 @@ __acquires(hb->chain_lock)
 	return hb;
 }
 
-static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+static inline void fq_unlink(struct inet_frag_queue *fq)
 {
 	struct inet_frag_bucket *hb;
 
-	hb = get_frag_bucket_locked(fq, f);
+	hb = get_frag_bucket_locked(fq, fq->net->f);
 	hlist_del(&fq->list);
 	fq->flags |= INET_FRAG_COMPLETE;
 	spin_unlock(&hb->chain_lock);
 }
 
-void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
+void inet_frag_kill(struct inet_frag_queue *fq)
 {
 	if (del_timer(&fq->timer))
 		refcount_dec(&fq->refcnt);
 
 	if (!(fq->flags & INET_FRAG_COMPLETE)) {
-		fq_unlink(fq, f);
+		fq_unlink(fq);
 		refcount_dec(&fq->refcnt);
 	}
 }
 EXPORT_SYMBOL(inet_frag_kill);
 
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
+void inet_frag_destroy(struct inet_frag_queue *q)
 {
 	struct sk_buff *fp;
 	struct netns_frags *nf;
 	unsigned int sum, sum_truesize = 0;
+	struct inet_frags *f;
 
 	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
 	WARN_ON(del_timer(&q->timer) != 0);
@@ -298,6 +300,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 	/* Release all fragment data. */
 	fp = q->fragments;
 	nf = q->net;
+	f = nf->f;
 	while (fp) {
 		struct sk_buff *xp = fp->next;
 
@@ -333,7 +336,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 			refcount_inc(&qp->refcnt);
 			spin_unlock(&hb->chain_lock);
 			qp_in->flags |= INET_FRAG_COMPLETE;
-			inet_frag_put(qp_in, f);
+			inet_frag_put(qp_in);
 			return qp;
 		}
 	}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 916def2e2afe..c32718b00761 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -168,7 +168,7 @@ static void ip4_frag_free(struct inet_frag_queue *q)
 
 static void ipq_put(struct ipq *ipq)
 {
-	inet_frag_put(&ipq->q, &ip4_frags);
+	inet_frag_put(&ipq->q);
 }
 
 /* Kill ipq entry. It is not destroyed immediately,
@@ -176,7 +176,7 @@ static void ipq_put(struct ipq *ipq)
  */
 static void ipq_kill(struct ipq *ipq)
 {
-	inet_frag_kill(&ipq->q, &ip4_frags);
+	inet_frag_kill(&ipq->q);
 }
 
 static bool frag_expire_skip_icmp(u32 user)
@@ -876,20 +876,21 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 	net->ipv4.frags.timeout = IP_FRAG_TIME;
 
 	net->ipv4.frags.max_dist = 64;
+	net->ipv4.frags.f = &ip4_frags;
 
 	res = inet_frags_init_net(&net->ipv4.frags);
 	if (res < 0)
 		return res;
 	res = ip4_frags_ns_ctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+		inet_frags_exit_net(&net->ipv4.frags);
 	return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
 {
 	ip4_frags_ns_ctl_unregister(net);
-	inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+	inet_frags_exit_net(&net->ipv4.frags);
 }
 
 static struct pernet_operations ip4_frags_ops = {
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index afa9ea76155d..7ea2b4490672 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -177,7 +177,7 @@ static void nf_ct_frag6_expire(unsigned long data)
 	fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
 	net = container_of(fq->q.net, struct net, nf_frag.frags);
 
-	ip6_expire_frag_queue(net, fq, &nf_frags);
+	ip6_expire_frag_queue(net, fq);
 }
 
 /* Creation primitives. */
@@ -263,7 +263,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 			 * this case. -DaveM
 			 */
 			pr_debug("end of fragment not rounded to 8 bytes.\n");
-			inet_frag_kill(&fq->q, &nf_frags);
+			inet_frag_kill(&fq->q);
 			return -EPROTO;
 		}
 		if (end > fq->q.len) {
@@ -356,7 +356,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 	return 0;
 
 discard_fq:
-	inet_frag_kill(&fq->q, &nf_frags);
+	inet_frag_kill(&fq->q);
 err:
 	return -EINVAL;
 }
@@ -378,7 +378,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev,  struct net_devic
 	int    payload_len;
 	u8 ecn;
 
-	inet_frag_kill(&fq->q, &nf_frags);
+	inet_frag_kill(&fq->q);
 
 	WARN_ON(head == NULL);
 	WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
@@ -623,7 +623,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 
 out_unlock:
 	spin_unlock_bh(&fq->q.lock);
-	inet_frag_put(&fq->q, &nf_frags);
+	inet_frag_put(&fq->q);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
@@ -635,19 +635,21 @@ static int nf_ct_net_init(struct net *net)
 	net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
+	net->nf_frag.frags.f = &nf_frags;
+
 	res = inet_frags_init_net(&net->nf_frag.frags);
 	if (res < 0)
 		return res;
 	res = nf_ct_frag6_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+		inet_frags_exit_net(&net->nf_frag.frags);
 	return res;
 }
 
 static void nf_ct_net_exit(struct net *net)
 {
 	nf_ct_frags6_sysctl_unregister(net);
-	inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+	inet_frags_exit_net(&net->nf_frag.frags);
 }
 
 static struct pernet_operations nf_ct_net_ops = {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 38bf38a9717f..26f737c3fc7b 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -128,8 +128,7 @@ void ip6_frag_init(struct inet_frag_queue *q, const void *a)
 }
 EXPORT_SYMBOL(ip6_frag_init);
 
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
-			   struct inet_frags *frags)
+void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
 {
 	struct net_device *dev = NULL;
 
@@ -138,7 +137,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
-	inet_frag_kill(&fq->q, frags);
+	inet_frag_kill(&fq->q);
 
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(net, fq->iif);
@@ -166,7 +165,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 	rcu_read_unlock();
 out:
 	spin_unlock(&fq->q.lock);
-	inet_frag_put(&fq->q, frags);
+	inet_frag_put(&fq->q);
 }
 EXPORT_SYMBOL(ip6_expire_frag_queue);
 
@@ -178,7 +177,7 @@ static void ip6_frag_expire(unsigned long data)
 	fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
 	net = container_of(fq->q.net, struct net, ipv6.frags);
 
-	ip6_expire_frag_queue(net, fq, &ip6_frags);
+	ip6_expire_frag_queue(net, fq);
 }
 
 static struct frag_queue *
@@ -363,7 +362,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	return -1;
 
 discard_fq:
-	inet_frag_kill(&fq->q, &ip6_frags);
+	inet_frag_kill(&fq->q);
 err:
 	__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 			IPSTATS_MIB_REASMFAILS);
@@ -390,7 +389,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	int sum_truesize;
 	u8 ecn;
 
-	inet_frag_kill(&fq->q, &ip6_frags);
+	inet_frag_kill(&fq->q);
 
 	ecn = ip_frag_ecn_table[fq->ecn];
 	if (unlikely(ecn == 0xff))
@@ -568,7 +567,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
 
 		spin_unlock(&fq->q.lock);
-		inet_frag_put(&fq->q, &ip6_frags);
+		inet_frag_put(&fq->q);
 		return ret;
 	}
 
@@ -719,6 +718,7 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 	net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
+	net->ipv6.frags.f = &ip6_frags;
 
 	res = inet_frags_init_net(&net->ipv6.frags);
 	if (res < 0)
@@ -726,14 +726,14 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 
 	res = ip6_frags_ns_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+		inet_frags_exit_net(&net->ipv6.frags);
 	return res;
 }
 
 static void __net_exit ipv6_frags_exit_net(struct net *net)
 {
 	ip6_frags_ns_sysctl_unregister(net);
-	inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+	inet_frags_exit_net(&net->ipv6.frags);
 }
 
 static struct pernet_operations ip6_frags_ops = {
-- 
2.18.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox