From mboxrd@z Thu Jan 1 00:00:00 1970 From: Eyal Birger Subject: xfrm, ip tunnel: non released device reference upon device unregistration Date: Sun, 4 Feb 2018 13:21:18 +0200 Message-ID: <20180204132118.6fef9bf0@jimi> Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: quoted-printable Cc: netdev@vger.kernel.org, herbert@gondor.apana.org.au, davem@davemloft.net, shmulik@metanetworks.com To: steffen.klassert@secunet.com Return-path: Received: from mail-wm0-f65.google.com ([74.125.82.65]:34619 "EHLO mail-wm0-f65.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750818AbeBDLV1 (ORCPT ); Sun, 4 Feb 2018 06:21:27 -0500 Received: by mail-wm0-f65.google.com with SMTP id j21-v6so9131634wmh.1 for ; Sun, 04 Feb 2018 03:21:26 -0800 (PST) Sender: netdev-owner@vger.kernel.org List-ID: Hi, We've encountered a non released device reference upon device unregistration which seems to stem from xfrm policy code. The setup includes: - an underlay device (e.g. eth0) using IPv4 - an xfrm IPv6 over IPv4 tunnel routed via the underlay device - an ipip6 tunnel over the xfrm IPv6 tunnel When tearing down the underlay device, after traffic had passed via the ipi= p6 tunnel, log messages of the following form are observed: unregister_netdevice: waiting for eth0 to become free. Usage count =3D 2 The below synthetic script reproduces this consistently on a fresh ubuntu vm running net-next v4.15-6066-ge9522a5: --------------------------------------------------------- #!/bin/bash ipsec_underlay_dst=3D192.168.6.1 ipsec_underlay_src=3D192.168.5.2 ipv6_pfx=3D1234 local_ipv6_addr=3D"$ipv6_pfx::1" remote_ipv6_addr=3D"$ipv6_pfx::2" # create dummy ipsec underlay ip l add dev dummy1 type dummy ip l set dev dummy1 up ip r add "$ipsec_underlay_dst/32" dev dummy1 ip -6 r add "$ipv6_pfx::/16" dev dummy1 ip a add dev dummy1 "$local_ipv6_addr/128" ip a add dev dummy1 "$ipsec_underlay_src/24" # add xfrm policy and state ip x p add src "$local_ipv6_addr/128" dst "$ipv6_pfx::/16" dir out tmpl src= "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp reqid 1 mode tun= nel ip x s add src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp sp= i 0xcd440ce6 reqid 1 mode tunnel auth-trunc 'hmac(sha1)' 0x34a546d309031628= 962b814ef073aff1a638ad21 96 enc 'cbc(aes)' 0xf31e14149c328297fe7925ad744842= 0e encap espinudp 4500 4500 0.0.0.0 # add 4o6 tunnel ip l add tnl46 type ip6tnl mode ipip6 local "$local_ipv6_addr" remote "$rem= ote_ipv6_addr" ip l set dev tnl46 up ip r add 10.64.0.0/10 dev tnl46=20 # pass traffic so route is cached ping -w 1 -c 1 10.64.0.1 # remove dummy underlay ip l del dummy1 --------------------------------------------------------- Analysis: ip6_tunnel holds a dst_cache which caches its underlay dst objects. When devices are unregistered, non-xfrm dst objects are invlidated by their original creators (ipv4/ipv6/...) and thus are wiped from dst_cache. xfrm created routes otoh are not tracked by xfrm, and are not invalidated u= pon device unregistration, thus hold the device upon unregistration. The following rough sketch patch illustrates an approach overcoming this issue: --------------------------------------------------------- =46rom e188dc5295e3500bc59e8780049840afa2eb3e24 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sun, 4 Feb 2018 13:08:02 +0200 Subject: [PATCH] net: xfrm_policy: invalidate xfrm_dsts on device unregistration --- include/net/xfrm.h | 10 ++----- net/xfrm/xfrm_policy.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++= ++++ 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 7d20776..c1c8493 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -994,6 +994,8 @@ struct xfrm_dst { u32 child_mtu_cached; u32 route_cookie; u32 path_cookie; + struct list_head xfrm_dst_gc; + struct xfrm_dst_list *xfrm_dst_gc_list; }; =20 static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst) @@ -1025,13 +1027,7 @@ static inline void xfrm_dst_set_child(struct xfrm_ds= t *xdst, struct dst_entry *c xdst->child =3D child; } =20 -static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) -{ - xfrm_pols_put(xdst->pols, xdst->num_pols); - dst_release(xdst->route); - if (likely(xdst->u.dst.xfrm)) - xfrm_state_put(xdst->u.dst.xfrm); -} +void xfrm_dst_destroy(struct xfrm_dst *xdst); #endif =20 void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7a23078..d446641 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -94,6 +94,58 @@ __xfrm6_selector_match(const struct xfrm_selector *sel, = const struct flowi *fl) (fl6->flowi6_oif =3D=3D sel->ifindex || !sel->ifindex); } =20 +struct xfrm_dst_list { + spinlock_t lock; + struct list_head head; +}; + +static DEFINE_PER_CPU_ALIGNED(struct xfrm_dst_list, xfrm_dst_gc_list); + +static void xfrm_add_dst_list(struct xfrm_dst *xdst) +{ + struct xfrm_dst_list *xl =3D raw_cpu_ptr(&xfrm_dst_gc_list); + + xdst->xfrm_dst_gc_list =3D xl; + + spin_lock_bh(&xl->lock); + list_add_tail(&xdst->xfrm_dst_gc, &xl->head); + spin_unlock_bh(&xl->lock); +} + +void xfrm_dst_destroy(struct xfrm_dst *xdst) +{ + xfrm_pols_put(xdst->pols, xdst->num_pols); + dst_release(xdst->route); + if (likely(xdst->u.dst.xfrm)) + xfrm_state_put(xdst->u.dst.xfrm); + if (!list_empty(&xdst->xfrm_dst_gc)) { + struct xfrm_dst_list *xl =3D xdst->xfrm_dst_gc_list; + + spin_lock_bh(&xl->lock); + list_del(&xdst->xfrm_dst_gc); + spin_unlock_bh(&xl->lock); + } +} +EXPORT_SYMBOL_GPL(xfrm_dst_destroy); + +void xfrm_flush_dev(struct net_device *dev) +{ + struct xfrm_dst *xdst; + int cpu; + + for_each_possible_cpu(cpu) { + struct xfrm_dst_list *xl =3D &per_cpu(xfrm_dst_gc_list, cpu); + + spin_lock_bh(&xl->lock); + list_for_each_entry(xdst, &xl->head, xfrm_dst_gc) { + if (xdst->u.dst.dev !=3D dev) + continue; + dst_dev_put(&xdst->u.dst); + } + spin_unlock_bh(&xl->lock); + } +} + bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flo= wi *fl, unsigned short family) { @@ -1581,6 +1633,8 @@ static struct dst_entry *xfrm_bundle_create(struct xf= rm_policy *policy, } =20 bundle[i] =3D xdst; + xfrm_add_dst_list(xdst); + if (!xdst_prev) xdst0 =3D xdst; else @@ -2984,8 +3038,22 @@ static struct pernet_operations __net_initdata xfrm_= net_ops =3D { .exit =3D xfrm_net_exit, }; =20 +static int xfrm_netdev_event(struct notifier_block *this, unsigned long ev= ent, void *ptr) +{ + struct net_device *dev =3D netdev_notifier_info_to_dev(ptr); + + if (event =3D=3D NETDEV_UNREGISTER) + xfrm_flush_dev(dev); + return NOTIFY_DONE; +} + +static struct notifier_block xfrm_netdev_notifier =3D { + .notifier_call =3D xfrm_netdev_event, +}; + void __init xfrm_init(void) { + int cpu; int i; =20 xfrm_pcpu_work =3D kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work), @@ -2998,6 +3066,13 @@ void __init xfrm_init(void) register_pernet_subsys(&xfrm_net_ops); seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); + for_each_possible_cpu(cpu) { + struct xfrm_dst_list *xl =3D &per_cpu(xfrm_dst_gc_list, cpu); + + INIT_LIST_HEAD(&xl->head); + spin_lock_init(&xl->lock); + } + register_netdevice_notifier(&xfrm_netdev_notifier); } =20 #ifdef CONFIG_AUDITSYSCALL --=20 2.7.4 --------------------------------------------------------- This approach has the unfortunate side effects of adding a spin lock for the tracked list, as well as increasing struct xfrm_dst. Any thoughts on how to best approach this within xfrm would be most welcome= d. Thanks, Eyal.