[PATCH] sctp: fix ASCONF list handling

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] sctp: fix ASCONF list handling
@ 2015-05-28  0:52 mleitner
  2015-05-28 10:15 ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: mleitner @ 2015-05-28  0:52 UTC (permalink / raw)
  To: netdev, linux-sctp
  Cc: Daniel Borkmann, Neil Horman, Vlad Yasevich, Michio Honda

From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

->auto_asconf_splist is per namespace and mangled by functions like
sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.

Also, the call to inet_sk_copy_descendant() was backuping
->auto_asconf_list through the copy but was not honoring
->do_auto_asconf, which could lead to list corruption if it was
different between both sockets.

This commit thus fixes the list handling by adding a spinlock to protect
against multiple writers and converts the list to be protected by RCU
too, so that we don't have a lock inverstion issue at
sctp_addr_wq_timeout_handler().

And as this list now uses RCU, we cannot do such backup and restore
while copying descendant data anymore as readers may be traversing the
list meanwhile. We fix this by simply ignoring/not copying those fields,
placed at the end of struct sctp_sock, so we can just ignore it together
with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
so we don't clutter inet_sk_copy_descendant() with SCTP info.

Issue was found with a test application that kept flipping sysctl
default_auto_asconf on and off.

Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 include/net/netns/sctp.h   |  6 +++++-
 include/net/sctp/structs.h |  2 ++
 net/sctp/protocol.c        |  6 +++++-
 net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
 4 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -30,12 +30,15 @@ struct netns_sctp {
 	struct list_head local_addr_list;
 	struct list_head addr_waitq;
 	struct timer_list addr_wq_timer;
-	struct list_head auto_asconf_splist;
+	struct list_head __rcu auto_asconf_splist;
 	spinlock_t addr_wq_lock;
 
 	/* Lock that protects the local_addr_list writers */
 	spinlock_t local_addr_lock;
 
+	/* Lock that protects the auto_asconf_splist writers */
+	spinlock_t auto_asconf_lock;
+
 	/* RFC2960 Section 14. Suggested SCTP Protocol Parameter Values
 	 *
 	 * The following protocol parameters are RECOMMENDED:
@@ -129,6 +132,7 @@ struct netns_sctp {
 
 	/* Threshold for autoclose timeout, in seconds. */
 	unsigned long max_autoclose;
+
 };
 
 #endif /* __NETNS_SCTP_H__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2bb2fcf5b11f0387c81b860ad2d3a6607da19a7d..459d9b8193cb9193ee038a09b67d2b67fbf335f0 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -223,6 +223,8 @@ struct sctp_sock {
 	atomic_t pd_mode;
 	/* Receive to here while partial delivery is in effect. */
 	struct sk_buff_head pd_lobby;
+	/* These must be the last fields, as they will skipped on copies,
+	 * like on accept and peeloff operations */
 	struct list_head auto_asconf_list;
 	int do_auto_asconf;
 };
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 53b7acde9aa37bf3d4029c459421564d5270f4c0..6947feae5b27ffbc61666f27fda384d346b982ba 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -632,7 +632,9 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
 			}
 		}
 #endif
-		list_for_each_entry(sp, &net->sctp.auto_asconf_splist, auto_asconf_list) {
+		rcu_read_lock();
+		list_for_each_entry_rcu(sp, &net->sctp.auto_asconf_splist,
+					auto_asconf_list) {
 			struct sock *sk;
 
 			sk = sctp_opt2sk(sp);
@@ -644,6 +646,7 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
 				pr_debug("%s: sctp_asconf_mgmt failed\n", __func__);
 			bh_unlock_sock(sk);
 		}
+		rcu_read_unlock();
 #if IS_ENABLED(CONFIG_IPV6)
 free_next:
 #endif
@@ -1268,6 +1271,7 @@ static int __net_init sctp_net_init(struct net *net)
 	/* Initialize the local address list. */
 	INIT_LIST_HEAD(&net->sctp.local_addr_list);
 	spin_lock_init(&net->sctp.local_addr_lock);
+	spin_lock_init(&net->sctp.auto_asconf_lock);
 	sctp_get_local_addr_list(net);
 
 	/* Initialize the address event list */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f09de7fac2e6acddad8b2e046dbf626e329cb674..acc0fb0deed0e801bbb8460af0a965aedfdd0348 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3580,14 +3580,16 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 	if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf))
 		return 0;
 
+	spin_lock(&sock_net(sk)->sctp.auto_asconf_lock);
 	if (val == 0 && sp->do_auto_asconf) {
-		list_del(&sp->auto_asconf_list);
 		sp->do_auto_asconf = 0;
+		list_del_rcu(&sp->auto_asconf_list);
 	} else if (val && !sp->do_auto_asconf) {
-		list_add_tail(&sp->auto_asconf_list,
-		    &sock_net(sk)->sctp.auto_asconf_splist);
 		sp->do_auto_asconf = 1;
+		list_add_tail_rcu(&sp->auto_asconf_list,
+				  &sock_net(sk)->sctp.auto_asconf_splist);
 	}
+	spin_unlock(&sock_net(sk)->sctp.auto_asconf_lock);
 	return 0;
 }
 
@@ -4122,9 +4124,11 @@ static int sctp_init_sock(struct sock *sk)
 	percpu_counter_inc(&sctp_sockets_allocated);
 	sock_prot_inuse_add(net, sk->sk_prot, 1);
 	if (net->sctp.default_auto_asconf) {
-		list_add_tail(&sp->auto_asconf_list,
-		    &net->sctp.auto_asconf_splist);
+		spin_lock(&net->sctp.auto_asconf_lock);
 		sp->do_auto_asconf = 1;
+		list_add_tail_rcu(&sp->auto_asconf_list,
+				  &net->sctp.auto_asconf_splist);
+		spin_unlock(&net->sctp.auto_asconf_lock);
 	} else
 		sp->do_auto_asconf = 0;
 	local_bh_enable();
@@ -4148,8 +4152,10 @@ static void sctp_destroy_sock(struct sock *sk)
 		return;
 
 	if (sp->do_auto_asconf) {
+		spin_lock(&sock_net(sk)->sctp.auto_asconf_lock);
 		sp->do_auto_asconf = 0;
-		list_del(&sp->auto_asconf_list);
+		list_del_rcu(&sp->auto_asconf_list);
+		spin_unlock(&sock_net(sk)->sctp.auto_asconf_lock);
 	}
 	sctp_endpoint_free(sp->ep);
 	local_bh_disable();
@@ -7195,6 +7201,19 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newinet->mc_list = NULL;
 }
 
+static inline void sctp_copy_descendant(struct sock *sk_to,
+					const struct sock *sk_from)
+{
+	int ancestor_size = sizeof(struct inet_sock) +
+			    sizeof(struct sctp_sock) -
+			    offsetof(struct sctp_sock, auto_asconf_list);
+
+	if (sk_from->sk_family == PF_INET6)
+		ancestor_size += sizeof(struct ipv6_pinfo);
+
+	__inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
+}
+
 /* Populate the fields of the newsk from the oldsk and migrate the assoc
  * and its messages to the newsk.
  */
@@ -7209,7 +7228,6 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	struct sk_buff *skb, *tmp;
 	struct sctp_ulpevent *event;
 	struct sctp_bind_hashbucket *head;
-	struct list_head tmplist;
 
 	/* Migrate socket buffer sizes and all the socket level options to the
 	 * new socket.
@@ -7217,12 +7235,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	newsk->sk_sndbuf = oldsk->sk_sndbuf;
 	newsk->sk_rcvbuf = oldsk->sk_rcvbuf;
 	/* Brute force copy old sctp opt. */
-	if (oldsp->do_auto_asconf) {
-		memcpy(&tmplist, &newsp->auto_asconf_list, sizeof(tmplist));
-		inet_sk_copy_descendant(newsk, oldsk);
-		memcpy(&newsp->auto_asconf_list, &tmplist, sizeof(tmplist));
-	} else
-		inet_sk_copy_descendant(newsk, oldsk);
+	sctp_copy_descendant(newsk, oldsk);
 
 	/* Restore the ep value that was overwritten with the above structure
 	 * copy.
-- 
2.4.1

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH] sctp: fix ASCONF list handling
  2015-05-28  0:52 [PATCH] sctp: fix ASCONF list handling mleitner
@ 2015-05-28 10:15 ` Neil Horman
  2015-05-28 11:17   ` Marcelo Ricardo Leitner
  0 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2015-05-28 10:15 UTC (permalink / raw)
  To: mleitner; +Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> 
> ->auto_asconf_splist is per namespace and mangled by functions like
> sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> 
> Also, the call to inet_sk_copy_descendant() was backuping
> ->auto_asconf_list through the copy but was not honoring
> ->do_auto_asconf, which could lead to list corruption if it was
> different between both sockets.
> 
> This commit thus fixes the list handling by adding a spinlock to protect
> against multiple writers and converts the list to be protected by RCU
> too, so that we don't have a lock inverstion issue at
> sctp_addr_wq_timeout_handler().
> 
> And as this list now uses RCU, we cannot do such backup and restore
> while copying descendant data anymore as readers may be traversing the
> list meanwhile. We fix this by simply ignoring/not copying those fields,
> placed at the end of struct sctp_sock, so we can just ignore it together
> with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> so we don't clutter inet_sk_copy_descendant() with SCTP info.
> 
> Issue was found with a test application that kept flipping sysctl
> default_auto_asconf on and off.
> 
> Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> ---
>  include/net/netns/sctp.h   |  6 +++++-
>  include/net/sctp/structs.h |  2 ++
>  net/sctp/protocol.c        |  6 +++++-
>  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
>  4 files changed, 38 insertions(+), 15 deletions(-)
> 
> diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> --- a/include/net/netns/sctp.h
> +++ b/include/net/netns/sctp.h
> @@ -30,12 +30,15 @@ struct netns_sctp {
>  	struct list_head local_addr_list;
>  	struct list_head addr_waitq;
>  	struct timer_list addr_wq_timer;
> -	struct list_head auto_asconf_splist;
> +	struct list_head __rcu auto_asconf_splist;
You should use the addr_wq_lock here instead of creating a new lock, as thats
already used to protect most accesses to the list you are concerned about.
Though truthfully, that shouldn't be necessecary.  The list in question is only
read in one location and only written in one location.  You can likely just
rcu-ify, as the write side is in process context and protected by lock_sock.

Neil

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] sctp: fix ASCONF list handling
  2015-05-28 10:15 ` Neil Horman
@ 2015-05-28 11:17   ` Marcelo Ricardo Leitner
  2015-05-28 13:27     ` Marcelo Ricardo Leitner
  0 siblings, 1 reply; 34+ messages in thread
From: Marcelo Ricardo Leitner @ 2015-05-28 11:17 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > 
> > ->auto_asconf_splist is per namespace and mangled by functions like
> > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > 
> > Also, the call to inet_sk_copy_descendant() was backuping
> > ->auto_asconf_list through the copy but was not honoring
> > ->do_auto_asconf, which could lead to list corruption if it was
> > different between both sockets.
> > 
> > This commit thus fixes the list handling by adding a spinlock to protect
> > against multiple writers and converts the list to be protected by RCU
> > too, so that we don't have a lock inverstion issue at
> > sctp_addr_wq_timeout_handler().
> > 
> > And as this list now uses RCU, we cannot do such backup and restore
> > while copying descendant data anymore as readers may be traversing the
> > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > placed at the end of struct sctp_sock, so we can just ignore it together
> > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > 
> > Issue was found with a test application that kept flipping sysctl
> > default_auto_asconf on and off.
> > 
> > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > ---
> >  include/net/netns/sctp.h   |  6 +++++-
> >  include/net/sctp/structs.h |  2 ++
> >  net/sctp/protocol.c        |  6 +++++-
> >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> >  4 files changed, 38 insertions(+), 15 deletions(-)
> > 
> > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > --- a/include/net/netns/sctp.h
> > +++ b/include/net/netns/sctp.h
> > @@ -30,12 +30,15 @@ struct netns_sctp {
> >  	struct list_head local_addr_list;
> >  	struct list_head addr_waitq;
> >  	struct timer_list addr_wq_timer;
> > -	struct list_head auto_asconf_splist;
> > +	struct list_head __rcu auto_asconf_splist;
> You should use the addr_wq_lock here instead of creating a new lock, as thats
> already used to protect most accesses to the list you are concerned about.

Ok, that works too.

> Though truthfully, that shouldn't be necessecary.  The list in question is only
> read in one location and only written in one location.  You can likely just
> rcu-ify, as the write side is in process context and protected by lock_sock.

It should, it's not protected by lock_sock as this list resides in
netns_sctp structure, which lock_sock doesn't cover. Write side is in
process context yes, but this list is written in sctp_init_sock(),
sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
trigger this by either creating/destroying sockets if
default_auto_asconf=1 or just by creating a bunch of sockets and
flipping asconf via setsockopt (or a combination of these operations).
(I'll point this out in the changelog)

Btw I have two nits on the patch kindly broght to my attention already,
on adding blank newline and bad comment block, will fix it in v2.

  Marcelo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] sctp: fix ASCONF list handling
  2015-05-28 11:17   ` Marcelo Ricardo Leitner
@ 2015-05-28 13:27     ` Marcelo Ricardo Leitner
  2015-05-28 14:31       ` Neil Horman
  2015-05-28 14:46       ` Marcelo Ricardo Leitner
  0 siblings, 2 replies; 34+ messages in thread
From: Marcelo Ricardo Leitner @ 2015-05-28 13:27 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > 
> > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > 
> > > Also, the call to inet_sk_copy_descendant() was backuping
> > > ->auto_asconf_list through the copy but was not honoring
> > > ->do_auto_asconf, which could lead to list corruption if it was
> > > different between both sockets.
> > > 
> > > This commit thus fixes the list handling by adding a spinlock to protect
> > > against multiple writers and converts the list to be protected by RCU
> > > too, so that we don't have a lock inverstion issue at
> > > sctp_addr_wq_timeout_handler().
> > > 
> > > And as this list now uses RCU, we cannot do such backup and restore
> > > while copying descendant data anymore as readers may be traversing the
> > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > 
> > > Issue was found with a test application that kept flipping sysctl
> > > default_auto_asconf on and off.
> > > 
> > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > ---
> > >  include/net/netns/sctp.h   |  6 +++++-
> > >  include/net/sctp/structs.h |  2 ++
> > >  net/sctp/protocol.c        |  6 +++++-
> > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > 
> > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > --- a/include/net/netns/sctp.h
> > > +++ b/include/net/netns/sctp.h
> > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > >  	struct list_head local_addr_list;
> > >  	struct list_head addr_waitq;
> > >  	struct timer_list addr_wq_timer;
> > > -	struct list_head auto_asconf_splist;
> > > +	struct list_head __rcu auto_asconf_splist;
> > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > already used to protect most accesses to the list you are concerned about.
> 
> Ok, that works too.
> 
> > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > read in one location and only written in one location.  You can likely just
> > rcu-ify, as the write side is in process context and protected by lock_sock.
> 
> It should, it's not protected by lock_sock as this list resides in
> netns_sctp structure, which lock_sock doesn't cover. Write side is in
> process context yes, but this list is written in sctp_init_sock(),
> sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> trigger this by either creating/destroying sockets if
> default_auto_asconf=1 or just by creating a bunch of sockets and
> flipping asconf via setsockopt (or a combination of these operations).
> (I'll point this out in the changelog)

Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
reader is inside that lock too, so I can just protect auto_asconf_splist
writers with addr_wq_lock.

Nice, thanks Neil.

  Marcelo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] sctp: fix ASCONF list handling
  2015-05-28 13:27     ` Marcelo Ricardo Leitner
@ 2015-05-28 14:31       ` Neil Horman
  2015-05-28 14:46       ` Marcelo Ricardo Leitner
  1 sibling, 0 replies; 34+ messages in thread
From: Neil Horman @ 2015-05-28 14:31 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Thu, May 28, 2015 at 10:27:32AM -0300, Marcelo Ricardo Leitner wrote:
> On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> > On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > 
> > > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > > 
> > > > Also, the call to inet_sk_copy_descendant() was backuping
> > > > ->auto_asconf_list through the copy but was not honoring
> > > > ->do_auto_asconf, which could lead to list corruption if it was
> > > > different between both sockets.
> > > > 
> > > > This commit thus fixes the list handling by adding a spinlock to protect
> > > > against multiple writers and converts the list to be protected by RCU
> > > > too, so that we don't have a lock inverstion issue at
> > > > sctp_addr_wq_timeout_handler().
> > > > 
> > > > And as this list now uses RCU, we cannot do such backup and restore
> > > > while copying descendant data anymore as readers may be traversing the
> > > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > > 
> > > > Issue was found with a test application that kept flipping sysctl
> > > > default_auto_asconf on and off.
> > > > 
> > > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > ---
> > > >  include/net/netns/sctp.h   |  6 +++++-
> > > >  include/net/sctp/structs.h |  2 ++
> > > >  net/sctp/protocol.c        |  6 +++++-
> > > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > > 
> > > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > > --- a/include/net/netns/sctp.h
> > > > +++ b/include/net/netns/sctp.h
> > > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > > >  	struct list_head local_addr_list;
> > > >  	struct list_head addr_waitq;
> > > >  	struct timer_list addr_wq_timer;
> > > > -	struct list_head auto_asconf_splist;
> > > > +	struct list_head __rcu auto_asconf_splist;
> > > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > > already used to protect most accesses to the list you are concerned about.
> > 
> > Ok, that works too.
> > 
> > > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > > read in one location and only written in one location.  You can likely just
> > > rcu-ify, as the write side is in process context and protected by lock_sock.
> > 
> > It should, it's not protected by lock_sock as this list resides in
> > netns_sctp structure, which lock_sock doesn't cover. Write side is in
> > process context yes, but this list is written in sctp_init_sock(),
> > sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> > trigger this by either creating/destroying sockets if
> > default_auto_asconf=1 or just by creating a bunch of sockets and
> > flipping asconf via setsockopt (or a combination of these operations).
> > (I'll point this out in the changelog)
> 
> Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
> reader is inside that lock too, so I can just protect auto_asconf_splist
> writers with addr_wq_lock.
> 
> Nice, thanks Neil.
> 
Yup, thanks!
Neil

>   Marcelo
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] sctp: fix ASCONF list handling
  2015-05-28 13:27     ` Marcelo Ricardo Leitner
  2015-05-28 14:31       ` Neil Horman
@ 2015-05-28 14:46       ` Marcelo Ricardo Leitner
  2015-05-29 13:17         ` Neil Horman
  1 sibling, 1 reply; 34+ messages in thread
From: Marcelo Ricardo Leitner @ 2015-05-28 14:46 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Thu, May 28, 2015 at 10:27:32AM -0300, Marcelo Ricardo Leitner wrote:
> On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> > On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > 
> > > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > > 
> > > > Also, the call to inet_sk_copy_descendant() was backuping
> > > > ->auto_asconf_list through the copy but was not honoring
> > > > ->do_auto_asconf, which could lead to list corruption if it was
> > > > different between both sockets.
> > > > 
> > > > This commit thus fixes the list handling by adding a spinlock to protect
> > > > against multiple writers and converts the list to be protected by RCU
> > > > too, so that we don't have a lock inverstion issue at
> > > > sctp_addr_wq_timeout_handler().
> > > > 
> > > > And as this list now uses RCU, we cannot do such backup and restore
> > > > while copying descendant data anymore as readers may be traversing the
> > > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > > 
> > > > Issue was found with a test application that kept flipping sysctl
> > > > default_auto_asconf on and off.
> > > > 
> > > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > ---
> > > >  include/net/netns/sctp.h   |  6 +++++-
> > > >  include/net/sctp/structs.h |  2 ++
> > > >  net/sctp/protocol.c        |  6 +++++-
> > > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > > 
> > > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > > --- a/include/net/netns/sctp.h
> > > > +++ b/include/net/netns/sctp.h
> > > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > > >  	struct list_head local_addr_list;
> > > >  	struct list_head addr_waitq;
> > > >  	struct timer_list addr_wq_timer;
> > > > -	struct list_head auto_asconf_splist;
> > > > +	struct list_head __rcu auto_asconf_splist;
> > > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > > already used to protect most accesses to the list you are concerned about.
> > 
> > Ok, that works too.
> > 
> > > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > > read in one location and only written in one location.  You can likely just
> > > rcu-ify, as the write side is in process context and protected by lock_sock.
> > 
> > It should, it's not protected by lock_sock as this list resides in
> > netns_sctp structure, which lock_sock doesn't cover. Write side is in
> > process context yes, but this list is written in sctp_init_sock(),
> > sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> > trigger this by either creating/destroying sockets if
> > default_auto_asconf=1 or just by creating a bunch of sockets and
> > flipping asconf via setsockopt (or a combination of these operations).
> > (I'll point this out in the changelog)
> 
> Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
> reader is inside that lock too, so I can just protect auto_asconf_splist
> writers with addr_wq_lock.
> 
> Nice, thanks Neil.

Cannot really do that.. as that creates a lock inversion between
sctp_destroy_sock() (which already holds lock_sock) and
sctp_addr_wq_timeout_handler(), which first grabs addr_wq_lock and then
locks socket by socket.

Due to that, I'm afraid reusing this lock is not possible, and we should
stick with the patch.. what do you think? (though I have to fix the nits
in there)

  Marcelo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] sctp: fix ASCONF list handling
  2015-05-28 14:46       ` Marcelo Ricardo Leitner
@ 2015-05-29 13:17         ` Neil Horman
  2015-05-29 16:50           ` Marcelo Ricardo Leitner
  0 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2015-05-29 13:17 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Thu, May 28, 2015 at 11:46:29AM -0300, Marcelo Ricardo Leitner wrote:
> On Thu, May 28, 2015 at 10:27:32AM -0300, Marcelo Ricardo Leitner wrote:
> > On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> > > On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > > > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > 
> > > > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > > > 
> > > > > Also, the call to inet_sk_copy_descendant() was backuping
> > > > > ->auto_asconf_list through the copy but was not honoring
> > > > > ->do_auto_asconf, which could lead to list corruption if it was
> > > > > different between both sockets.
> > > > > 
> > > > > This commit thus fixes the list handling by adding a spinlock to protect
> > > > > against multiple writers and converts the list to be protected by RCU
> > > > > too, so that we don't have a lock inverstion issue at
> > > > > sctp_addr_wq_timeout_handler().
> > > > > 
> > > > > And as this list now uses RCU, we cannot do such backup and restore
> > > > > while copying descendant data anymore as readers may be traversing the
> > > > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > > > 
> > > > > Issue was found with a test application that kept flipping sysctl
> > > > > default_auto_asconf on and off.
> > > > > 
> > > > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > ---
> > > > >  include/net/netns/sctp.h   |  6 +++++-
> > > > >  include/net/sctp/structs.h |  2 ++
> > > > >  net/sctp/protocol.c        |  6 +++++-
> > > > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > > > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > > > 
> > > > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > > > --- a/include/net/netns/sctp.h
> > > > > +++ b/include/net/netns/sctp.h
> > > > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > > > >  	struct list_head local_addr_list;
> > > > >  	struct list_head addr_waitq;
> > > > >  	struct timer_list addr_wq_timer;
> > > > > -	struct list_head auto_asconf_splist;
> > > > > +	struct list_head __rcu auto_asconf_splist;
> > > > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > > > already used to protect most accesses to the list you are concerned about.
> > > 
> > > Ok, that works too.
> > > 
> > > > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > > > read in one location and only written in one location.  You can likely just
> > > > rcu-ify, as the write side is in process context and protected by lock_sock.
> > > 
> > > It should, it's not protected by lock_sock as this list resides in
> > > netns_sctp structure, which lock_sock doesn't cover. Write side is in
> > > process context yes, but this list is written in sctp_init_sock(),
> > > sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> > > trigger this by either creating/destroying sockets if
> > > default_auto_asconf=1 or just by creating a bunch of sockets and
> > > flipping asconf via setsockopt (or a combination of these operations).
> > > (I'll point this out in the changelog)
> > 
> > Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
> > reader is inside that lock too, so I can just protect auto_asconf_splist
> > writers with addr_wq_lock.
> > 
> > Nice, thanks Neil.
> 
> Cannot really do that.. as that creates a lock inversion between
> sctp_destroy_sock() (which already holds lock_sock) and
> sctp_addr_wq_timeout_handler(), which first grabs addr_wq_lock and then
> locks socket by socket.
> 
> Due to that, I'm afraid reusing this lock is not possible, and we should
> stick with the patch.. what do you think? (though I have to fix the nits
> in there)
> 
I don't think thats accurate.  You are correct in that the the locks are taken
in opposing order, which would imply a lock inversion that could result in
deadlock, but we can avoid that by deferring the asconf list removal until after
sk_common_release and unlock_sock_bh is called in sctp_close.  That will make
the lock ordering consistent.  Alternatively, we can pre-emptively take the
asconf_lock in sctp_close before locking the socket.

I'd really rather avoid creating an additional lock here if we don't have to
Neil

>   Marcelo
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] sctp: fix ASCONF list handling
  2015-05-29 13:17         ` Neil Horman
@ 2015-05-29 16:50           ` Marcelo Ricardo Leitner
  2015-06-01 14:00             ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Marcelo Ricardo Leitner @ 2015-05-29 16:50 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Fri, May 29, 2015 at 09:17:26AM -0400, Neil Horman wrote:
> On Thu, May 28, 2015 at 11:46:29AM -0300, Marcelo Ricardo Leitner wrote:
> > On Thu, May 28, 2015 at 10:27:32AM -0300, Marcelo Ricardo Leitner wrote:
> > > On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> > > > On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > > > > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > >
> > > > > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > > > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > > > >
> > > > > > Also, the call to inet_sk_copy_descendant() was backuping
> > > > > > ->auto_asconf_list through the copy but was not honoring
> > > > > > ->do_auto_asconf, which could lead to list corruption if it was
> > > > > > different between both sockets.
> > > > > >
> > > > > > This commit thus fixes the list handling by adding a spinlock to protect
> > > > > > against multiple writers and converts the list to be protected by RCU
> > > > > > too, so that we don't have a lock inverstion issue at
> > > > > > sctp_addr_wq_timeout_handler().
> > > > > >
> > > > > > And as this list now uses RCU, we cannot do such backup and restore
> > > > > > while copying descendant data anymore as readers may be traversing the
> > > > > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > > > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > > > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > > > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > > > >
> > > > > > Issue was found with a test application that kept flipping sysctl
> > > > > > default_auto_asconf on and off.
> > > > > >
> > > > > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > > > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > ---
> > > > > >  include/net/netns/sctp.h   |  6 +++++-
> > > > > >  include/net/sctp/structs.h |  2 ++
> > > > > >  net/sctp/protocol.c        |  6 +++++-
> > > > > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > > > > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > > > >
> > > > > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > > > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > > > > --- a/include/net/netns/sctp.h
> > > > > > +++ b/include/net/netns/sctp.h
> > > > > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > > > > >  	struct list_head local_addr_list;
> > > > > >  	struct list_head addr_waitq;
> > > > > >  	struct timer_list addr_wq_timer;
> > > > > > -	struct list_head auto_asconf_splist;
> > > > > > +	struct list_head __rcu auto_asconf_splist;
> > > > > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > > > > already used to protect most accesses to the list you are concerned about.
> > > >
> > > > Ok, that works too.
> > > >
> > > > > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > > > > read in one location and only written in one location.  You can likely just
> > > > > rcu-ify, as the write side is in process context and protected by lock_sock.
> > > >
> > > > It should, it's not protected by lock_sock as this list resides in
> > > > netns_sctp structure, which lock_sock doesn't cover. Write side is in
> > > > process context yes, but this list is written in sctp_init_sock(),
> > > > sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> > > > trigger this by either creating/destroying sockets if
> > > > default_auto_asconf=1 or just by creating a bunch of sockets and
> > > > flipping asconf via setsockopt (or a combination of these operations).
> > > > (I'll point this out in the changelog)
> > >
> > > Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
> > > reader is inside that lock too, so I can just protect auto_asconf_splist
> > > writers with addr_wq_lock.
> > >
> > > Nice, thanks Neil.
> >
> > Cannot really do that.. as that creates a lock inversion between
> > sctp_destroy_sock() (which already holds lock_sock) and
> > sctp_addr_wq_timeout_handler(), which first grabs addr_wq_lock and then
> > locks socket by socket.
> >
> > Due to that, I'm afraid reusing this lock is not possible, and we should
> > stick with the patch.. what do you think? (though I have to fix the nits
> > in there)
> >
> I don't think thats accurate.  You are correct in that the the locks are taken
> in opposing order, which would imply a lock inversion that could result in
> deadlock, but we can avoid that by deferring the asconf list removal until after
> sk_common_release and unlock_sock_bh is called in sctp_close.  That will make
> the lock ordering consistent.  Alternatively, we can pre-emptively take the
> asconf_lock in sctp_close before locking the socket.

For your first approach, deferring the asconf list removal, we can only
do that reliably via some work queue, because we initialize asconf stuff
on sctp_init_sock() and it should be de-initialized on its counterpart,
sctp_destroy_sock(), as we have code like:

(same for ipv4)
sctp_v6_create_accept_sk()
{
...
        if (newsk->sk_prot->init(newsk)) {
                sk_common_release(newsk);
                newsk = NULL;
        }
...
}

and at inet6_create() too:
        if (sk->sk_prot->init) {
                err = sk->sk_prot->init(sk);
                if (err)
                        sk_common_release(sk);
        }

Or we (kind of) abuse of knowing that sctp_init_sock() cannot fail after
initializing asconf and move asconf stuff from sctp_destroy_sock() to
sctp_close(). AFAICT it could be enough, I'm just not a big fan of not
having that similarity.

If we try to lock addr_wq_lock early in sctp_close(),
sctp_destroy_sock() would be unprotected on above situations, but if we
know that sctp_init_sock() won't fail after initilizating asconf, it
wouldn't be a problem...

> I'd really rather avoid creating an additional lock here if we don't have to

I understand. I'm just not seeing another way out so far... I'll keep
trying, but please I'm all ears to ideas ;)

  Marcelo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] sctp: fix ASCONF list handling
  2015-05-29 16:50           ` Marcelo Ricardo Leitner
@ 2015-06-01 14:00             ` Neil Horman
  2015-06-01 22:28               ` Marcelo Ricardo Leitner
  0 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2015-06-01 14:00 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Fri, May 29, 2015 at 01:50:37PM -0300, Marcelo Ricardo Leitner wrote:
> On Fri, May 29, 2015 at 09:17:26AM -0400, Neil Horman wrote:
> > On Thu, May 28, 2015 at 11:46:29AM -0300, Marcelo Ricardo Leitner wrote:
> > > On Thu, May 28, 2015 at 10:27:32AM -0300, Marcelo Ricardo Leitner wrote:
> > > > On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > > > > > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > >
> > > > > > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > > > > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > > > > >
> > > > > > > Also, the call to inet_sk_copy_descendant() was backuping
> > > > > > > ->auto_asconf_list through the copy but was not honoring
> > > > > > > ->do_auto_asconf, which could lead to list corruption if it was
> > > > > > > different between both sockets.
> > > > > > >
> > > > > > > This commit thus fixes the list handling by adding a spinlock to protect
> > > > > > > against multiple writers and converts the list to be protected by RCU
> > > > > > > too, so that we don't have a lock inverstion issue at
> > > > > > > sctp_addr_wq_timeout_handler().
> > > > > > >
> > > > > > > And as this list now uses RCU, we cannot do such backup and restore
> > > > > > > while copying descendant data anymore as readers may be traversing the
> > > > > > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > > > > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > > > > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > > > > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > > > > >
> > > > > > > Issue was found with a test application that kept flipping sysctl
> > > > > > > default_auto_asconf on and off.
> > > > > > >
> > > > > > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > > > > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > > ---
> > > > > > >  include/net/netns/sctp.h   |  6 +++++-
> > > > > > >  include/net/sctp/structs.h |  2 ++
> > > > > > >  net/sctp/protocol.c        |  6 +++++-
> > > > > > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > > > > > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > > > > >
> > > > > > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > > > > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > > > > > --- a/include/net/netns/sctp.h
> > > > > > > +++ b/include/net/netns/sctp.h
> > > > > > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > > > > > >  	struct list_head local_addr_list;
> > > > > > >  	struct list_head addr_waitq;
> > > > > > >  	struct timer_list addr_wq_timer;
> > > > > > > -	struct list_head auto_asconf_splist;
> > > > > > > +	struct list_head __rcu auto_asconf_splist;
> > > > > > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > > > > > already used to protect most accesses to the list you are concerned about.
> > > > >
> > > > > Ok, that works too.
> > > > >
> > > > > > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > > > > > read in one location and only written in one location.  You can likely just
> > > > > > rcu-ify, as the write side is in process context and protected by lock_sock.
> > > > >
> > > > > It should, it's not protected by lock_sock as this list resides in
> > > > > netns_sctp structure, which lock_sock doesn't cover. Write side is in
> > > > > process context yes, but this list is written in sctp_init_sock(),
> > > > > sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> > > > > trigger this by either creating/destroying sockets if
> > > > > default_auto_asconf=1 or just by creating a bunch of sockets and
> > > > > flipping asconf via setsockopt (or a combination of these operations).
> > > > > (I'll point this out in the changelog)
> > > >
> > > > Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
> > > > reader is inside that lock too, so I can just protect auto_asconf_splist
> > > > writers with addr_wq_lock.
> > > >
> > > > Nice, thanks Neil.
> > >
> > > Cannot really do that.. as that creates a lock inversion between
> > > sctp_destroy_sock() (which already holds lock_sock) and
> > > sctp_addr_wq_timeout_handler(), which first grabs addr_wq_lock and then
> > > locks socket by socket.
> > >
> > > Due to that, I'm afraid reusing this lock is not possible, and we should
> > > stick with the patch.. what do you think? (though I have to fix the nits
> > > in there)
> > >
> > I don't think thats accurate.  You are correct in that the the locks are taken
> > in opposing order, which would imply a lock inversion that could result in
> > deadlock, but we can avoid that by deferring the asconf list removal until after
> > sk_common_release and unlock_sock_bh is called in sctp_close.  That will make
> > the lock ordering consistent.  Alternatively, we can pre-emptively take the
> > asconf_lock in sctp_close before locking the socket.
> 
> For your first approach, deferring the asconf list removal, we can only
> do that reliably via some work queue, because we initialize asconf stuff
> on sctp_init_sock() and it should be de-initialized on its counterpart,
> sctp_destroy_sock(), as we have code like:
> 
> (same for ipv4)
> sctp_v6_create_accept_sk()
> {
> ...
>         if (newsk->sk_prot->init(newsk)) {
>                 sk_common_release(newsk);
>                 newsk = NULL;
>         }
> ...
> }
> 
> and at inet6_create() too:
>         if (sk->sk_prot->init) {
>                 err = sk->sk_prot->init(sk);
>                 if (err)
>                         sk_common_release(sk);
>         }
> 
> Or we (kind of) abuse of knowing that sctp_init_sock() cannot fail after
> initializing asconf and move asconf stuff from sctp_destroy_sock() to
> sctp_close(). AFAICT it could be enough, I'm just not a big fan of not
> having that similarity.
> 
> If we try to lock addr_wq_lock early in sctp_close(),
> sctp_destroy_sock() would be unprotected on above situations, but if we
> know that sctp_init_sock() won't fail after initilizating asconf, it
> wouldn't be a problem...
> 
> > I'd really rather avoid creating an additional lock here if we don't have to
> 
> I understand. I'm just not seeing another way out so far... I'll keep
> trying, but please I'm all ears to ideas ;)
> 
>   Marcelo
rcu-ify auto_asconf_splist, then just use the addr_wq_lock to protect the list
modification sites, as those are all handled at locations that already hold a
socket lock.  sctp_addr_wq_timeout_handler is a read-only function for
auto_asconf_splist, and so the spin_lock_bh there can be replaced with an
rcu_read_lock_bh call, breaking the lock inversion
Neil

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] sctp: fix ASCONF list handling
  2015-06-01 14:00             ` Neil Horman
@ 2015-06-01 22:28               ` Marcelo Ricardo Leitner
  2015-06-02 13:48                 ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Marcelo Ricardo Leitner @ 2015-06-01 22:28 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Mon, Jun 01, 2015 at 10:00:28AM -0400, Neil Horman wrote:
> On Fri, May 29, 2015 at 01:50:37PM -0300, Marcelo Ricardo Leitner wrote:
> > On Fri, May 29, 2015 at 09:17:26AM -0400, Neil Horman wrote:
> > > On Thu, May 28, 2015 at 11:46:29AM -0300, Marcelo Ricardo Leitner wrote:
> > > > On Thu, May 28, 2015 at 10:27:32AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > > On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > > > > > > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > > > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > > >
> > > > > > > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > > > > > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > > > > > >
> > > > > > > > Also, the call to inet_sk_copy_descendant() was backuping
> > > > > > > > ->auto_asconf_list through the copy but was not honoring
> > > > > > > > ->do_auto_asconf, which could lead to list corruption if it was
> > > > > > > > different between both sockets.
> > > > > > > >
> > > > > > > > This commit thus fixes the list handling by adding a spinlock to protect
> > > > > > > > against multiple writers and converts the list to be protected by RCU
> > > > > > > > too, so that we don't have a lock inverstion issue at
> > > > > > > > sctp_addr_wq_timeout_handler().
> > > > > > > >
> > > > > > > > And as this list now uses RCU, we cannot do such backup and restore
> > > > > > > > while copying descendant data anymore as readers may be traversing the
> > > > > > > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > > > > > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > > > > > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > > > > > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > > > > > >
> > > > > > > > Issue was found with a test application that kept flipping sysctl
> > > > > > > > default_auto_asconf on and off.
> > > > > > > >
> > > > > > > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > > > > > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > > > ---
> > > > > > > >  include/net/netns/sctp.h   |  6 +++++-
> > > > > > > >  include/net/sctp/structs.h |  2 ++
> > > > > > > >  net/sctp/protocol.c        |  6 +++++-
> > > > > > > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > > > > > > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > > > > > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > > > > > > --- a/include/net/netns/sctp.h
> > > > > > > > +++ b/include/net/netns/sctp.h
> > > > > > > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > > > > > > >  	struct list_head local_addr_list;
> > > > > > > >  	struct list_head addr_waitq;
> > > > > > > >  	struct timer_list addr_wq_timer;
> > > > > > > > -	struct list_head auto_asconf_splist;
> > > > > > > > +	struct list_head __rcu auto_asconf_splist;
> > > > > > > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > > > > > > already used to protect most accesses to the list you are concerned about.
> > > > > >
> > > > > > Ok, that works too.
> > > > > >
> > > > > > > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > > > > > > read in one location and only written in one location.  You can likely just
> > > > > > > rcu-ify, as the write side is in process context and protected by lock_sock.
> > > > > >
> > > > > > It should, it's not protected by lock_sock as this list resides in
> > > > > > netns_sctp structure, which lock_sock doesn't cover. Write side is in
> > > > > > process context yes, but this list is written in sctp_init_sock(),
> > > > > > sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> > > > > > trigger this by either creating/destroying sockets if
> > > > > > default_auto_asconf=1 or just by creating a bunch of sockets and
> > > > > > flipping asconf via setsockopt (or a combination of these operations).
> > > > > > (I'll point this out in the changelog)
> > > > >
> > > > > Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
> > > > > reader is inside that lock too, so I can just protect auto_asconf_splist
> > > > > writers with addr_wq_lock.
> > > > >
> > > > > Nice, thanks Neil.
> > > >
> > > > Cannot really do that.. as that creates a lock inversion between
> > > > sctp_destroy_sock() (which already holds lock_sock) and
> > > > sctp_addr_wq_timeout_handler(), which first grabs addr_wq_lock and then
> > > > locks socket by socket.
> > > >
> > > > Due to that, I'm afraid reusing this lock is not possible, and we should
> > > > stick with the patch.. what do you think? (though I have to fix the nits
> > > > in there)
> > > >
> > > I don't think thats accurate.  You are correct in that the the locks are taken
> > > in opposing order, which would imply a lock inversion that could result in
> > > deadlock, but we can avoid that by deferring the asconf list removal until after
> > > sk_common_release and unlock_sock_bh is called in sctp_close.  That will make
> > > the lock ordering consistent.  Alternatively, we can pre-emptively take the
> > > asconf_lock in sctp_close before locking the socket.
> > 
> > For your first approach, deferring the asconf list removal, we can only
> > do that reliably via some work queue, because we initialize asconf stuff
> > on sctp_init_sock() and it should be de-initialized on its counterpart,
> > sctp_destroy_sock(), as we have code like:
> > 
> > (same for ipv4)
> > sctp_v6_create_accept_sk()
> > {
> > ...
> >         if (newsk->sk_prot->init(newsk)) {
> >                 sk_common_release(newsk);
> >                 newsk = NULL;
> >         }
> > ...
> > }
> > 
> > and at inet6_create() too:
> >         if (sk->sk_prot->init) {
> >                 err = sk->sk_prot->init(sk);
> >                 if (err)
> >                         sk_common_release(sk);
> >         }
> > 
> > Or we (kind of) abuse of knowing that sctp_init_sock() cannot fail after
> > initializing asconf and move asconf stuff from sctp_destroy_sock() to
> > sctp_close(). AFAICT it could be enough, I'm just not a big fan of not
> > having that similarity.
> > 
> > If we try to lock addr_wq_lock early in sctp_close(),
> > sctp_destroy_sock() would be unprotected on above situations, but if we
> > know that sctp_init_sock() won't fail after initilizating asconf, it
> > wouldn't be a problem...
> > 
> > > I'd really rather avoid creating an additional lock here if we don't have to
> > 
> > I understand. I'm just not seeing another way out so far... I'll keep
> > trying, but please I'm all ears to ideas ;)
> > 
> >   Marcelo
> rcu-ify auto_asconf_splist, then just use the addr_wq_lock to protect the list
> modification sites, as those are all handled at locations that already hold a
> socket lock.  sctp_addr_wq_timeout_handler is a read-only function for
> auto_asconf_splist, and so the spin_lock_bh there can be replaced with an
> rcu_read_lock_bh call, breaking the lock inversion
> Neil

That's still the lock inversion, because at
sctp_addr_wq_timeout_handler() we are not grabbing addr_wq_lock just for
this patch, it's already there and later inside that for_each it will
lock socket by socket; while on sctp_destroy_sock() , it is called with
socket lock already held and we would be trying to grab ->addr_wq_lock
in order to protect the list change.

We could rcu-ify addr_waitq too. Let sctp_addr_wq_timeout_handler() work
on the list protected by rcu_read_lock_bh() and only at the bottom of
that for() we grab addr_wq_lock and do the list change. Would require
more changes than just that in order to work properly (and avoid
double-frees through that timer and sctp_free_addr_wq() for example)

  Marcelo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] sctp: fix ASCONF list handling
  2015-06-01 22:28               ` Marcelo Ricardo Leitner
@ 2015-06-02 13:48                 ` Neil Horman
  2015-06-03 16:54                   ` [PATCH v2 1/2] sctp: rcu-ify addr_waitq mleitner
  2015-06-03 16:54                   ` [PATCH v2 " mleitner
  0 siblings, 2 replies; 34+ messages in thread
From: Neil Horman @ 2015-06-02 13:48 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Mon, Jun 01, 2015 at 07:28:21PM -0300, Marcelo Ricardo Leitner wrote:
> On Mon, Jun 01, 2015 at 10:00:28AM -0400, Neil Horman wrote:
> > On Fri, May 29, 2015 at 01:50:37PM -0300, Marcelo Ricardo Leitner wrote:
> > > On Fri, May 29, 2015 at 09:17:26AM -0400, Neil Horman wrote:
> > > > On Thu, May 28, 2015 at 11:46:29AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > On Thu, May 28, 2015 at 10:27:32AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > > On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > > > On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > > > > > > > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > > > > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > > > >
> > > > > > > > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > > > > > > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > > > > > > >
> > > > > > > > > Also, the call to inet_sk_copy_descendant() was backuping
> > > > > > > > > ->auto_asconf_list through the copy but was not honoring
> > > > > > > > > ->do_auto_asconf, which could lead to list corruption if it was
> > > > > > > > > different between both sockets.
> > > > > > > > >
> > > > > > > > > This commit thus fixes the list handling by adding a spinlock to protect
> > > > > > > > > against multiple writers and converts the list to be protected by RCU
> > > > > > > > > too, so that we don't have a lock inverstion issue at
> > > > > > > > > sctp_addr_wq_timeout_handler().
> > > > > > > > >
> > > > > > > > > And as this list now uses RCU, we cannot do such backup and restore
> > > > > > > > > while copying descendant data anymore as readers may be traversing the
> > > > > > > > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > > > > > > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > > > > > > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > > > > > > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > > > > > > >
> > > > > > > > > Issue was found with a test application that kept flipping sysctl
> > > > > > > > > default_auto_asconf on and off.
> > > > > > > > >
> > > > > > > > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > > > > > > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > > > > ---
> > > > > > > > >  include/net/netns/sctp.h   |  6 +++++-
> > > > > > > > >  include/net/sctp/structs.h |  2 ++
> > > > > > > > >  net/sctp/protocol.c        |  6 +++++-
> > > > > > > > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > > > > > > > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > > > > > > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > > > > > > > --- a/include/net/netns/sctp.h
> > > > > > > > > +++ b/include/net/netns/sctp.h
> > > > > > > > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > > > > > > > >  	struct list_head local_addr_list;
> > > > > > > > >  	struct list_head addr_waitq;
> > > > > > > > >  	struct timer_list addr_wq_timer;
> > > > > > > > > -	struct list_head auto_asconf_splist;
> > > > > > > > > +	struct list_head __rcu auto_asconf_splist;
> > > > > > > > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > > > > > > > already used to protect most accesses to the list you are concerned about.
> > > > > > >
> > > > > > > Ok, that works too.
> > > > > > >
> > > > > > > > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > > > > > > > read in one location and only written in one location.  You can likely just
> > > > > > > > rcu-ify, as the write side is in process context and protected by lock_sock.
> > > > > > >
> > > > > > > It should, it's not protected by lock_sock as this list resides in
> > > > > > > netns_sctp structure, which lock_sock doesn't cover. Write side is in
> > > > > > > process context yes, but this list is written in sctp_init_sock(),
> > > > > > > sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> > > > > > > trigger this by either creating/destroying sockets if
> > > > > > > default_auto_asconf=1 or just by creating a bunch of sockets and
> > > > > > > flipping asconf via setsockopt (or a combination of these operations).
> > > > > > > (I'll point this out in the changelog)
> > > > > >
> > > > > > Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
> > > > > > reader is inside that lock too, so I can just protect auto_asconf_splist
> > > > > > writers with addr_wq_lock.
> > > > > >
> > > > > > Nice, thanks Neil.
> > > > >
> > > > > Cannot really do that.. as that creates a lock inversion between
> > > > > sctp_destroy_sock() (which already holds lock_sock) and
> > > > > sctp_addr_wq_timeout_handler(), which first grabs addr_wq_lock and then
> > > > > locks socket by socket.
> > > > >
> > > > > Due to that, I'm afraid reusing this lock is not possible, and we should
> > > > > stick with the patch.. what do you think? (though I have to fix the nits
> > > > > in there)
> > > > >
> > > > I don't think thats accurate.  You are correct in that the the locks are taken
> > > > in opposing order, which would imply a lock inversion that could result in
> > > > deadlock, but we can avoid that by deferring the asconf list removal until after
> > > > sk_common_release and unlock_sock_bh is called in sctp_close.  That will make
> > > > the lock ordering consistent.  Alternatively, we can pre-emptively take the
> > > > asconf_lock in sctp_close before locking the socket.
> > > 
> > > For your first approach, deferring the asconf list removal, we can only
> > > do that reliably via some work queue, because we initialize asconf stuff
> > > on sctp_init_sock() and it should be de-initialized on its counterpart,
> > > sctp_destroy_sock(), as we have code like:
> > > 
> > > (same for ipv4)
> > > sctp_v6_create_accept_sk()
> > > {
> > > ...
> > >         if (newsk->sk_prot->init(newsk)) {
> > >                 sk_common_release(newsk);
> > >                 newsk = NULL;
> > >         }
> > > ...
> > > }
> > > 
> > > and at inet6_create() too:
> > >         if (sk->sk_prot->init) {
> > >                 err = sk->sk_prot->init(sk);
> > >                 if (err)
> > >                         sk_common_release(sk);
> > >         }
> > > 
> > > Or we (kind of) abuse of knowing that sctp_init_sock() cannot fail after
> > > initializing asconf and move asconf stuff from sctp_destroy_sock() to
> > > sctp_close(). AFAICT it could be enough, I'm just not a big fan of not
> > > having that similarity.
> > > 
> > > If we try to lock addr_wq_lock early in sctp_close(),
> > > sctp_destroy_sock() would be unprotected on above situations, but if we
> > > know that sctp_init_sock() won't fail after initilizating asconf, it
> > > wouldn't be a problem...
> > > 
> > > > I'd really rather avoid creating an additional lock here if we don't have to
> > > 
> > > I understand. I'm just not seeing another way out so far... I'll keep
> > > trying, but please I'm all ears to ideas ;)
> > > 
> > >   Marcelo
> > rcu-ify auto_asconf_splist, then just use the addr_wq_lock to protect the list
> > modification sites, as those are all handled at locations that already hold a
> > socket lock.  sctp_addr_wq_timeout_handler is a read-only function for
> > auto_asconf_splist, and so the spin_lock_bh there can be replaced with an
> > rcu_read_lock_bh call, breaking the lock inversion
> > Neil
> 
> That's still the lock inversion, because at
> sctp_addr_wq_timeout_handler() we are not grabbing addr_wq_lock just for
> this patch, it's already there and later inside that for_each it will
> lock socket by socket; while on sctp_destroy_sock() , it is called with
> socket lock already held and we would be trying to grab ->addr_wq_lock
> in order to protect the list change.
> 
I get this, thats why I suggested rcu-ifying the list, because those loops are
read-only sites, you can replace the spinlock with an rcu lock.

> We could rcu-ify addr_waitq too. Let sctp_addr_wq_timeout_handler() work
> on the list protected by rcu_read_lock_bh() and only at the bottom of
> that for() we grab addr_wq_lock and do the list change. Would require
> more changes than just that in order to work properly (and avoid
> double-frees through that timer and sctp_free_addr_wq() for example)
> 
Ah, sorry, yes I missed the list_del at the bottom.  You're correct though, if
you rcu-ify both lists, you can just lock the addr_wq_lock there in the proper
order and do the list modification under protection.  Double free protection
shouldn't be hard there, just add another list iterator that searches for an
element who's pointer is the same as the element you want to remove.  Only one
context should ever find it if its being removed.

Regards
Neil

>   Marcelo
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v2 1/2] sctp: rcu-ify addr_waitq
  2015-06-02 13:48                 ` Neil Horman
@ 2015-06-03 16:54                   ` mleitner
  2015-06-03 17:19                     ` Marcelo Ricardo Leitner
  2015-06-04 14:27                     ` Neil Horman
  2015-06-03 16:54                   ` [PATCH v2 " mleitner
  1 sibling, 2 replies; 34+ messages in thread
From: mleitner @ 2015-06-03 16:54 UTC (permalink / raw)
  To: Neil Horman, netdev
  Cc: linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

That's needed for the next patch, so we break the lock inversion between
netns_sctp->addr_wq_lock and socket lock on
sctp_addr_wq_timeout_handler(). With this, we can traverse addr_waitq
without taking addr_wq_lock, taking it just for the write operations.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---

Notes:
    v1->v2:
     As asked by Neil, this now reuses addr_wq_lock. And for that, also
     rcu-ifyies addr_waitq.

 include/net/netns/sctp.h |  2 +-
 net/sctp/protocol.c      | 81 +++++++++++++++++++++++++++++-------------------
 2 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e45777a6d95406d490dbaa75 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -28,7 +28,7 @@ struct netns_sctp {
 	 * It is a list of sctp_sockaddr_entry.
 	 */
 	struct list_head local_addr_list;
-	struct list_head addr_waitq;
+	struct list_head __rcu addr_waitq;
 	struct timer_list addr_wq_timer;
 	struct list_head auto_asconf_splist;
 	spinlock_t addr_wq_lock;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 53b7acde9aa37bf3d4029c459421564d5270f4c0..a5089883b28195f3aef69ef35b5397322a01126f 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -593,15 +593,46 @@ static void sctp_v4_ecn_capable(struct sock *sk)
 	INET_ECN_xmit(sk);
 }
 
+static void sctp_free_addr_wq(struct net *net)
+{
+	struct sctp_sockaddr_entry *addrw;
+
+	spin_lock_bh(&net->sctp.addr_wq_lock);
+	del_timer(&net->sctp.addr_wq_timer);
+	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
+		list_del_rcu(&addrw->list);
+		kfree_rcu(addrw, rcu);
+	}
+	spin_unlock_bh(&net->sctp.addr_wq_lock);
+}
+
+/* As there is no refcnt on sctp_sockaddr_entry, we must check inside
+ * the lock if it wasn't removed from addr_waitq already, otherwise we
+ * could double-free it.
+ */
+static void sctp_free_addr_wq_entry(struct net *net,
+				    struct sctp_sockaddr_entry *addrw)
+{
+	struct sctp_sockaddr_entry *temp;
+
+	spin_lock_bh(&net->sctp.addr_wq_lock);
+	list_for_each_entry_rcu(temp, &net->sctp.addr_waitq, list) {
+		if (temp == addrw) {
+			list_del_rcu(&addrw->list);
+			kfree_rcu(addrw, rcu);
+		}
+	}
+	spin_unlock_bh(&net->sctp.addr_wq_lock);
+}
+
 static void sctp_addr_wq_timeout_handler(unsigned long arg)
 {
 	struct net *net = (struct net *)arg;
-	struct sctp_sockaddr_entry *addrw, *temp;
+	struct sctp_sockaddr_entry *addrw;
 	struct sctp_sock *sp;
 
-	spin_lock_bh(&net->sctp.addr_wq_lock);
-
-	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
+	rcu_read_lock_bh();
+	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
 		pr_debug("%s: the first ent in wq:%p is addr:%pISc for cmd:%d at "
 			 "entry:%p\n", __func__, &net->sctp.addr_waitq, &addrw->a.sa,
 			 addrw->state, addrw);
@@ -627,7 +658,9 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
 
 				timeo_val = jiffies;
 				timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
+				spin_lock_bh(&net->sctp.addr_wq_lock);
 				mod_timer(&net->sctp.addr_wq_timer, timeo_val);
+				spin_unlock_bh(&net->sctp.addr_wq_lock);
 				break;
 			}
 		}
@@ -647,35 +680,20 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
 #if IS_ENABLED(CONFIG_IPV6)
 free_next:
 #endif
-		list_del(&addrw->list);
-		kfree(addrw);
-	}
-	spin_unlock_bh(&net->sctp.addr_wq_lock);
-}
-
-static void sctp_free_addr_wq(struct net *net)
-{
-	struct sctp_sockaddr_entry *addrw;
-	struct sctp_sockaddr_entry *temp;
-
-	spin_lock_bh(&net->sctp.addr_wq_lock);
-	del_timer(&net->sctp.addr_wq_timer);
-	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
-		list_del(&addrw->list);
-		kfree(addrw);
+		sctp_free_addr_wq_entry(net, addrw);
 	}
-	spin_unlock_bh(&net->sctp.addr_wq_lock);
+	rcu_read_unlock_bh();
 }
 
 /* lookup the entry for the same address in the addr_waitq
- * sctp_addr_wq MUST be locked
+ * rcu read MUST be locked
  */
 static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct net *net,
 					struct sctp_sockaddr_entry *addr)
 {
 	struct sctp_sockaddr_entry *addrw;
 
-	list_for_each_entry(addrw, &net->sctp.addr_waitq, list) {
+	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
 		if (addrw->a.sa.sa_family != addr->a.sa.sa_family)
 			continue;
 		if (addrw->a.sa.sa_family == AF_INET) {
@@ -702,7 +720,7 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm
 	 * new address after a couple of addition and deletion of that address
 	 */
 
-	spin_lock_bh(&net->sctp.addr_wq_lock);
+	rcu_read_lock_bh();
 	/* Offsets existing events in addr_wq */
 	addrw = sctp_addr_wq_lookup(net, addr);
 	if (addrw) {
@@ -710,22 +728,21 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm
 			pr_debug("%s: offsets existing entry for %d, addr:%pISc "
 				 "in wq:%p\n", __func__, addrw->state, &addrw->a.sa,
 				 &net->sctp.addr_waitq);
-
-			list_del(&addrw->list);
-			kfree(addrw);
+			sctp_free_addr_wq_entry(net, addrw);
 		}
-		spin_unlock_bh(&net->sctp.addr_wq_lock);
+		rcu_read_unlock_bh();
 		return;
 	}
+	rcu_read_unlock_bh();
 
 	/* OK, we have to add the new address to the wait queue */
 	addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
-	if (addrw == NULL) {
-		spin_unlock_bh(&net->sctp.addr_wq_lock);
+	if (!addrw)
 		return;
-	}
 	addrw->state = cmd;
-	list_add_tail(&addrw->list, &net->sctp.addr_waitq);
+
+	spin_lock_bh(&net->sctp.addr_wq_lock);
+	list_add_tail_rcu(&addrw->list, &net->sctp.addr_waitq);
 
 	pr_debug("%s: add new entry for cmd:%d, addr:%pISc in wq:%p\n",
 		 __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq);
-- 
2.4.1

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 1/2] sctp: rcu-ify addr_waitq
  2015-06-03 16:54                   ` [PATCH v2 1/2] sctp: rcu-ify addr_waitq mleitner
@ 2015-06-03 17:19                     ` Marcelo Ricardo Leitner
  2015-06-04 14:27                     ` Neil Horman
  1 sibling, 0 replies; 34+ messages in thread
From: Marcelo Ricardo Leitner @ 2015-06-03 17:19 UTC (permalink / raw)
  To: Neil Horman, netdev
  Cc: linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Wed, Jun 03, 2015 at 01:54:01PM -0300, mleitner@redhat.com wrote:
> From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> 
> That's needed for the next patch, so we break the lock inversion between
> netns_sctp->addr_wq_lock and socket lock on
> sctp_addr_wq_timeout_handler(). With this, we can traverse addr_waitq
> without taking addr_wq_lock, taking it just for the write operations.
> 
> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> ---
> 
> Notes:
>     v1->v2:
>      As asked by Neil, this now reuses addr_wq_lock. And for that, also
>      rcu-ifyies addr_waitq.
> 
>  include/net/netns/sctp.h |  2 +-
>  net/sctp/protocol.c      | 81 +++++++++++++++++++++++++++++-------------------
>  2 files changed, 50 insertions(+), 33 deletions(-)
> 
> diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> index 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e45777a6d95406d490dbaa75 100644
> --- a/include/net/netns/sctp.h
> +++ b/include/net/netns/sctp.h
> @@ -28,7 +28,7 @@ struct netns_sctp {
>  	 * It is a list of sctp_sockaddr_entry.
>  	 */
>  	struct list_head local_addr_list;
> -	struct list_head addr_waitq;
> +	struct list_head __rcu addr_waitq;
>  	struct timer_list addr_wq_timer;
>  	struct list_head auto_asconf_splist;
>  	spinlock_t addr_wq_lock;
> diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> index 53b7acde9aa37bf3d4029c459421564d5270f4c0..a5089883b28195f3aef69ef35b5397322a01126f 100644
> --- a/net/sctp/protocol.c
> +++ b/net/sctp/protocol.c
> @@ -593,15 +593,46 @@ static void sctp_v4_ecn_capable(struct sock *sk)
>  	INET_ECN_xmit(sk);
>  }
>  
> +static void sctp_free_addr_wq(struct net *net)
> +{
> +	struct sctp_sockaddr_entry *addrw;
> +
> +	spin_lock_bh(&net->sctp.addr_wq_lock);
> +	del_timer(&net->sctp.addr_wq_timer);
> +	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
> +		list_del_rcu(&addrw->list);
> +		kfree_rcu(addrw, rcu);
> +	}
> +	spin_unlock_bh(&net->sctp.addr_wq_lock);
> +}
> +
> +/* As there is no refcnt on sctp_sockaddr_entry, we must check inside
> + * the lock if it wasn't removed from addr_waitq already, otherwise we
> + * could double-free it.
> + */
> +static void sctp_free_addr_wq_entry(struct net *net,
> +				    struct sctp_sockaddr_entry *addrw)
> +{
> +	struct sctp_sockaddr_entry *temp;
> +
> +	spin_lock_bh(&net->sctp.addr_wq_lock);
> +	list_for_each_entry_rcu(temp, &net->sctp.addr_waitq, list) {
> +		if (temp == addrw) {
> +			list_del_rcu(&addrw->list);
> +			kfree_rcu(addrw, rcu);

Missed a break here, will issue a v3 when you review it too..

  Marcelo

> +		}
> +	}
> +	spin_unlock_bh(&net->sctp.addr_wq_lock);
> +}
> +
>  static void sctp_addr_wq_timeout_handler(unsigned long arg)
>  {
>  	struct net *net = (struct net *)arg;
> -	struct sctp_sockaddr_entry *addrw, *temp;
> +	struct sctp_sockaddr_entry *addrw;
>  	struct sctp_sock *sp;
>  
> -	spin_lock_bh(&net->sctp.addr_wq_lock);
> -
> -	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
> +	rcu_read_lock_bh();
> +	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
>  		pr_debug("%s: the first ent in wq:%p is addr:%pISc for cmd:%d at "
>  			 "entry:%p\n", __func__, &net->sctp.addr_waitq, &addrw->a.sa,
>  			 addrw->state, addrw);
> @@ -627,7 +658,9 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
>  
>  				timeo_val = jiffies;
>  				timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
> +				spin_lock_bh(&net->sctp.addr_wq_lock);
>  				mod_timer(&net->sctp.addr_wq_timer, timeo_val);
> +				spin_unlock_bh(&net->sctp.addr_wq_lock);
>  				break;
>  			}
>  		}
> @@ -647,35 +680,20 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
>  #if IS_ENABLED(CONFIG_IPV6)
>  free_next:
>  #endif
> -		list_del(&addrw->list);
> -		kfree(addrw);
> -	}
> -	spin_unlock_bh(&net->sctp.addr_wq_lock);
> -}
> -
> -static void sctp_free_addr_wq(struct net *net)
> -{
> -	struct sctp_sockaddr_entry *addrw;
> -	struct sctp_sockaddr_entry *temp;
> -
> -	spin_lock_bh(&net->sctp.addr_wq_lock);
> -	del_timer(&net->sctp.addr_wq_timer);
> -	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
> -		list_del(&addrw->list);
> -		kfree(addrw);
> +		sctp_free_addr_wq_entry(net, addrw);
>  	}
> -	spin_unlock_bh(&net->sctp.addr_wq_lock);
> +	rcu_read_unlock_bh();
>  }
>  
>  /* lookup the entry for the same address in the addr_waitq
> - * sctp_addr_wq MUST be locked
> + * rcu read MUST be locked
>   */
>  static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct net *net,
>  					struct sctp_sockaddr_entry *addr)
>  {
>  	struct sctp_sockaddr_entry *addrw;
>  
> -	list_for_each_entry(addrw, &net->sctp.addr_waitq, list) {
> +	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
>  		if (addrw->a.sa.sa_family != addr->a.sa.sa_family)
>  			continue;
>  		if (addrw->a.sa.sa_family == AF_INET) {
> @@ -702,7 +720,7 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm
>  	 * new address after a couple of addition and deletion of that address
>  	 */
>  
> -	spin_lock_bh(&net->sctp.addr_wq_lock);
> +	rcu_read_lock_bh();
>  	/* Offsets existing events in addr_wq */
>  	addrw = sctp_addr_wq_lookup(net, addr);
>  	if (addrw) {
> @@ -710,22 +728,21 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm
>  			pr_debug("%s: offsets existing entry for %d, addr:%pISc "
>  				 "in wq:%p\n", __func__, addrw->state, &addrw->a.sa,
>  				 &net->sctp.addr_waitq);
> -
> -			list_del(&addrw->list);
> -			kfree(addrw);
> +			sctp_free_addr_wq_entry(net, addrw);
>  		}
> -		spin_unlock_bh(&net->sctp.addr_wq_lock);
> +		rcu_read_unlock_bh();
>  		return;
>  	}
> +	rcu_read_unlock_bh();
>  
>  	/* OK, we have to add the new address to the wait queue */
>  	addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
> -	if (addrw == NULL) {
> -		spin_unlock_bh(&net->sctp.addr_wq_lock);
> +	if (!addrw)
>  		return;
> -	}
>  	addrw->state = cmd;
> -	list_add_tail(&addrw->list, &net->sctp.addr_waitq);
> +
> +	spin_lock_bh(&net->sctp.addr_wq_lock);
> +	list_add_tail_rcu(&addrw->list, &net->sctp.addr_waitq);
>  
>  	pr_debug("%s: add new entry for cmd:%d, addr:%pISc in wq:%p\n",
>  		 __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq);
> -- 
> 2.4.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 1/2] sctp: rcu-ify addr_waitq
  2015-06-03 16:54                   ` [PATCH v2 1/2] sctp: rcu-ify addr_waitq mleitner
  2015-06-03 17:19                     ` Marcelo Ricardo Leitner
@ 2015-06-04 14:27                     ` Neil Horman
  2015-06-05 14:18                       ` Marcelo Ricardo Leitner
                                         ` (2 more replies)
  1 sibling, 3 replies; 34+ messages in thread
From: Neil Horman @ 2015-06-04 14:27 UTC (permalink / raw)
  To: mleitner; +Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Wed, Jun 03, 2015 at 01:54:01PM -0300, mleitner@redhat.com wrote:
> From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> 
> That's needed for the next patch, so we break the lock inversion between
> netns_sctp->addr_wq_lock and socket lock on
> sctp_addr_wq_timeout_handler(). With this, we can traverse addr_waitq
> without taking addr_wq_lock, taking it just for the write operations.
> 
> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> ---
> 
> Notes:
>     v1->v2:
>      As asked by Neil, this now reuses addr_wq_lock. And for that, also
>      rcu-ifyies addr_waitq.
> 
>  include/net/netns/sctp.h |  2 +-
>  net/sctp/protocol.c      | 81 +++++++++++++++++++++++++++++-------------------
>  2 files changed, 50 insertions(+), 33 deletions(-)
> 
> diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> index 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e45777a6d95406d490dbaa75 100644
> --- a/include/net/netns/sctp.h
> +++ b/include/net/netns/sctp.h
> @@ -28,7 +28,7 @@ struct netns_sctp {
>  	 * It is a list of sctp_sockaddr_entry.
>  	 */
>  	struct list_head local_addr_list;
> -	struct list_head addr_waitq;
> +	struct list_head __rcu addr_waitq;
>  	struct timer_list addr_wq_timer;
>  	struct list_head auto_asconf_splist;
>  	spinlock_t addr_wq_lock;
> diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> index 53b7acde9aa37bf3d4029c459421564d5270f4c0..a5089883b28195f3aef69ef35b5397322a01126f 100644
> --- a/net/sctp/protocol.c
> +++ b/net/sctp/protocol.c
> @@ -593,15 +593,46 @@ static void sctp_v4_ecn_capable(struct sock *sk)
>  	INET_ECN_xmit(sk);
>  }
>  
> +static void sctp_free_addr_wq(struct net *net)
> +{
> +	struct sctp_sockaddr_entry *addrw;
> +
> +	spin_lock_bh(&net->sctp.addr_wq_lock);
> +	del_timer(&net->sctp.addr_wq_timer);
> +	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
> +		list_del_rcu(&addrw->list);
> +		kfree_rcu(addrw, rcu);
> +	}
> +	spin_unlock_bh(&net->sctp.addr_wq_lock);
> +}
> +
> +/* As there is no refcnt on sctp_sockaddr_entry, we must check inside
> + * the lock if it wasn't removed from addr_waitq already, otherwise we
> + * could double-free it.
> + */
> +static void sctp_free_addr_wq_entry(struct net *net,
> +				    struct sctp_sockaddr_entry *addrw)
> +{
> +	struct sctp_sockaddr_entry *temp;
> +
> +	spin_lock_bh(&net->sctp.addr_wq_lock);
> +	list_for_each_entry_rcu(temp, &net->sctp.addr_waitq, list) {
> +		if (temp == addrw) {
> +			list_del_rcu(&addrw->list);
> +			kfree_rcu(addrw, rcu);
> +		}
> +	}
> +	spin_unlock_bh(&net->sctp.addr_wq_lock);
> +}
> +
>  static void sctp_addr_wq_timeout_handler(unsigned long arg)
>  {
>  	struct net *net = (struct net *)arg;
> -	struct sctp_sockaddr_entry *addrw, *temp;
> +	struct sctp_sockaddr_entry *addrw;
>  	struct sctp_sock *sp;
>  
> -	spin_lock_bh(&net->sctp.addr_wq_lock);
> -
> -	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
> +	rcu_read_lock_bh();
> +	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
>  		pr_debug("%s: the first ent in wq:%p is addr:%pISc for cmd:%d at "
>  			 "entry:%p\n", __func__, &net->sctp.addr_waitq, &addrw->a.sa,
>  			 addrw->state, addrw);
> @@ -627,7 +658,9 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
>  
>  				timeo_val = jiffies;
>  				timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
> +				spin_lock_bh(&net->sctp.addr_wq_lock);
>  				mod_timer(&net->sctp.addr_wq_timer, timeo_val);
> +				spin_unlock_bh(&net->sctp.addr_wq_lock);
Do we actually need to lock the addr_wq_lock here?  mod_timer has its own
internal locking.

>  				break;
>  			}
>  		}
> @@ -647,35 +680,20 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
>  #if IS_ENABLED(CONFIG_IPV6)
>  free_next:
>  #endif
> -		list_del(&addrw->list);
> -		kfree(addrw);
> -	}
> -	spin_unlock_bh(&net->sctp.addr_wq_lock);
> -}
> -
> -static void sctp_free_addr_wq(struct net *net)
> -{
> -	struct sctp_sockaddr_entry *addrw;
> -	struct sctp_sockaddr_entry *temp;
> -
> -	spin_lock_bh(&net->sctp.addr_wq_lock);
> -	del_timer(&net->sctp.addr_wq_timer);
> -	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
> -		list_del(&addrw->list);
> -		kfree(addrw);
> +		sctp_free_addr_wq_entry(net, addrw);
>  	}
> -	spin_unlock_bh(&net->sctp.addr_wq_lock);
> +	rcu_read_unlock_bh();
>  }
>  
>  /* lookup the entry for the same address in the addr_waitq
> - * sctp_addr_wq MUST be locked
> + * rcu read MUST be locked
>   */
>  static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct net *net,
>  					struct sctp_sockaddr_entry *addr)
>  {
>  	struct sctp_sockaddr_entry *addrw;
>  
> -	list_for_each_entry(addrw, &net->sctp.addr_waitq, list) {
> +	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
>  		if (addrw->a.sa.sa_family != addr->a.sa.sa_family)
>  			continue;
>  		if (addrw->a.sa.sa_family == AF_INET) {
> @@ -702,7 +720,7 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm
>  	 * new address after a couple of addition and deletion of that address
>  	 */
>  
> -	spin_lock_bh(&net->sctp.addr_wq_lock);
> +	rcu_read_lock_bh();
>  	/* Offsets existing events in addr_wq */
>  	addrw = sctp_addr_wq_lookup(net, addr);
>  	if (addrw) {
> @@ -710,22 +728,21 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm
>  			pr_debug("%s: offsets existing entry for %d, addr:%pISc "
>  				 "in wq:%p\n", __func__, addrw->state, &addrw->a.sa,
>  				 &net->sctp.addr_waitq);
> -
> -			list_del(&addrw->list);
> -			kfree(addrw);
> +			sctp_free_addr_wq_entry(net, addrw);
>  		}
> -		spin_unlock_bh(&net->sctp.addr_wq_lock);
> +		rcu_read_unlock_bh();
>  		return;
>  	}
> +	rcu_read_unlock_bh();
>  
>  	/* OK, we have to add the new address to the wait queue */
>  	addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
> -	if (addrw == NULL) {
> -		spin_unlock_bh(&net->sctp.addr_wq_lock);
> +	if (!addrw)
>  		return;
> -	}
>  	addrw->state = cmd;
> -	list_add_tail(&addrw->list, &net->sctp.addr_waitq);
> +
> +	spin_lock_bh(&net->sctp.addr_wq_lock);
> +	list_add_tail_rcu(&addrw->list, &net->sctp.addr_waitq);
>  
>  	pr_debug("%s: add new entry for cmd:%d, addr:%pISc in wq:%p\n",
>  		 __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq);

Other than the comment above, and the break you need to insert, I think this
looks good, thanks for taking the extra time on it!
Best
Neil

> -- 
> 2.4.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 1/2] sctp: rcu-ify addr_waitq
  2015-06-04 14:27                     ` Neil Horman
@ 2015-06-05 14:18                       ` Marcelo Ricardo Leitner
  2015-06-05 17:08                       ` [PATCH v3 " mleitner
  2015-06-05 17:08                       ` [PATCH v3 2/2] " mleitner
  2 siblings, 0 replies; 34+ messages in thread
From: Marcelo Ricardo Leitner @ 2015-06-05 14:18 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Thu, Jun 04, 2015 at 10:27:10AM -0400, Neil Horman wrote:
> On Wed, Jun 03, 2015 at 01:54:01PM -0300, mleitner@redhat.com wrote:
> > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > 
> > That's needed for the next patch, so we break the lock inversion between
> > netns_sctp->addr_wq_lock and socket lock on
> > sctp_addr_wq_timeout_handler(). With this, we can traverse addr_waitq
> > without taking addr_wq_lock, taking it just for the write operations.
> > 
> > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > ---
> > 
> > Notes:
> >     v1->v2:
> >      As asked by Neil, this now reuses addr_wq_lock. And for that, also
> >      rcu-ifyies addr_waitq.
> > 
> >  include/net/netns/sctp.h |  2 +-
> >  net/sctp/protocol.c      | 81 +++++++++++++++++++++++++++++-------------------
> >  2 files changed, 50 insertions(+), 33 deletions(-)
> > 
> > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e45777a6d95406d490dbaa75 100644
> > --- a/include/net/netns/sctp.h
> > +++ b/include/net/netns/sctp.h
> > @@ -28,7 +28,7 @@ struct netns_sctp {
> >  	 * It is a list of sctp_sockaddr_entry.
> >  	 */
> >  	struct list_head local_addr_list;
> > -	struct list_head addr_waitq;
> > +	struct list_head __rcu addr_waitq;
> >  	struct timer_list addr_wq_timer;
> >  	struct list_head auto_asconf_splist;
> >  	spinlock_t addr_wq_lock;
> > diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> > index 53b7acde9aa37bf3d4029c459421564d5270f4c0..a5089883b28195f3aef69ef35b5397322a01126f 100644
> > --- a/net/sctp/protocol.c
> > +++ b/net/sctp/protocol.c
> > @@ -593,15 +593,46 @@ static void sctp_v4_ecn_capable(struct sock *sk)
> >  	INET_ECN_xmit(sk);
> >  }
> >  
> > +static void sctp_free_addr_wq(struct net *net)
> > +{
> > +	struct sctp_sockaddr_entry *addrw;
> > +
> > +	spin_lock_bh(&net->sctp.addr_wq_lock);
> > +	del_timer(&net->sctp.addr_wq_timer);
> > +	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
> > +		list_del_rcu(&addrw->list);
> > +		kfree_rcu(addrw, rcu);
> > +	}
> > +	spin_unlock_bh(&net->sctp.addr_wq_lock);
> > +}
> > +
> > +/* As there is no refcnt on sctp_sockaddr_entry, we must check inside
> > + * the lock if it wasn't removed from addr_waitq already, otherwise we
> > + * could double-free it.
> > + */
> > +static void sctp_free_addr_wq_entry(struct net *net,
> > +				    struct sctp_sockaddr_entry *addrw)
> > +{
> > +	struct sctp_sockaddr_entry *temp;
> > +
> > +	spin_lock_bh(&net->sctp.addr_wq_lock);
> > +	list_for_each_entry_rcu(temp, &net->sctp.addr_waitq, list) {
> > +		if (temp == addrw) {
> > +			list_del_rcu(&addrw->list);
> > +			kfree_rcu(addrw, rcu);
> > +		}
> > +	}
> > +	spin_unlock_bh(&net->sctp.addr_wq_lock);
> > +}
> > +
> >  static void sctp_addr_wq_timeout_handler(unsigned long arg)
> >  {
> >  	struct net *net = (struct net *)arg;
> > -	struct sctp_sockaddr_entry *addrw, *temp;
> > +	struct sctp_sockaddr_entry *addrw;
> >  	struct sctp_sock *sp;
> >  
> > -	spin_lock_bh(&net->sctp.addr_wq_lock);
> > -
> > -	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
> > +	rcu_read_lock_bh();
> > +	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
> >  		pr_debug("%s: the first ent in wq:%p is addr:%pISc for cmd:%d at "
> >  			 "entry:%p\n", __func__, &net->sctp.addr_waitq, &addrw->a.sa,
> >  			 addrw->state, addrw);
> > @@ -627,7 +658,9 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
> >  
> >  				timeo_val = jiffies;
> >  				timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
> > +				spin_lock_bh(&net->sctp.addr_wq_lock);
> >  				mod_timer(&net->sctp.addr_wq_timer, timeo_val);
> > +				spin_unlock_bh(&net->sctp.addr_wq_lock);
> Do we actually need to lock the addr_wq_lock here?  mod_timer has its own
> internal locking.

No, we don't. I'll remove these.

> >  				break;
> >  			}
> >  		}
> > @@ -647,35 +680,20 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
> >  #if IS_ENABLED(CONFIG_IPV6)
> >  free_next:
> >  #endif
> > -		list_del(&addrw->list);
> > -		kfree(addrw);
> > -	}
> > -	spin_unlock_bh(&net->sctp.addr_wq_lock);
> > -}
> > -
> > -static void sctp_free_addr_wq(struct net *net)
> > -{
> > -	struct sctp_sockaddr_entry *addrw;
> > -	struct sctp_sockaddr_entry *temp;
> > -
> > -	spin_lock_bh(&net->sctp.addr_wq_lock);
> > -	del_timer(&net->sctp.addr_wq_timer);
> > -	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
> > -		list_del(&addrw->list);
> > -		kfree(addrw);
> > +		sctp_free_addr_wq_entry(net, addrw);
> >  	}
> > -	spin_unlock_bh(&net->sctp.addr_wq_lock);
> > +	rcu_read_unlock_bh();
> >  }
> >  
> >  /* lookup the entry for the same address in the addr_waitq
> > - * sctp_addr_wq MUST be locked
> > + * rcu read MUST be locked
> >   */
> >  static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct net *net,
> >  					struct sctp_sockaddr_entry *addr)
> >  {
> >  	struct sctp_sockaddr_entry *addrw;
> >  
> > -	list_for_each_entry(addrw, &net->sctp.addr_waitq, list) {
> > +	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
> >  		if (addrw->a.sa.sa_family != addr->a.sa.sa_family)
> >  			continue;
> >  		if (addrw->a.sa.sa_family == AF_INET) {
> > @@ -702,7 +720,7 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm
> >  	 * new address after a couple of addition and deletion of that address
> >  	 */
> >  
> > -	spin_lock_bh(&net->sctp.addr_wq_lock);
> > +	rcu_read_lock_bh();
> >  	/* Offsets existing events in addr_wq */
> >  	addrw = sctp_addr_wq_lookup(net, addr);
> >  	if (addrw) {
> > @@ -710,22 +728,21 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm
> >  			pr_debug("%s: offsets existing entry for %d, addr:%pISc "
> >  				 "in wq:%p\n", __func__, addrw->state, &addrw->a.sa,
> >  				 &net->sctp.addr_waitq);
> > -
> > -			list_del(&addrw->list);
> > -			kfree(addrw);
> > +			sctp_free_addr_wq_entry(net, addrw);
> >  		}
> > -		spin_unlock_bh(&net->sctp.addr_wq_lock);
> > +		rcu_read_unlock_bh();
> >  		return;
> >  	}
> > +	rcu_read_unlock_bh();
> >  
> >  	/* OK, we have to add the new address to the wait queue */
> >  	addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
> > -	if (addrw == NULL) {
> > -		spin_unlock_bh(&net->sctp.addr_wq_lock);
> > +	if (!addrw)
> >  		return;
> > -	}
> >  	addrw->state = cmd;
> > -	list_add_tail(&addrw->list, &net->sctp.addr_waitq);
> > +
> > +	spin_lock_bh(&net->sctp.addr_wq_lock);
> > +	list_add_tail_rcu(&addrw->list, &net->sctp.addr_waitq);
> >  
> >  	pr_debug("%s: add new entry for cmd:%d, addr:%pISc in wq:%p\n",
> >  		 __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq);
> 
> Other than the comment above, and the break you need to insert, I think this
> looks good, thanks for taking the extra time on it!
> Best
> Neil

Cool, thx! v3 in a few.

  Marcelo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v3 1/2] sctp: rcu-ify addr_waitq
  2015-06-04 14:27                     ` Neil Horman
  2015-06-05 14:18                       ` Marcelo Ricardo Leitner
@ 2015-06-05 17:08                       ` mleitner
  2015-06-08 13:58                         ` Neil Horman
  2015-06-08 14:46                         ` Hannes Frederic Sowa
  2015-06-05 17:08                       ` [PATCH v3 2/2] " mleitner
  2 siblings, 2 replies; 34+ messages in thread
From: mleitner @ 2015-06-05 17:08 UTC (permalink / raw)
  To: Neil Horman, netdev
  Cc: linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

That's needed for the next patch, so we break the lock inversion between
netns_sctp->addr_wq_lock and socket lock on
sctp_addr_wq_timeout_handler(). With this, we can traverse addr_waitq
without taking addr_wq_lock, taking it just for the write operations.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---

Notes:
    v2->v3:
      placed break statement on sctp_free_addr_wq_entry()
      removed unnecessary spin_lock noticed by Neil

 include/net/netns/sctp.h |  2 +-
 net/sctp/protocol.c      | 80 +++++++++++++++++++++++++++++-------------------
 2 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e45777a6d95406d490dbaa75 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -28,7 +28,7 @@ struct netns_sctp {
 	 * It is a list of sctp_sockaddr_entry.
 	 */
 	struct list_head local_addr_list;
-	struct list_head addr_waitq;
+	struct list_head __rcu addr_waitq;
 	struct timer_list addr_wq_timer;
 	struct list_head auto_asconf_splist;
 	spinlock_t addr_wq_lock;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 53b7acde9aa37bf3d4029c459421564d5270f4c0..9954fb8c9a9455d5ad7a627e2d7f9a1fef861fc2 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -593,15 +593,47 @@ static void sctp_v4_ecn_capable(struct sock *sk)
 	INET_ECN_xmit(sk);
 }
 
+static void sctp_free_addr_wq(struct net *net)
+{
+	struct sctp_sockaddr_entry *addrw;
+
+	spin_lock_bh(&net->sctp.addr_wq_lock);
+	del_timer(&net->sctp.addr_wq_timer);
+	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
+		list_del_rcu(&addrw->list);
+		kfree_rcu(addrw, rcu);
+	}
+	spin_unlock_bh(&net->sctp.addr_wq_lock);
+}
+
+/* As there is no refcnt on sctp_sockaddr_entry, we must check inside
+ * the lock if it wasn't removed from addr_waitq already, otherwise we
+ * could double-free it.
+ */
+static void sctp_free_addr_wq_entry(struct net *net,
+				    struct sctp_sockaddr_entry *addrw)
+{
+	struct sctp_sockaddr_entry *temp;
+
+	spin_lock_bh(&net->sctp.addr_wq_lock);
+	list_for_each_entry_rcu(temp, &net->sctp.addr_waitq, list) {
+		if (temp == addrw) {
+			list_del_rcu(&addrw->list);
+			kfree_rcu(addrw, rcu);
+			break;
+		}
+	}
+	spin_unlock_bh(&net->sctp.addr_wq_lock);
+}
+
 static void sctp_addr_wq_timeout_handler(unsigned long arg)
 {
 	struct net *net = (struct net *)arg;
-	struct sctp_sockaddr_entry *addrw, *temp;
+	struct sctp_sockaddr_entry *addrw;
 	struct sctp_sock *sp;
 
-	spin_lock_bh(&net->sctp.addr_wq_lock);
-
-	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
+	rcu_read_lock_bh();
+	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
 		pr_debug("%s: the first ent in wq:%p is addr:%pISc for cmd:%d at "
 			 "entry:%p\n", __func__, &net->sctp.addr_waitq, &addrw->a.sa,
 			 addrw->state, addrw);
@@ -647,35 +679,20 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
 #if IS_ENABLED(CONFIG_IPV6)
 free_next:
 #endif
-		list_del(&addrw->list);
-		kfree(addrw);
-	}
-	spin_unlock_bh(&net->sctp.addr_wq_lock);
-}
-
-static void sctp_free_addr_wq(struct net *net)
-{
-	struct sctp_sockaddr_entry *addrw;
-	struct sctp_sockaddr_entry *temp;
-
-	spin_lock_bh(&net->sctp.addr_wq_lock);
-	del_timer(&net->sctp.addr_wq_timer);
-	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
-		list_del(&addrw->list);
-		kfree(addrw);
+		sctp_free_addr_wq_entry(net, addrw);
 	}
-	spin_unlock_bh(&net->sctp.addr_wq_lock);
+	rcu_read_unlock_bh();
 }
 
 /* lookup the entry for the same address in the addr_waitq
- * sctp_addr_wq MUST be locked
+ * rcu read MUST be locked
  */
 static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct net *net,
 					struct sctp_sockaddr_entry *addr)
 {
 	struct sctp_sockaddr_entry *addrw;
 
-	list_for_each_entry(addrw, &net->sctp.addr_waitq, list) {
+	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
 		if (addrw->a.sa.sa_family != addr->a.sa.sa_family)
 			continue;
 		if (addrw->a.sa.sa_family == AF_INET) {
@@ -702,7 +719,7 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm
 	 * new address after a couple of addition and deletion of that address
 	 */
 
-	spin_lock_bh(&net->sctp.addr_wq_lock);
+	rcu_read_lock_bh();
 	/* Offsets existing events in addr_wq */
 	addrw = sctp_addr_wq_lookup(net, addr);
 	if (addrw) {
@@ -710,22 +727,21 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm
 			pr_debug("%s: offsets existing entry for %d, addr:%pISc "
 				 "in wq:%p\n", __func__, addrw->state, &addrw->a.sa,
 				 &net->sctp.addr_waitq);
-
-			list_del(&addrw->list);
-			kfree(addrw);
+			sctp_free_addr_wq_entry(net, addrw);
 		}
-		spin_unlock_bh(&net->sctp.addr_wq_lock);
+		rcu_read_unlock_bh();
 		return;
 	}
+	rcu_read_unlock_bh();
 
 	/* OK, we have to add the new address to the wait queue */
 	addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
-	if (addrw == NULL) {
-		spin_unlock_bh(&net->sctp.addr_wq_lock);
+	if (!addrw)
 		return;
-	}
 	addrw->state = cmd;
-	list_add_tail(&addrw->list, &net->sctp.addr_waitq);
+
+	spin_lock_bh(&net->sctp.addr_wq_lock);
+	list_add_tail_rcu(&addrw->list, &net->sctp.addr_waitq);
 
 	pr_debug("%s: add new entry for cmd:%d, addr:%pISc in wq:%p\n",
 		 __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq);
-- 
2.4.1

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 1/2] sctp: rcu-ify addr_waitq
  2015-06-05 17:08                       ` [PATCH v3 " mleitner
@ 2015-06-08 13:58                         ` Neil Horman
  2015-06-08 14:46                         ` Hannes Frederic Sowa
  1 sibling, 0 replies; 34+ messages in thread
From: Neil Horman @ 2015-06-08 13:58 UTC (permalink / raw)
  To: mleitner; +Cc: netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Fri, Jun 05, 2015 at 02:08:05PM -0300, mleitner@redhat.com wrote:
> From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> 
> That's needed for the next patch, so we break the lock inversion between
> netns_sctp->addr_wq_lock and socket lock on
> sctp_addr_wq_timeout_handler(). With this, we can traverse addr_waitq
> without taking addr_wq_lock, taking it just for the write operations.
> 
> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> ---
> 
> Notes:
>     v2->v3:
>       placed break statement on sctp_free_addr_wq_entry()
>       removed unnecessary spin_lock noticed by Neil
> 
>  include/net/netns/sctp.h |  2 +-
>  net/sctp/protocol.c      | 80 +++++++++++++++++++++++++++++-------------------
>  2 files changed, 49 insertions(+), 33 deletions(-)
> 
> diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> index 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e45777a6d95406d490dbaa75 100644
> --- a/include/net/netns/sctp.h
> +++ b/include/net/netns/sctp.h
> @@ -28,7 +28,7 @@ struct netns_sctp {
>  	 * It is a list of sctp_sockaddr_entry.
>  	 */
>  	struct list_head local_addr_list;
> -	struct list_head addr_waitq;
> +	struct list_head __rcu addr_waitq;
>  	struct timer_list addr_wq_timer;
>  	struct list_head auto_asconf_splist;
>  	spinlock_t addr_wq_lock;
> diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> index 53b7acde9aa37bf3d4029c459421564d5270f4c0..9954fb8c9a9455d5ad7a627e2d7f9a1fef861fc2 100644
> --- a/net/sctp/protocol.c
> +++ b/net/sctp/protocol.c
> @@ -593,15 +593,47 @@ static void sctp_v4_ecn_capable(struct sock *sk)
>  	INET_ECN_xmit(sk);
>  }
>  
> +static void sctp_free_addr_wq(struct net *net)
> +{
> +	struct sctp_sockaddr_entry *addrw;
> +
> +	spin_lock_bh(&net->sctp.addr_wq_lock);
> +	del_timer(&net->sctp.addr_wq_timer);
> +	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
> +		list_del_rcu(&addrw->list);
> +		kfree_rcu(addrw, rcu);
> +	}
> +	spin_unlock_bh(&net->sctp.addr_wq_lock);
> +}
> +
> +/* As there is no refcnt on sctp_sockaddr_entry, we must check inside
> + * the lock if it wasn't removed from addr_waitq already, otherwise we
> + * could double-free it.
> + */
> +static void sctp_free_addr_wq_entry(struct net *net,
> +				    struct sctp_sockaddr_entry *addrw)
> +{
> +	struct sctp_sockaddr_entry *temp;
> +
> +	spin_lock_bh(&net->sctp.addr_wq_lock);
> +	list_for_each_entry_rcu(temp, &net->sctp.addr_waitq, list) {
> +		if (temp == addrw) {
> +			list_del_rcu(&addrw->list);
> +			kfree_rcu(addrw, rcu);
> +			break;
> +		}
> +	}
> +	spin_unlock_bh(&net->sctp.addr_wq_lock);
> +}
> +
>  static void sctp_addr_wq_timeout_handler(unsigned long arg)
>  {
>  	struct net *net = (struct net *)arg;
> -	struct sctp_sockaddr_entry *addrw, *temp;
> +	struct sctp_sockaddr_entry *addrw;
>  	struct sctp_sock *sp;
>  
> -	spin_lock_bh(&net->sctp.addr_wq_lock);
> -
> -	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
> +	rcu_read_lock_bh();
> +	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
>  		pr_debug("%s: the first ent in wq:%p is addr:%pISc for cmd:%d at "
>  			 "entry:%p\n", __func__, &net->sctp.addr_waitq, &addrw->a.sa,
>  			 addrw->state, addrw);
> @@ -647,35 +679,20 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
>  #if IS_ENABLED(CONFIG_IPV6)
>  free_next:
>  #endif
> -		list_del(&addrw->list);
> -		kfree(addrw);
> -	}
> -	spin_unlock_bh(&net->sctp.addr_wq_lock);
> -}
> -
> -static void sctp_free_addr_wq(struct net *net)
> -{
> -	struct sctp_sockaddr_entry *addrw;
> -	struct sctp_sockaddr_entry *temp;
> -
> -	spin_lock_bh(&net->sctp.addr_wq_lock);
> -	del_timer(&net->sctp.addr_wq_timer);
> -	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
> -		list_del(&addrw->list);
> -		kfree(addrw);
> +		sctp_free_addr_wq_entry(net, addrw);
>  	}
> -	spin_unlock_bh(&net->sctp.addr_wq_lock);
> +	rcu_read_unlock_bh();
>  }
>  
>  /* lookup the entry for the same address in the addr_waitq
> - * sctp_addr_wq MUST be locked
> + * rcu read MUST be locked
>   */
>  static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct net *net,
>  					struct sctp_sockaddr_entry *addr)
>  {
>  	struct sctp_sockaddr_entry *addrw;
>  
> -	list_for_each_entry(addrw, &net->sctp.addr_waitq, list) {
> +	list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
>  		if (addrw->a.sa.sa_family != addr->a.sa.sa_family)
>  			continue;
>  		if (addrw->a.sa.sa_family == AF_INET) {
> @@ -702,7 +719,7 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm
>  	 * new address after a couple of addition and deletion of that address
>  	 */
>  
> -	spin_lock_bh(&net->sctp.addr_wq_lock);
> +	rcu_read_lock_bh();
>  	/* Offsets existing events in addr_wq */
>  	addrw = sctp_addr_wq_lookup(net, addr);
>  	if (addrw) {
> @@ -710,22 +727,21 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm
>  			pr_debug("%s: offsets existing entry for %d, addr:%pISc "
>  				 "in wq:%p\n", __func__, addrw->state, &addrw->a.sa,
>  				 &net->sctp.addr_waitq);
> -
> -			list_del(&addrw->list);
> -			kfree(addrw);
> +			sctp_free_addr_wq_entry(net, addrw);
>  		}
> -		spin_unlock_bh(&net->sctp.addr_wq_lock);
> +		rcu_read_unlock_bh();
>  		return;
>  	}
> +	rcu_read_unlock_bh();
>  
>  	/* OK, we have to add the new address to the wait queue */
>  	addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
> -	if (addrw == NULL) {
> -		spin_unlock_bh(&net->sctp.addr_wq_lock);
> +	if (!addrw)
>  		return;
> -	}
>  	addrw->state = cmd;
> -	list_add_tail(&addrw->list, &net->sctp.addr_waitq);
> +
> +	spin_lock_bh(&net->sctp.addr_wq_lock);
> +	list_add_tail_rcu(&addrw->list, &net->sctp.addr_waitq);
>  
>  	pr_debug("%s: add new entry for cmd:%d, addr:%pISc in wq:%p\n",
>  		 __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq);
> -- 
> 2.4.1
> 
> 
Acked-by: Neil Horman <nhorman@tuxdriver.com>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 1/2] sctp: rcu-ify addr_waitq
  2015-06-05 17:08                       ` [PATCH v3 " mleitner
  2015-06-08 13:58                         ` Neil Horman
@ 2015-06-08 14:46                         ` Hannes Frederic Sowa
  2015-06-08 14:59                           ` Hannes Frederic Sowa
  1 sibling, 1 reply; 34+ messages in thread
From: Hannes Frederic Sowa @ 2015-06-08 14:46 UTC (permalink / raw)
  To: mleitner, Neil Horman, netdev
  Cc: linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

Hi Marcelo,

a few hints on rcuification, sorry I reviewed the code so late:

On Fri, Jun 5, 2015, at 19:08, mleitner@redhat.com wrote:
> From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> 
> That's needed for the next patch, so we break the lock inversion between
> netns_sctp->addr_wq_lock and socket lock on
> sctp_addr_wq_timeout_handler(). With this, we can traverse addr_waitq
> without taking addr_wq_lock, taking it just for the write operations.
> 
> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> ---
> 
> Notes:
>     v2->v3:
>       placed break statement on sctp_free_addr_wq_entry()
>       removed unnecessary spin_lock noticed by Neil
> 
>  include/net/netns/sctp.h |  2 +-
>  net/sctp/protocol.c      | 80
>  +++++++++++++++++++++++++++++-------------------
>  2 files changed, 49 insertions(+), 33 deletions(-)
> 
> diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> index
> 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e45777a6d95406d490dbaa75
> 100644
> --- a/include/net/netns/sctp.h
> +++ b/include/net/netns/sctp.h
> @@ -28,7 +28,7 @@ struct netns_sctp {
>  	 * It is a list of sctp_sockaddr_entry.
>  	 */
>  	struct list_head local_addr_list;
> -       struct list_head addr_waitq;
> +       struct list_head __rcu addr_waitq;
>  	struct timer_list addr_wq_timer;
>  	struct list_head auto_asconf_splist;
>  	spinlock_t addr_wq_lock;
> diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> index
> 53b7acde9aa37bf3d4029c459421564d5270f4c0..9954fb8c9a9455d5ad7a627e2d7f9a1fef861fc2
> 100644
> --- a/net/sctp/protocol.c
> +++ b/net/sctp/protocol.c
> @@ -593,15 +593,47 @@ static void sctp_v4_ecn_capable(struct sock *sk)
>  	INET_ECN_xmit(sk);
>  }
>  
> +static void sctp_free_addr_wq(struct net *net)
> +{
> +       struct sctp_sockaddr_entry *addrw;
> +
> +       spin_lock_bh(&net->sctp.addr_wq_lock);

Instead of holding spin_lock_bh you need to hold rcu_read_lock_bh, so
kfree_rcu does not call free function at once (in theory ;) ).

> +       del_timer(&net->sctp.addr_wq_timer);
> +       list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
> +               list_del_rcu(&addrw->list);
> +               kfree_rcu(addrw, rcu);
> +       }
> +       spin_unlock_bh(&net->sctp.addr_wq_lock);
> +}
> +
> +/* As there is no refcnt on sctp_sockaddr_entry, we must check inside
> + * the lock if it wasn't removed from addr_waitq already, otherwise we
> + * could double-free it.
> + */
> +static void sctp_free_addr_wq_entry(struct net *net,
> +                                   struct sctp_sockaddr_entry *addrw)
> +{
> +       struct sctp_sockaddr_entry *temp;
> +
> +       spin_lock_bh(&net->sctp.addr_wq_lock);

I don't think this spin_lock operation is needed. The del_timer
functions do synchronize themselves.

> +       list_for_each_entry_rcu(temp, &net->sctp.addr_waitq, list) {
> +               if (temp == addrw) {
> +                       list_del_rcu(&addrw->list);
> +                       kfree_rcu(addrw, rcu);
> +                       break;
> +               }
> +       }
> +       spin_unlock_bh(&net->sctp.addr_wq_lock);
> +}
> +
>  static void sctp_addr_wq_timeout_handler(unsigned long arg)
>  {
>  	struct net *net = (struct net *)arg;
> -       struct sctp_sockaddr_entry *addrw, *temp;
> +       struct sctp_sockaddr_entry *addrw;
>  	struct sctp_sock *sp;
>  
> -       spin_lock_bh(&net->sctp.addr_wq_lock);
> -
> -       list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq,
> list) {
> +       rcu_read_lock_bh();
> +       list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
>  		pr_debug("%s: the first ent in wq:%p is addr:%pISc for cmd:%d at "
>  			 "entry:%p\n", __func__, &net->sctp.addr_waitq, &addrw->a.sa,
>  			 addrw->state, addrw);
> @@ -647,35 +679,20 @@ static void sctp_addr_wq_timeout_handler(unsigned
> long arg)
>  #if IS_ENABLED(CONFIG_IPV6)
>  free_next:
>  #endif
> -               list_del(&addrw->list);
> -               kfree(addrw);
> -       }
> -       spin_unlock_bh(&net->sctp.addr_wq_lock);
> -}
> -
> -static void sctp_free_addr_wq(struct net *net)
> -{
> -       struct sctp_sockaddr_entry *addrw;
> -       struct sctp_sockaddr_entry *temp;
> -
> -       spin_lock_bh(&net->sctp.addr_wq_lock);
> -       del_timer(&net->sctp.addr_wq_timer);
> -       list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq,
> list) {
> -               list_del(&addrw->list);
> -               kfree(addrw);
> +               sctp_free_addr_wq_entry(net, addrw);
>  	}
> -       spin_unlock_bh(&net->sctp.addr_wq_lock);
> +       rcu_read_unlock_bh();
>  }
>  

This code looks strange to me: You rcu_read_lock_bh and walk addr_waitq
list just to pass in pointers to the sctp_free_addr_wq_entry free
function, which then walks the list again just to compare the pointer?

>  /* lookup the entry for the same address in the addr_waitq
> - * sctp_addr_wq MUST be locked
> + * rcu read MUST be locked
>   */
>  static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct net *net,
>  					struct sctp_sockaddr_entry *addr)
>  {
>  	struct sctp_sockaddr_entry *addrw;
>  
> -       list_for_each_entry(addrw, &net->sctp.addr_waitq, list) {
> +       list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
>  		if (addrw->a.sa.sa_family != addr->a.sa.sa_family)
>  			continue;
>  		if (addrw->a.sa.sa_family == AF_INET) {
> @@ -702,7 +719,7 @@ void sctp_addr_wq_mgmt(struct net *net, struct
> sctp_sockaddr_entry *addr, int cm
>  	 * new address after a couple of addition and deletion of that address
>  	 */
>  
> -       spin_lock_bh(&net->sctp.addr_wq_lock);
> +       rcu_read_lock_bh();
>  	/* Offsets existing events in addr_wq */
>  	addrw = sctp_addr_wq_lookup(net, addr);
>  	if (addrw) {
> @@ -710,22 +727,21 @@ void sctp_addr_wq_mgmt(struct net *net, struct
> sctp_sockaddr_entry *addr, int cm
>  			pr_debug("%s: offsets existing entry for %d, addr:%pISc "
>  				 "in wq:%p\n", __func__, addrw->state, &addrw->a.sa,
>  				 &net->sctp.addr_waitq);
> -
> -                       list_del(&addrw->list);
> -                       kfree(addrw);
> +                       sctp_free_addr_wq_entry(net, addrw);
>  		}
> -               spin_unlock_bh(&net->sctp.addr_wq_lock);
> +               rcu_read_unlock_bh();
>  		return;
>  	}
> +       rcu_read_unlock_bh();
>  
>  	/* OK, we have to add the new address to the wait queue */
>  	addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
> -       if (addrw == NULL) {
> -               spin_unlock_bh(&net->sctp.addr_wq_lock);
> +       if (!addrw)
>  		return;
> -       }
>  	addrw->state = cmd;
> -       list_add_tail(&addrw->list, &net->sctp.addr_waitq);
> +
> +       spin_lock_bh(&net->sctp.addr_wq_lock);
> +       list_add_tail_rcu(&addrw->list, &net->sctp.addr_waitq);

list_rcu functions can in general run concurrently without spin_locks
taken, is this one necessary?

Bye,
Hannes

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 1/2] sctp: rcu-ify addr_waitq
  2015-06-08 14:46                         ` Hannes Frederic Sowa
@ 2015-06-08 14:59                           ` Hannes Frederic Sowa
  2015-06-08 15:19                             ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Hannes Frederic Sowa @ 2015-06-08 14:59 UTC (permalink / raw)
  To: mleitner, Neil Horman, netdev
  Cc: linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

On Mon, Jun 8, 2015, at 16:46, Hannes Frederic Sowa wrote:
> Hi Marcelo,
> 
> a few hints on rcuification, sorry I reviewed the code so late:
> 
> On Fri, Jun 5, 2015, at 19:08, mleitner@redhat.com wrote:
> > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > 
> > That's needed for the next patch, so we break the lock inversion between
> > netns_sctp->addr_wq_lock and socket lock on
> > sctp_addr_wq_timeout_handler(). With this, we can traverse addr_waitq
> > without taking addr_wq_lock, taking it just for the write operations.
> > 
> > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > ---
> > 
> > Notes:
> >     v2->v3:
> >       placed break statement on sctp_free_addr_wq_entry()
> >       removed unnecessary spin_lock noticed by Neil
> > 
> >  include/net/netns/sctp.h |  2 +-
> >  net/sctp/protocol.c      | 80
> >  +++++++++++++++++++++++++++++-------------------
> >  2 files changed, 49 insertions(+), 33 deletions(-)
> > 
> > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > index
> > 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e45777a6d95406d490dbaa75
> > 100644
> > --- a/include/net/netns/sctp.h
> > +++ b/include/net/netns/sctp.h
> > @@ -28,7 +28,7 @@ struct netns_sctp {
> >  	 * It is a list of sctp_sockaddr_entry.
> >  	 */
> >  	struct list_head local_addr_list;
> > -       struct list_head addr_waitq;
> > +       struct list_head __rcu addr_waitq;
> >  	struct timer_list addr_wq_timer;
> >  	struct list_head auto_asconf_splist;
> >  	spinlock_t addr_wq_lock;
> > diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> > index
> > 53b7acde9aa37bf3d4029c459421564d5270f4c0..9954fb8c9a9455d5ad7a627e2d7f9a1fef861fc2
> > 100644
> > --- a/net/sctp/protocol.c
> > +++ b/net/sctp/protocol.c
> > @@ -593,15 +593,47 @@ static void sctp_v4_ecn_capable(struct sock *sk)
> >  	INET_ECN_xmit(sk);
> >  }
> >  
> > +static void sctp_free_addr_wq(struct net *net)
> > +{
> > +       struct sctp_sockaddr_entry *addrw;
> > +
> > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> 
> Instead of holding spin_lock_bh you need to hold rcu_read_lock_bh, so
> kfree_rcu does not call free function at once (in theory ;) ).
> 
> > +       del_timer(&net->sctp.addr_wq_timer);
> > +       list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
> > +               list_del_rcu(&addrw->list);
> > +               kfree_rcu(addrw, rcu);
> > +       }
> > +       spin_unlock_bh(&net->sctp.addr_wq_lock);
> > +}
> > +
> > +/* As there is no refcnt on sctp_sockaddr_entry, we must check inside
> > + * the lock if it wasn't removed from addr_waitq already, otherwise we
> > + * could double-free it.
> > + */
> > +static void sctp_free_addr_wq_entry(struct net *net,
> > +                                   struct sctp_sockaddr_entry *addrw)
> > +{
> > +       struct sctp_sockaddr_entry *temp;
> > +
> > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> 
> I don't think this spin_lock operation is needed. The del_timer
> functions do synchronize themselves.
> 

Sorry, those above two locks are needed, they are not implied by other
locks.

Bye,
Hannes

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 1/2] sctp: rcu-ify addr_waitq
  2015-06-08 14:59                           ` Hannes Frederic Sowa
@ 2015-06-08 15:19                             ` Neil Horman
  2015-06-08 15:37                               ` Hannes Frederic Sowa
  0 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2015-06-08 15:19 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: mleitner, netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich,
	Michio Honda

On Mon, Jun 08, 2015 at 04:59:18PM +0200, Hannes Frederic Sowa wrote:
> On Mon, Jun 8, 2015, at 16:46, Hannes Frederic Sowa wrote:
> > Hi Marcelo,
> > 
> > a few hints on rcuification, sorry I reviewed the code so late:
> > 
> > On Fri, Jun 5, 2015, at 19:08, mleitner@redhat.com wrote:
> > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > 
> > > That's needed for the next patch, so we break the lock inversion between
> > > netns_sctp->addr_wq_lock and socket lock on
> > > sctp_addr_wq_timeout_handler(). With this, we can traverse addr_waitq
> > > without taking addr_wq_lock, taking it just for the write operations.
> > > 
> > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > ---
> > > 
> > > Notes:
> > >     v2->v3:
> > >       placed break statement on sctp_free_addr_wq_entry()
> > >       removed unnecessary spin_lock noticed by Neil
> > > 
> > >  include/net/netns/sctp.h |  2 +-
> > >  net/sctp/protocol.c      | 80
> > >  +++++++++++++++++++++++++++++-------------------
> > >  2 files changed, 49 insertions(+), 33 deletions(-)
> > > 
> > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > index
> > > 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e45777a6d95406d490dbaa75
> > > 100644
> > > --- a/include/net/netns/sctp.h
> > > +++ b/include/net/netns/sctp.h
> > > @@ -28,7 +28,7 @@ struct netns_sctp {
> > >  	 * It is a list of sctp_sockaddr_entry.
> > >  	 */
> > >  	struct list_head local_addr_list;
> > > -       struct list_head addr_waitq;
> > > +       struct list_head __rcu addr_waitq;
> > >  	struct timer_list addr_wq_timer;
> > >  	struct list_head auto_asconf_splist;
> > >  	spinlock_t addr_wq_lock;
> > > diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> > > index
> > > 53b7acde9aa37bf3d4029c459421564d5270f4c0..9954fb8c9a9455d5ad7a627e2d7f9a1fef861fc2
> > > 100644
> > > --- a/net/sctp/protocol.c
> > > +++ b/net/sctp/protocol.c
> > > @@ -593,15 +593,47 @@ static void sctp_v4_ecn_capable(struct sock *sk)
> > >  	INET_ECN_xmit(sk);
> > >  }
> > >  
> > > +static void sctp_free_addr_wq(struct net *net)
> > > +{
> > > +       struct sctp_sockaddr_entry *addrw;
> > > +
> > > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> > 
> > Instead of holding spin_lock_bh you need to hold rcu_read_lock_bh, so
> > kfree_rcu does not call free function at once (in theory ;) ).
> > 
> > > +       del_timer(&net->sctp.addr_wq_timer);
> > > +       list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, list) {
> > > +               list_del_rcu(&addrw->list);
> > > +               kfree_rcu(addrw, rcu);
> > > +       }
> > > +       spin_unlock_bh(&net->sctp.addr_wq_lock);
> > > +}
> > > +
> > > +/* As there is no refcnt on sctp_sockaddr_entry, we must check inside
> > > + * the lock if it wasn't removed from addr_waitq already, otherwise we
> > > + * could double-free it.
> > > + */
> > > +static void sctp_free_addr_wq_entry(struct net *net,
> > > +                                   struct sctp_sockaddr_entry *addrw)
> > > +{
> > > +       struct sctp_sockaddr_entry *temp;
> > > +
> > > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> > 
> > I don't think this spin_lock operation is needed. The del_timer
> > functions do synchronize themselves.
> > 
> 
> Sorry, those above two locks are needed, they are not implied by other
> locks.
> 
What makes you say that? Multiple contexts can issue mod_timer calls on the
same timer safely no, because of the internal locking?

Neil

> Bye,
> Hannes
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 1/2] sctp: rcu-ify addr_waitq
  2015-06-08 15:19                             ` Neil Horman
@ 2015-06-08 15:37                               ` Hannes Frederic Sowa
  2015-06-09 11:36                                 ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Hannes Frederic Sowa @ 2015-06-08 15:37 UTC (permalink / raw)
  To: Neil Horman
  Cc: mleitner, netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich,
	Michio Honda

On Mo, 2015-06-08 at 11:19 -0400, Neil Horman wrote:
> On Mon, Jun 08, 2015 at 04:59:18PM +0200, Hannes Frederic Sowa wrote:
> > On Mon, Jun 8, 2015, at 16:46, Hannes Frederic Sowa wrote:
> > > Hi Marcelo,
> > > 
> > > a few hints on rcuification, sorry I reviewed the code so late:
> > > 
> > > On Fri, Jun 5, 2015, at 19:08, mleitner@redhat.com wrote:
> > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > 
> > > > That's needed for the next patch, so we break the lock 
> > > > inversion between
> > > > netns_sctp->addr_wq_lock and socket lock on
> > > > sctp_addr_wq_timeout_handler(). With this, we can traverse 
> > > > addr_waitq
> > > > without taking addr_wq_lock, taking it just for the write 
> > > > operations.
> > > > 
> > > > Signed-off-by: Marcelo Ricardo Leitner <
> > > > marcelo.leitner@gmail.com>
> > > > ---
> > > > 
> > > > Notes:
> > > >     v2->v3:
> > > >       placed break statement on sctp_free_addr_wq_entry()
> > > >       removed unnecessary spin_lock noticed by Neil
> > > > 
> > > >  include/net/netns/sctp.h |  2 +-
> > > >  net/sctp/protocol.c      | 80
> > > >  +++++++++++++++++++++++++++++-------------------
> > > >  2 files changed, 49 insertions(+), 33 deletions(-)
> > > > 
> > > > diff --git a/include/net/netns/sctp.h 
> > > > b/include/net/netns/sctp.h
> > > > index
> > > > 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e4577
> > > > 7a6d95406d490dbaa75
> > > > 100644
> > > > --- a/include/net/netns/sctp.h
> > > > +++ b/include/net/netns/sctp.h
> > > > @@ -28,7 +28,7 @@ struct netns_sctp {
> > > >          * It is a list of sctp_sockaddr_entry.
> > > >          */
> > > >         struct list_head local_addr_list;
> > > > -       struct list_head addr_waitq;
> > > > +       struct list_head __rcu addr_waitq;
> > > >         struct timer_list addr_wq_timer;
> > > >         struct list_head auto_asconf_splist;
> > > >         spinlock_t addr_wq_lock;
> > > > diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> > > > index
> > > > 53b7acde9aa37bf3d4029c459421564d5270f4c0..9954fb8c9a9455d5ad7a6
> > > > 27e2d7f9a1fef861fc2
> > > > 100644
> > > > --- a/net/sctp/protocol.c
> > > > +++ b/net/sctp/protocol.c
> > > > @@ -593,15 +593,47 @@ static void sctp_v4_ecn_capable(struct 
> > > > sock *sk)
> > > >         INET_ECN_xmit(sk);
> > > >  }
> > > >  
> > > > +static void sctp_free_addr_wq(struct net *net)
> > > > +{
> > > > +       struct sctp_sockaddr_entry *addrw;
> > > > +
> > > > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> > > 
> > > Instead of holding spin_lock_bh you need to hold 
> > > rcu_read_lock_bh, so
> > > kfree_rcu does not call free function at once (in theory ;) ).
> > > 
> > > > +       del_timer(&net->sctp.addr_wq_timer);
> > > > +       list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, 
> > > > list) {
> > > > +               list_del_rcu(&addrw->list);
> > > > +               kfree_rcu(addrw, rcu);
> > > > +       }
> > > > +       spin_unlock_bh(&net->sctp.addr_wq_lock);
> > > > +}
> > > > +
> > > > +/* As there is no refcnt on sctp_sockaddr_entry, we must check 
> > > > inside
> > > > + * the lock if it wasn't removed from addr_waitq already, 
> > > > otherwise we
> > > > + * could double-free it.
> > > > + */
> > > > +static void sctp_free_addr_wq_entry(struct net *net,
> > > > +                                   struct sctp_sockaddr_entry 
> > > > *addrw)
> > > > +{
> > > > +       struct sctp_sockaddr_entry *temp;
> > > > +
> > > > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> > > 
> > > I don't think this spin_lock operation is needed. The del_timer
> > > functions do synchronize themselves.
> > > 
> > 
> > Sorry, those above two locks are needed, they are not implied by 
> > other
> > locks.
> > 
> What makes you say that? Multiple contexts can issue mod_timer calls 
> on the
> same timer safely no, because of the internal locking?

That's true for timer handling but not to protect net->sctp.addr_waitq
list (Marcelo just explained it to me off-list). Looking at the patch
only in patchworks lost quite a lot of context you were already
discussing. ;)

We are currently checking if the double iteration can be avoided by
splicing addr_waitq on the local stack while holding the spin_lock and
later on notifying the sockets.

Bye,
Hannes

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 1/2] sctp: rcu-ify addr_waitq
  2015-06-08 15:37                               ` Hannes Frederic Sowa
@ 2015-06-09 11:36                                 ` Neil Horman
  2015-06-09 19:32                                   ` Marcelo Ricardo Leitner
  0 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2015-06-09 11:36 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: mleitner, netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich,
	Michio Honda

On Mon, Jun 08, 2015 at 05:37:05PM +0200, Hannes Frederic Sowa wrote:
> On Mo, 2015-06-08 at 11:19 -0400, Neil Horman wrote:
> > On Mon, Jun 08, 2015 at 04:59:18PM +0200, Hannes Frederic Sowa wrote:
> > > On Mon, Jun 8, 2015, at 16:46, Hannes Frederic Sowa wrote:
> > > > Hi Marcelo,
> > > > 
> > > > a few hints on rcuification, sorry I reviewed the code so late:
> > > > 
> > > > On Fri, Jun 5, 2015, at 19:08, mleitner@redhat.com wrote:
> > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > 
> > > > > That's needed for the next patch, so we break the lock 
> > > > > inversion between
> > > > > netns_sctp->addr_wq_lock and socket lock on
> > > > > sctp_addr_wq_timeout_handler(). With this, we can traverse 
> > > > > addr_waitq
> > > > > without taking addr_wq_lock, taking it just for the write 
> > > > > operations.
> > > > > 
> > > > > Signed-off-by: Marcelo Ricardo Leitner <
> > > > > marcelo.leitner@gmail.com>
> > > > > ---
> > > > > 
> > > > > Notes:
> > > > >     v2->v3:
> > > > >       placed break statement on sctp_free_addr_wq_entry()
> > > > >       removed unnecessary spin_lock noticed by Neil
> > > > > 
> > > > >  include/net/netns/sctp.h |  2 +-
> > > > >  net/sctp/protocol.c      | 80
> > > > >  +++++++++++++++++++++++++++++-------------------
> > > > >  2 files changed, 49 insertions(+), 33 deletions(-)
> > > > > 
> > > > > diff --git a/include/net/netns/sctp.h 
> > > > > b/include/net/netns/sctp.h
> > > > > index
> > > > > 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e4577
> > > > > 7a6d95406d490dbaa75
> > > > > 100644
> > > > > --- a/include/net/netns/sctp.h
> > > > > +++ b/include/net/netns/sctp.h
> > > > > @@ -28,7 +28,7 @@ struct netns_sctp {
> > > > >          * It is a list of sctp_sockaddr_entry.
> > > > >          */
> > > > >         struct list_head local_addr_list;
> > > > > -       struct list_head addr_waitq;
> > > > > +       struct list_head __rcu addr_waitq;
> > > > >         struct timer_list addr_wq_timer;
> > > > >         struct list_head auto_asconf_splist;
> > > > >         spinlock_t addr_wq_lock;
> > > > > diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> > > > > index
> > > > > 53b7acde9aa37bf3d4029c459421564d5270f4c0..9954fb8c9a9455d5ad7a6
> > > > > 27e2d7f9a1fef861fc2
> > > > > 100644
> > > > > --- a/net/sctp/protocol.c
> > > > > +++ b/net/sctp/protocol.c
> > > > > @@ -593,15 +593,47 @@ static void sctp_v4_ecn_capable(struct 
> > > > > sock *sk)
> > > > >         INET_ECN_xmit(sk);
> > > > >  }
> > > > >  
> > > > > +static void sctp_free_addr_wq(struct net *net)
> > > > > +{
> > > > > +       struct sctp_sockaddr_entry *addrw;
> > > > > +
> > > > > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> > > > 
> > > > Instead of holding spin_lock_bh you need to hold 
> > > > rcu_read_lock_bh, so
> > > > kfree_rcu does not call free function at once (in theory ;) ).
> > > > 
> > > > > +       del_timer(&net->sctp.addr_wq_timer);
> > > > > +       list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, 
> > > > > list) {
> > > > > +               list_del_rcu(&addrw->list);
> > > > > +               kfree_rcu(addrw, rcu);
> > > > > +       }
> > > > > +       spin_unlock_bh(&net->sctp.addr_wq_lock);
> > > > > +}
> > > > > +
> > > > > +/* As there is no refcnt on sctp_sockaddr_entry, we must check 
> > > > > inside
> > > > > + * the lock if it wasn't removed from addr_waitq already, 
> > > > > otherwise we
> > > > > + * could double-free it.
> > > > > + */
> > > > > +static void sctp_free_addr_wq_entry(struct net *net,
> > > > > +                                   struct sctp_sockaddr_entry 
> > > > > *addrw)
> > > > > +{
> > > > > +       struct sctp_sockaddr_entry *temp;
> > > > > +
> > > > > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> > > > 
> > > > I don't think this spin_lock operation is needed. The del_timer
> > > > functions do synchronize themselves.
> > > > 
> > > 
> > > Sorry, those above two locks are needed, they are not implied by 
> > > other
> > > locks.
> > > 
> > What makes you say that? Multiple contexts can issue mod_timer calls 
> > on the
> > same timer safely no, because of the internal locking?
> 
> That's true for timer handling but not to protect net->sctp.addr_waitq
> list (Marcelo just explained it to me off-list). Looking at the patch
> only in patchworks lost quite a lot of context you were already
> discussing. ;)
> 
I can imagine :)

> We are currently checking if the double iteration can be avoided by
> splicing addr_waitq on the local stack while holding the spin_lock and
> later on notifying the sockets.
> 
As we discussed, this I think would make a good alternate approach.
Neil

> Bye,
> Hannes
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 1/2] sctp: rcu-ify addr_waitq
  2015-06-09 11:36                                 ` Neil Horman
@ 2015-06-09 19:32                                   ` Marcelo Ricardo Leitner
  2015-06-10 13:31                                     ` Marcelo Ricardo Leitner
  0 siblings, 1 reply; 34+ messages in thread
From: Marcelo Ricardo Leitner @ 2015-06-09 19:32 UTC (permalink / raw)
  To: Neil Horman
  Cc: Hannes Frederic Sowa, netdev, linux-sctp, Daniel Borkmann,
	Vlad Yasevich, Michio Honda

On Tue, Jun 09, 2015 at 07:36:38AM -0400, Neil Horman wrote:
> On Mon, Jun 08, 2015 at 05:37:05PM +0200, Hannes Frederic Sowa wrote:
> > On Mo, 2015-06-08 at 11:19 -0400, Neil Horman wrote:
> > > On Mon, Jun 08, 2015 at 04:59:18PM +0200, Hannes Frederic Sowa wrote:
> > > > On Mon, Jun 8, 2015, at 16:46, Hannes Frederic Sowa wrote:
> > > > > Hi Marcelo,
> > > > > 
> > > > > a few hints on rcuification, sorry I reviewed the code so late:
> > > > > 
> > > > > On Fri, Jun 5, 2015, at 19:08, mleitner@redhat.com wrote:
> > > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > 
> > > > > > That's needed for the next patch, so we break the lock 
> > > > > > inversion between
> > > > > > netns_sctp->addr_wq_lock and socket lock on
> > > > > > sctp_addr_wq_timeout_handler(). With this, we can traverse 
> > > > > > addr_waitq
> > > > > > without taking addr_wq_lock, taking it just for the write 
> > > > > > operations.
> > > > > > 
> > > > > > Signed-off-by: Marcelo Ricardo Leitner <
> > > > > > marcelo.leitner@gmail.com>
> > > > > > ---
> > > > > > 
> > > > > > Notes:
> > > > > >     v2->v3:
> > > > > >       placed break statement on sctp_free_addr_wq_entry()
> > > > > >       removed unnecessary spin_lock noticed by Neil
> > > > > > 
> > > > > >  include/net/netns/sctp.h |  2 +-
> > > > > >  net/sctp/protocol.c      | 80
> > > > > >  +++++++++++++++++++++++++++++-------------------
> > > > > >  2 files changed, 49 insertions(+), 33 deletions(-)
> > > > > > 
> > > > > > diff --git a/include/net/netns/sctp.h 
> > > > > > b/include/net/netns/sctp.h
> > > > > > index
> > > > > > 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e4577
> > > > > > 7a6d95406d490dbaa75
> > > > > > 100644
> > > > > > --- a/include/net/netns/sctp.h
> > > > > > +++ b/include/net/netns/sctp.h
> > > > > > @@ -28,7 +28,7 @@ struct netns_sctp {
> > > > > >          * It is a list of sctp_sockaddr_entry.
> > > > > >          */
> > > > > >         struct list_head local_addr_list;
> > > > > > -       struct list_head addr_waitq;
> > > > > > +       struct list_head __rcu addr_waitq;
> > > > > >         struct timer_list addr_wq_timer;
> > > > > >         struct list_head auto_asconf_splist;
> > > > > >         spinlock_t addr_wq_lock;
> > > > > > diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> > > > > > index
> > > > > > 53b7acde9aa37bf3d4029c459421564d5270f4c0..9954fb8c9a9455d5ad7a6
> > > > > > 27e2d7f9a1fef861fc2
> > > > > > 100644
> > > > > > --- a/net/sctp/protocol.c
> > > > > > +++ b/net/sctp/protocol.c
> > > > > > @@ -593,15 +593,47 @@ static void sctp_v4_ecn_capable(struct 
> > > > > > sock *sk)
> > > > > >         INET_ECN_xmit(sk);
> > > > > >  }
> > > > > >  
> > > > > > +static void sctp_free_addr_wq(struct net *net)
> > > > > > +{
> > > > > > +       struct sctp_sockaddr_entry *addrw;
> > > > > > +
> > > > > > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> > > > > 
> > > > > Instead of holding spin_lock_bh you need to hold 
> > > > > rcu_read_lock_bh, so
> > > > > kfree_rcu does not call free function at once (in theory ;) ).
> > > > > 
> > > > > > +       del_timer(&net->sctp.addr_wq_timer);
> > > > > > +       list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, 
> > > > > > list) {
> > > > > > +               list_del_rcu(&addrw->list);
> > > > > > +               kfree_rcu(addrw, rcu);
> > > > > > +       }
> > > > > > +       spin_unlock_bh(&net->sctp.addr_wq_lock);
> > > > > > +}
> > > > > > +
> > > > > > +/* As there is no refcnt on sctp_sockaddr_entry, we must check 
> > > > > > inside
> > > > > > + * the lock if it wasn't removed from addr_waitq already, 
> > > > > > otherwise we
> > > > > > + * could double-free it.
> > > > > > + */
> > > > > > +static void sctp_free_addr_wq_entry(struct net *net,
> > > > > > +                                   struct sctp_sockaddr_entry 
> > > > > > *addrw)
> > > > > > +{
> > > > > > +       struct sctp_sockaddr_entry *temp;
> > > > > > +
> > > > > > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> > > > > 
> > > > > I don't think this spin_lock operation is needed. The del_timer
> > > > > functions do synchronize themselves.
> > > > > 
> > > > 
> > > > Sorry, those above two locks are needed, they are not implied by 
> > > > other
> > > > locks.
> > > > 
> > > What makes you say that? Multiple contexts can issue mod_timer calls 
> > > on the
> > > same timer safely no, because of the internal locking?
> > 
> > That's true for timer handling but not to protect net->sctp.addr_waitq
> > list (Marcelo just explained it to me off-list). Looking at the patch
> > only in patchworks lost quite a lot of context you were already
> > discussing. ;)
> > 
> I can imagine :)
> 
> > We are currently checking if the double iteration can be avoided by
> > splicing addr_waitq on the local stack while holding the spin_lock and
> > later on notifying the sockets.
> > 
> As we discussed, this I think would make a good alternate approach.

I was experimenting on this but this would introduce another complex
logic instead, as not all elements are pruned from net->sctp.addr_waitq
at sctp_addr_wq_timeout_handler(), mainly ipv6 addresses in DAD state
(which I believe that break statement is misplaced and should be a
continue instead, I'll check on this later)

That means we would have to do the splice, process the loop, merge the
remaining elements with the new net->sctp.addr_waitq that was possibly
was built meanwhile and then squash oppositve events (logic currently in
sctp_addr_wq_mgmt() ), otherwise we could be issuing spurious events.

But it will probably do more harm than good as the double search will
usually hit the first list element on this 2nd search, unless the
element we are trying to remove was already removed from it (which is
rare, it's when user add and remove addresses too fast) or some other
address was skipped (DAD addresses).

  Marcelo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 1/2] sctp: rcu-ify addr_waitq
  2015-06-09 19:32                                   ` Marcelo Ricardo Leitner
@ 2015-06-10 13:31                                     ` Marcelo Ricardo Leitner
  2015-06-10 19:14                                       ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Marcelo Ricardo Leitner @ 2015-06-10 13:31 UTC (permalink / raw)
  To: Neil Horman
  Cc: Hannes Frederic Sowa, netdev, linux-sctp, Daniel Borkmann,
	Vlad Yasevich, Michio Honda

On Tue, Jun 09, 2015 at 04:32:59PM -0300, Marcelo Ricardo Leitner wrote:
> On Tue, Jun 09, 2015 at 07:36:38AM -0400, Neil Horman wrote:
> > On Mon, Jun 08, 2015 at 05:37:05PM +0200, Hannes Frederic Sowa wrote:
> > > On Mo, 2015-06-08 at 11:19 -0400, Neil Horman wrote:
> > > > On Mon, Jun 08, 2015 at 04:59:18PM +0200, Hannes Frederic Sowa wrote:
> > > > > On Mon, Jun 8, 2015, at 16:46, Hannes Frederic Sowa wrote:
> > > > > > Hi Marcelo,
> > > > > > 
> > > > > > a few hints on rcuification, sorry I reviewed the code so late:
> > > > > > 
> > > > > > On Fri, Jun 5, 2015, at 19:08, mleitner@redhat.com wrote:
> > > > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > > 
> > > > > > > That's needed for the next patch, so we break the lock 
> > > > > > > inversion between
> > > > > > > netns_sctp->addr_wq_lock and socket lock on
> > > > > > > sctp_addr_wq_timeout_handler(). With this, we can traverse 
> > > > > > > addr_waitq
> > > > > > > without taking addr_wq_lock, taking it just for the write 
> > > > > > > operations.
> > > > > > > 
> > > > > > > Signed-off-by: Marcelo Ricardo Leitner <
> > > > > > > marcelo.leitner@gmail.com>
> > > > > > > ---
> > > > > > > 
> > > > > > > Notes:
> > > > > > >     v2->v3:
> > > > > > >       placed break statement on sctp_free_addr_wq_entry()
> > > > > > >       removed unnecessary spin_lock noticed by Neil
> > > > > > > 
> > > > > > >  include/net/netns/sctp.h |  2 +-
> > > > > > >  net/sctp/protocol.c      | 80
> > > > > > >  +++++++++++++++++++++++++++++-------------------
> > > > > > >  2 files changed, 49 insertions(+), 33 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/include/net/netns/sctp.h 
> > > > > > > b/include/net/netns/sctp.h
> > > > > > > index
> > > > > > > 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e4577
> > > > > > > 7a6d95406d490dbaa75
> > > > > > > 100644
> > > > > > > --- a/include/net/netns/sctp.h
> > > > > > > +++ b/include/net/netns/sctp.h
> > > > > > > @@ -28,7 +28,7 @@ struct netns_sctp {
> > > > > > >          * It is a list of sctp_sockaddr_entry.
> > > > > > >          */
> > > > > > >         struct list_head local_addr_list;
> > > > > > > -       struct list_head addr_waitq;
> > > > > > > +       struct list_head __rcu addr_waitq;
> > > > > > >         struct timer_list addr_wq_timer;
> > > > > > >         struct list_head auto_asconf_splist;
> > > > > > >         spinlock_t addr_wq_lock;
> > > > > > > diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> > > > > > > index
> > > > > > > 53b7acde9aa37bf3d4029c459421564d5270f4c0..9954fb8c9a9455d5ad7a6
> > > > > > > 27e2d7f9a1fef861fc2
> > > > > > > 100644
> > > > > > > --- a/net/sctp/protocol.c
> > > > > > > +++ b/net/sctp/protocol.c
> > > > > > > @@ -593,15 +593,47 @@ static void sctp_v4_ecn_capable(struct 
> > > > > > > sock *sk)
> > > > > > >         INET_ECN_xmit(sk);
> > > > > > >  }
> > > > > > >  
> > > > > > > +static void sctp_free_addr_wq(struct net *net)
> > > > > > > +{
> > > > > > > +       struct sctp_sockaddr_entry *addrw;
> > > > > > > +
> > > > > > > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> > > > > > 
> > > > > > Instead of holding spin_lock_bh you need to hold 
> > > > > > rcu_read_lock_bh, so
> > > > > > kfree_rcu does not call free function at once (in theory ;) ).
> > > > > > 
> > > > > > > +       del_timer(&net->sctp.addr_wq_timer);
> > > > > > > +       list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, 
> > > > > > > list) {
> > > > > > > +               list_del_rcu(&addrw->list);
> > > > > > > +               kfree_rcu(addrw, rcu);
> > > > > > > +       }
> > > > > > > +       spin_unlock_bh(&net->sctp.addr_wq_lock);
> > > > > > > +}
> > > > > > > +
> > > > > > > +/* As there is no refcnt on sctp_sockaddr_entry, we must check 
> > > > > > > inside
> > > > > > > + * the lock if it wasn't removed from addr_waitq already, 
> > > > > > > otherwise we
> > > > > > > + * could double-free it.
> > > > > > > + */
> > > > > > > +static void sctp_free_addr_wq_entry(struct net *net,
> > > > > > > +                                   struct sctp_sockaddr_entry 
> > > > > > > *addrw)
> > > > > > > +{
> > > > > > > +       struct sctp_sockaddr_entry *temp;
> > > > > > > +
> > > > > > > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> > > > > > 
> > > > > > I don't think this spin_lock operation is needed. The del_timer
> > > > > > functions do synchronize themselves.
> > > > > > 
> > > > > 
> > > > > Sorry, those above two locks are needed, they are not implied by 
> > > > > other
> > > > > locks.
> > > > > 
> > > > What makes you say that? Multiple contexts can issue mod_timer calls 
> > > > on the
> > > > same timer safely no, because of the internal locking?
> > > 
> > > That's true for timer handling but not to protect net->sctp.addr_waitq
> > > list (Marcelo just explained it to me off-list). Looking at the patch
> > > only in patchworks lost quite a lot of context you were already
> > > discussing. ;)
> > > 
> > I can imagine :)
> > 
> > > We are currently checking if the double iteration can be avoided by
> > > splicing addr_waitq on the local stack while holding the spin_lock and
> > > later on notifying the sockets.
> > > 
> > As we discussed, this I think would make a good alternate approach.
> 
> I was experimenting on this but this would introduce another complex
> logic instead, as not all elements are pruned from net->sctp.addr_waitq
> at sctp_addr_wq_timeout_handler(), mainly ipv6 addresses in DAD state
> (which I believe that break statement is misplaced and should be a
> continue instead, I'll check on this later)
> 
> That means we would have to do the splice, process the loop, merge the
> remaining elements with the new net->sctp.addr_waitq that was possibly
> was built meanwhile and then squash oppositve events (logic currently in
> sctp_addr_wq_mgmt() ), otherwise we could be issuing spurious events.
> 
> But it will probably do more harm than good as the double search will
> usually hit the first list element on this 2nd search, unless the
> element we are trying to remove was already removed from it (which is
> rare, it's when user add and remove addresses too fast) or some other
> address was skipped (DAD addresses).

Better thinking.. actually it may be the way to go. If we rcu-cify
addr_waitq like that and if the user manage to add an address and remove
it while the timeout handler is running, the system may emit just the
address add and not the remove, while if we splice the list, this won't
happen. 

  Marcelo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 1/2] sctp: rcu-ify addr_waitq
  2015-06-10 13:31                                     ` Marcelo Ricardo Leitner
@ 2015-06-10 19:14                                       ` Neil Horman
  2015-06-11 14:30                                         ` [PATCH v4] sctp: fix ASCONF list handling mleitner
  0 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2015-06-10 19:14 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: Hannes Frederic Sowa, netdev, linux-sctp, Daniel Borkmann,
	Vlad Yasevich, Michio Honda

On Wed, Jun 10, 2015 at 10:31:42AM -0300, Marcelo Ricardo Leitner wrote:
> On Tue, Jun 09, 2015 at 04:32:59PM -0300, Marcelo Ricardo Leitner wrote:
> > On Tue, Jun 09, 2015 at 07:36:38AM -0400, Neil Horman wrote:
> > > On Mon, Jun 08, 2015 at 05:37:05PM +0200, Hannes Frederic Sowa wrote:
> > > > On Mo, 2015-06-08 at 11:19 -0400, Neil Horman wrote:
> > > > > On Mon, Jun 08, 2015 at 04:59:18PM +0200, Hannes Frederic Sowa wrote:
> > > > > > On Mon, Jun 8, 2015, at 16:46, Hannes Frederic Sowa wrote:
> > > > > > > Hi Marcelo,
> > > > > > > 
> > > > > > > a few hints on rcuification, sorry I reviewed the code so late:
> > > > > > > 
> > > > > > > On Fri, Jun 5, 2015, at 19:08, mleitner@redhat.com wrote:
> > > > > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > > > 
> > > > > > > > That's needed for the next patch, so we break the lock 
> > > > > > > > inversion between
> > > > > > > > netns_sctp->addr_wq_lock and socket lock on
> > > > > > > > sctp_addr_wq_timeout_handler(). With this, we can traverse 
> > > > > > > > addr_waitq
> > > > > > > > without taking addr_wq_lock, taking it just for the write 
> > > > > > > > operations.
> > > > > > > > 
> > > > > > > > Signed-off-by: Marcelo Ricardo Leitner <
> > > > > > > > marcelo.leitner@gmail.com>
> > > > > > > > ---
> > > > > > > > 
> > > > > > > > Notes:
> > > > > > > >     v2->v3:
> > > > > > > >       placed break statement on sctp_free_addr_wq_entry()
> > > > > > > >       removed unnecessary spin_lock noticed by Neil
> > > > > > > > 
> > > > > > > >  include/net/netns/sctp.h |  2 +-
> > > > > > > >  net/sctp/protocol.c      | 80
> > > > > > > >  +++++++++++++++++++++++++++++-------------------
> > > > > > > >  2 files changed, 49 insertions(+), 33 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/include/net/netns/sctp.h 
> > > > > > > > b/include/net/netns/sctp.h
> > > > > > > > index
> > > > > > > > 3573a81815ad9e0efb6ceb721eb066d3726419f0..9e53412c4ed829e8e4577
> > > > > > > > 7a6d95406d490dbaa75
> > > > > > > > 100644
> > > > > > > > --- a/include/net/netns/sctp.h
> > > > > > > > +++ b/include/net/netns/sctp.h
> > > > > > > > @@ -28,7 +28,7 @@ struct netns_sctp {
> > > > > > > >          * It is a list of sctp_sockaddr_entry.
> > > > > > > >          */
> > > > > > > >         struct list_head local_addr_list;
> > > > > > > > -       struct list_head addr_waitq;
> > > > > > > > +       struct list_head __rcu addr_waitq;
> > > > > > > >         struct timer_list addr_wq_timer;
> > > > > > > >         struct list_head auto_asconf_splist;
> > > > > > > >         spinlock_t addr_wq_lock;
> > > > > > > > diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> > > > > > > > index
> > > > > > > > 53b7acde9aa37bf3d4029c459421564d5270f4c0..9954fb8c9a9455d5ad7a6
> > > > > > > > 27e2d7f9a1fef861fc2
> > > > > > > > 100644
> > > > > > > > --- a/net/sctp/protocol.c
> > > > > > > > +++ b/net/sctp/protocol.c
> > > > > > > > @@ -593,15 +593,47 @@ static void sctp_v4_ecn_capable(struct 
> > > > > > > > sock *sk)
> > > > > > > >         INET_ECN_xmit(sk);
> > > > > > > >  }
> > > > > > > >  
> > > > > > > > +static void sctp_free_addr_wq(struct net *net)
> > > > > > > > +{
> > > > > > > > +       struct sctp_sockaddr_entry *addrw;
> > > > > > > > +
> > > > > > > > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> > > > > > > 
> > > > > > > Instead of holding spin_lock_bh you need to hold 
> > > > > > > rcu_read_lock_bh, so
> > > > > > > kfree_rcu does not call free function at once (in theory ;) ).
> > > > > > > 
> > > > > > > > +       del_timer(&net->sctp.addr_wq_timer);
> > > > > > > > +       list_for_each_entry_rcu(addrw, &net->sctp.addr_waitq, 
> > > > > > > > list) {
> > > > > > > > +               list_del_rcu(&addrw->list);
> > > > > > > > +               kfree_rcu(addrw, rcu);
> > > > > > > > +       }
> > > > > > > > +       spin_unlock_bh(&net->sctp.addr_wq_lock);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +/* As there is no refcnt on sctp_sockaddr_entry, we must check 
> > > > > > > > inside
> > > > > > > > + * the lock if it wasn't removed from addr_waitq already, 
> > > > > > > > otherwise we
> > > > > > > > + * could double-free it.
> > > > > > > > + */
> > > > > > > > +static void sctp_free_addr_wq_entry(struct net *net,
> > > > > > > > +                                   struct sctp_sockaddr_entry 
> > > > > > > > *addrw)
> > > > > > > > +{
> > > > > > > > +       struct sctp_sockaddr_entry *temp;
> > > > > > > > +
> > > > > > > > +       spin_lock_bh(&net->sctp.addr_wq_lock);
> > > > > > > 
> > > > > > > I don't think this spin_lock operation is needed. The del_timer
> > > > > > > functions do synchronize themselves.
> > > > > > > 
> > > > > > 
> > > > > > Sorry, those above two locks are needed, they are not implied by 
> > > > > > other
> > > > > > locks.
> > > > > > 
> > > > > What makes you say that? Multiple contexts can issue mod_timer calls 
> > > > > on the
> > > > > same timer safely no, because of the internal locking?
> > > > 
> > > > That's true for timer handling but not to protect net->sctp.addr_waitq
> > > > list (Marcelo just explained it to me off-list). Looking at the patch
> > > > only in patchworks lost quite a lot of context you were already
> > > > discussing. ;)
> > > > 
> > > I can imagine :)
> > > 
> > > > We are currently checking if the double iteration can be avoided by
> > > > splicing addr_waitq on the local stack while holding the spin_lock and
> > > > later on notifying the sockets.
> > > > 
> > > As we discussed, this I think would make a good alternate approach.
> > 
> > I was experimenting on this but this would introduce another complex
> > logic instead, as not all elements are pruned from net->sctp.addr_waitq
> > at sctp_addr_wq_timeout_handler(), mainly ipv6 addresses in DAD state
> > (which I believe that break statement is misplaced and should be a
> > continue instead, I'll check on this later)
> > 
> > That means we would have to do the splice, process the loop, merge the
> > remaining elements with the new net->sctp.addr_waitq that was possibly
> > was built meanwhile and then squash oppositve events (logic currently in
> > sctp_addr_wq_mgmt() ), otherwise we could be issuing spurious events.
> > 
> > But it will probably do more harm than good as the double search will
> > usually hit the first list element on this 2nd search, unless the
> > element we are trying to remove was already removed from it (which is
> > rare, it's when user add and remove addresses too fast) or some other
> > address was skipped (DAD addresses).
> 
> Better thinking.. actually it may be the way to go. If we rcu-cify
> addr_waitq like that and if the user manage to add an address and remove
> it while the timeout handler is running, the system may emit just the
> address add and not the remove, while if we splice the list, this won't
> happen. 
> 
>   Marcelo
> 
> 
Thats a good point.  Seems like a list splice makes more sense here.
Neil

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v4] sctp: fix ASCONF list handling
  2015-06-10 19:14                                       ` Neil Horman
@ 2015-06-11 14:30                                         ` mleitner
  2015-06-11 23:31                                           ` David Miller
  0 siblings, 1 reply; 34+ messages in thread
From: mleitner @ 2015-06-11 14:30 UTC (permalink / raw)
  To: Neil Horman, Hannes Frederic Sowa, netdev
  Cc: linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

->auto_asconf_splist is per namespace and mangled by functions like
sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.

Also, the call to inet_sk_copy_descendant() was backuping
->auto_asconf_list through the copy but was not honoring
->do_auto_asconf, which could lead to list corruption if it was
different between both sockets.

This commit thus fixes the list handling by using ->addr_wq_lock
spinlock to protect the list. A special handling is done upon socket
creation and destruction for that. Error handlig on sctp_init_sock()
will never return an error after having initialized asconf, so
sctp_destroy_sock() can be called without addrq_wq_lock. The lock now
will be take on sctp_close_sock(), before locking the socket, so we
don't do it in inverse order compared to sctp_addr_wq_timeout_handler().

Instead of taking the lock on sctp_sock_migrate() for copying and
restoring the list values, it's preferred to avoid rewritting it by
implementing sctp_copy_descendant().

Issue was found with a test application that kept flipping sysctl
default_auto_asconf on and off, but one could trigger it by issuing
simultaneous setsockopt() calls on multiple sockets or by
creating/destroying sockets fast enough. This is only triggerable
locally.

Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
Reported-by: Ji Jianwen <jiji@redhat.com>
Suggested-by: Neil Horman <nhorman@tuxdriver.com>
Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---

Notes:
    Attempts to circumvent this lock invertion with RCU and/or list splicing
    were unsuccessful, as they led to more and more code to handle it
    properly.
    
    Back when Hannes started reviewing the patches, he had asked if I
    couldn't take the lock earlier during the socket destruction. I had said
    no because sctp_destroy_sock() is called with socket lock already held
    on sctp_close_sock() and such would not be possible to handle on error
    handling situations like when sctp_init_sock() fails and
    sctp_destroy_sock() is called right after that.
    
    But if we take care that nothing fails after initializing asconf on
    sctp_init_sock(), this is possible, and less complicated than my RCU and
    list splicing attempts.

 include/net/netns/sctp.h   |  1 +
 include/net/sctp/structs.h |  4 ++++
 net/sctp/socket.c          | 38 ++++++++++++++++++++++++++++++--------
 3 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 3573a81815ad9e0efb6ceb721eb066d3726419f0..8ba379f9e4678d7f00209a6b2ac12d41d82f4b25 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -31,6 +31,7 @@ struct netns_sctp {
 	struct list_head addr_waitq;
 	struct timer_list addr_wq_timer;
 	struct list_head auto_asconf_splist;
+	/* Lock that protects both addr_waitq and auto_asconf_splist */
 	spinlock_t addr_wq_lock;
 
 	/* Lock that protects the local_addr_list writers */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2bb2fcf5b11f0387c81b860ad2d3a6607da19a7d..495c87e367b3f2e8941807f56a77d2e14469bfed 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -223,6 +223,10 @@ struct sctp_sock {
 	atomic_t pd_mode;
 	/* Receive to here while partial delivery is in effect. */
 	struct sk_buff_head pd_lobby;
+
+	/* These must be the last fields, as they will skipped on copies,
+	 * like on accept and peeloff operations
+	 */
 	struct list_head auto_asconf_list;
 	int do_auto_asconf;
 };
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f09de7fac2e6acddad8b2e046dbf626e329cb674..9af02e777944552f3035ce499a929766119c0e9f 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1528,7 +1528,10 @@ static void sctp_close(struct sock *sk, long timeout)
 
 	/* Supposedly, no process has access to the socket, but
 	 * the net layers still may.
+	 * Also, sctp_destroy_sock() needs to be called with addr_wq_lock
+	 * held and that should be grabbed before socket lock.
 	 */
+	spin_lock_bh(&net->sctp.addr_wq_lock);
 	local_bh_disable();
 	bh_lock_sock(sk);
 
@@ -1540,6 +1543,7 @@ static void sctp_close(struct sock *sk, long timeout)
 
 	bh_unlock_sock(sk);
 	local_bh_enable();
+	spin_unlock_bh(&net->sctp.addr_wq_lock);
 
 	sock_put(sk);
 
@@ -3580,6 +3584,7 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 	if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf))
 		return 0;
 
+	spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	if (val == 0 && sp->do_auto_asconf) {
 		list_del(&sp->auto_asconf_list);
 		sp->do_auto_asconf = 0;
@@ -3588,6 +3593,7 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 		    &sock_net(sk)->sctp.auto_asconf_splist);
 		sp->do_auto_asconf = 1;
 	}
+	spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	return 0;
 }
 
@@ -4121,18 +4127,27 @@ static int sctp_init_sock(struct sock *sk)
 	local_bh_disable();
 	percpu_counter_inc(&sctp_sockets_allocated);
 	sock_prot_inuse_add(net, sk->sk_prot, 1);
+
+	/* Nothing can fail after this block, otherwise
+	 * sctp_destroy_sock() will be called without addr_wq_lock held
+	 */
 	if (net->sctp.default_auto_asconf) {
+		spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 		list_add_tail(&sp->auto_asconf_list,
 		    &net->sctp.auto_asconf_splist);
 		sp->do_auto_asconf = 1;
+		spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	} else
 		sp->do_auto_asconf = 0;
+
 	local_bh_enable();
 
 	return 0;
 }
 
-/* Cleanup any SCTP per socket resources.  */
+/* Cleanup any SCTP per socket resources. Must be called with
+ * sock_net(sk)->sctp.addr_wq_lock held if sp->do_auto_asconf is true
+ */
 static void sctp_destroy_sock(struct sock *sk)
 {
 	struct sctp_sock *sp;
@@ -7195,6 +7210,19 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newinet->mc_list = NULL;
 }
 
+static inline void sctp_copy_descendant(struct sock *sk_to,
+					const struct sock *sk_from)
+{
+	int ancestor_size = sizeof(struct inet_sock) +
+			    sizeof(struct sctp_sock) -
+			    offsetof(struct sctp_sock, auto_asconf_list);
+
+	if (sk_from->sk_family == PF_INET6)
+		ancestor_size += sizeof(struct ipv6_pinfo);
+
+	__inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
+}
+
 /* Populate the fields of the newsk from the oldsk and migrate the assoc
  * and its messages to the newsk.
  */
@@ -7209,7 +7237,6 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	struct sk_buff *skb, *tmp;
 	struct sctp_ulpevent *event;
 	struct sctp_bind_hashbucket *head;
-	struct list_head tmplist;
 
 	/* Migrate socket buffer sizes and all the socket level options to the
 	 * new socket.
@@ -7217,12 +7244,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	newsk->sk_sndbuf = oldsk->sk_sndbuf;
 	newsk->sk_rcvbuf = oldsk->sk_rcvbuf;
 	/* Brute force copy old sctp opt. */
-	if (oldsp->do_auto_asconf) {
-		memcpy(&tmplist, &newsp->auto_asconf_list, sizeof(tmplist));
-		inet_sk_copy_descendant(newsk, oldsk);
-		memcpy(&newsp->auto_asconf_list, &tmplist, sizeof(tmplist));
-	} else
-		inet_sk_copy_descendant(newsk, oldsk);
+	sctp_copy_descendant(newsk, oldsk);
 
 	/* Restore the ep value that was overwritten with the above structure
 	 * copy.
-- 
2.4.1

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v4] sctp: fix ASCONF list handling
  2015-06-11 14:30                                         ` [PATCH v4] sctp: fix ASCONF list handling mleitner
@ 2015-06-11 23:31                                           ` David Miller
  2015-06-12 13:16                                             ` [PATCH v5] " marcelo.leitner
  0 siblings, 1 reply; 34+ messages in thread
From: David Miller @ 2015-06-11 23:31 UTC (permalink / raw)
  To: mleitner; +Cc: nhorman, hannes, netdev, linux-sctp, daniel, vyasevich, micchie

From: mleitner@redhat.com
Date: Thu, 11 Jun 2015 11:30:46 -0300

>     Attempts to circumvent this lock invertion with RCU and/or list splicing
>     were unsuccessful, as they led to more and more code to handle it
>     properly.
>     
>     Back when Hannes started reviewing the patches, he had asked if I
>     couldn't take the lock earlier during the socket destruction. I had said
>     no because sctp_destroy_sock() is called with socket lock already held
>     on sctp_close_sock() and such would not be possible to handle on error
>     handling situations like when sctp_init_sock() fails and
>     sctp_destroy_sock() is called right after that.
>     
>     But if we take care that nothing fails after initializing asconf on
>     sctp_init_sock(), this is possible, and less complicated than my RCU and
>     list splicing attempts.

This is definitely a cleaner/simpler fix, but:

> @@ -1528,7 +1528,10 @@ static void sctp_close(struct sock *sk, long timeout)
>  
>  	/* Supposedly, no process has access to the socket, but
>  	 * the net layers still may.
> +	 * Also, sctp_destroy_sock() needs to be called with addr_wq_lock
> +	 * held and that should be grabbed before socket lock.
>  	 */
> +	spin_lock_bh(&net->sctp.addr_wq_lock);
>  	local_bh_disable();
>  	bh_lock_sock(sk);
>  
> @@ -1540,6 +1543,7 @@ static void sctp_close(struct sock *sk, long timeout)
>  
>  	bh_unlock_sock(sk);
>  	local_bh_enable();
> +	spin_unlock_bh(&net->sctp.addr_wq_lock);
>  
>  	sock_put(sk);
>  

The local_bh_{enable,disable}() now appear to be superfluous and thus
can be removed.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v5] sctp: fix ASCONF list handling
  2015-06-11 23:31                                           ` David Miller
@ 2015-06-12 13:16                                             ` marcelo.leitner
  2015-06-14 19:56                                               ` David Miller
  0 siblings, 1 reply; 34+ messages in thread
From: marcelo.leitner @ 2015-06-12 13:16 UTC (permalink / raw)
  To: Neil Horman, Hannes Frederic Sowa, netdev
  Cc: linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

->auto_asconf_splist is per namespace and mangled by functions like
sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.

Also, the call to inet_sk_copy_descendant() was backuping
->auto_asconf_list through the copy but was not honoring
->do_auto_asconf, which could lead to list corruption if it was
different between both sockets.

This commit thus fixes the list handling by using ->addr_wq_lock
spinlock to protect the list. A special handling is done upon socket
creation and destruction for that. Error handlig on sctp_init_sock()
will never return an error after having initialized asconf, so
sctp_destroy_sock() can be called without addrq_wq_lock. The lock now
will be take on sctp_close_sock(), before locking the socket, so we
don't do it in inverse order compared to sctp_addr_wq_timeout_handler().

Instead of taking the lock on sctp_sock_migrate() for copying and
restoring the list values, it's preferred to avoid rewritting it by
implementing sctp_copy_descendant().

Issue was found with a test application that kept flipping sysctl
default_auto_asconf on and off, but one could trigger it by issuing
simultaneous setsockopt() calls on multiple sockets or by
creating/destroying sockets fast enough. This is only triggerable
locally.

Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
Reported-by: Ji Jianwen <jiji@redhat.com>
Suggested-by: Neil Horman <nhorman@tuxdriver.com>
Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---

Notes:
    From David:
    - Removed superfluous calls to local_bh_{enable,disable}()
    
    From Hannes:
    - In the same line, removed _bh() from calls on sctp_init_sock()
    - Took the shot and fixed/added { } for else block in same region
    
    Thanks!

 include/net/netns/sctp.h   |  1 +
 include/net/sctp/structs.h |  4 ++++
 net/sctp/socket.c          | 43 ++++++++++++++++++++++++++++++++-----------
 3 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 3573a81815ad9e0efb6ceb721eb066d3726419f0..8ba379f9e4678d7f00209a6b2ac12d41d82f4b25 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -31,6 +31,7 @@ struct netns_sctp {
 	struct list_head addr_waitq;
 	struct timer_list addr_wq_timer;
 	struct list_head auto_asconf_splist;
+	/* Lock that protects both addr_waitq and auto_asconf_splist */
 	spinlock_t addr_wq_lock;
 
 	/* Lock that protects the local_addr_list writers */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2bb2fcf5b11f0387c81b860ad2d3a6607da19a7d..495c87e367b3f2e8941807f56a77d2e14469bfed 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -223,6 +223,10 @@ struct sctp_sock {
 	atomic_t pd_mode;
 	/* Receive to here while partial delivery is in effect. */
 	struct sk_buff_head pd_lobby;
+
+	/* These must be the last fields, as they will skipped on copies,
+	 * like on accept and peeloff operations
+	 */
 	struct list_head auto_asconf_list;
 	int do_auto_asconf;
 };
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f09de7fac2e6acddad8b2e046dbf626e329cb674..5f6c4e61325b65822be525d75ebe3bb7357b97e7 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1528,8 +1528,10 @@ static void sctp_close(struct sock *sk, long timeout)
 
 	/* Supposedly, no process has access to the socket, but
 	 * the net layers still may.
+	 * Also, sctp_destroy_sock() needs to be called with addr_wq_lock
+	 * held and that should be grabbed before socket lock.
 	 */
-	local_bh_disable();
+	spin_lock_bh(&net->sctp.addr_wq_lock);
 	bh_lock_sock(sk);
 
 	/* Hold the sock, since sk_common_release() will put sock_put()
@@ -1539,7 +1541,7 @@ static void sctp_close(struct sock *sk, long timeout)
 	sk_common_release(sk);
 
 	bh_unlock_sock(sk);
-	local_bh_enable();
+	spin_unlock_bh(&net->sctp.addr_wq_lock);
 
 	sock_put(sk);
 
@@ -3580,6 +3582,7 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 	if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf))
 		return 0;
 
+	spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	if (val == 0 && sp->do_auto_asconf) {
 		list_del(&sp->auto_asconf_list);
 		sp->do_auto_asconf = 0;
@@ -3588,6 +3591,7 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 		    &sock_net(sk)->sctp.auto_asconf_splist);
 		sp->do_auto_asconf = 1;
 	}
+	spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	return 0;
 }
 
@@ -4121,18 +4125,28 @@ static int sctp_init_sock(struct sock *sk)
 	local_bh_disable();
 	percpu_counter_inc(&sctp_sockets_allocated);
 	sock_prot_inuse_add(net, sk->sk_prot, 1);
+
+	/* Nothing can fail after this block, otherwise
+	 * sctp_destroy_sock() will be called without addr_wq_lock held
+	 */
 	if (net->sctp.default_auto_asconf) {
+		spin_lock(&sock_net(sk)->sctp.addr_wq_lock);
 		list_add_tail(&sp->auto_asconf_list,
 		    &net->sctp.auto_asconf_splist);
 		sp->do_auto_asconf = 1;
-	} else
+		spin_unlock(&sock_net(sk)->sctp.addr_wq_lock);
+	} else {
 		sp->do_auto_asconf = 0;
+	}
+
 	local_bh_enable();
 
 	return 0;
 }
 
-/* Cleanup any SCTP per socket resources.  */
+/* Cleanup any SCTP per socket resources. Must be called with
+ * sock_net(sk)->sctp.addr_wq_lock held if sp->do_auto_asconf is true
+ */
 static void sctp_destroy_sock(struct sock *sk)
 {
 	struct sctp_sock *sp;
@@ -7195,6 +7209,19 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newinet->mc_list = NULL;
 }
 
+static inline void sctp_copy_descendant(struct sock *sk_to,
+					const struct sock *sk_from)
+{
+	int ancestor_size = sizeof(struct inet_sock) +
+			    sizeof(struct sctp_sock) -
+			    offsetof(struct sctp_sock, auto_asconf_list);
+
+	if (sk_from->sk_family == PF_INET6)
+		ancestor_size += sizeof(struct ipv6_pinfo);
+
+	__inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
+}
+
 /* Populate the fields of the newsk from the oldsk and migrate the assoc
  * and its messages to the newsk.
  */
@@ -7209,7 +7236,6 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	struct sk_buff *skb, *tmp;
 	struct sctp_ulpevent *event;
 	struct sctp_bind_hashbucket *head;
-	struct list_head tmplist;
 
 	/* Migrate socket buffer sizes and all the socket level options to the
 	 * new socket.
@@ -7217,12 +7243,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	newsk->sk_sndbuf = oldsk->sk_sndbuf;
 	newsk->sk_rcvbuf = oldsk->sk_rcvbuf;
 	/* Brute force copy old sctp opt. */
-	if (oldsp->do_auto_asconf) {
-		memcpy(&tmplist, &newsp->auto_asconf_list, sizeof(tmplist));
-		inet_sk_copy_descendant(newsk, oldsk);
-		memcpy(&newsp->auto_asconf_list, &tmplist, sizeof(tmplist));
-	} else
-		inet_sk_copy_descendant(newsk, oldsk);
+	sctp_copy_descendant(newsk, oldsk);
 
 	/* Restore the ep value that was overwritten with the above structure
 	 * copy.
-- 
2.4.1

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v5] sctp: fix ASCONF list handling
  2015-06-12 13:16                                             ` [PATCH v5] " marcelo.leitner
@ 2015-06-14 19:56                                               ` David Miller
  0 siblings, 0 replies; 34+ messages in thread
From: David Miller @ 2015-06-14 19:56 UTC (permalink / raw)
  To: marcelo.leitner
  Cc: nhorman, hannes, netdev, linux-sctp, daniel, vyasevich, micchie

From: marcelo.leitner@gmail.com
Date: Fri, 12 Jun 2015 10:16:41 -0300

> From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> 
> ->auto_asconf_splist is per namespace and mangled by functions like
> sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> 
> Also, the call to inet_sk_copy_descendant() was backuping
> ->auto_asconf_list through the copy but was not honoring
> ->do_auto_asconf, which could lead to list corruption if it was
> different between both sockets.
> 
> This commit thus fixes the list handling by using ->addr_wq_lock
> spinlock to protect the list. A special handling is done upon socket
> creation and destruction for that. Error handlig on sctp_init_sock()
> will never return an error after having initialized asconf, so
> sctp_destroy_sock() can be called without addrq_wq_lock. The lock now
> will be take on sctp_close_sock(), before locking the socket, so we
> don't do it in inverse order compared to sctp_addr_wq_timeout_handler().
> 
> Instead of taking the lock on sctp_sock_migrate() for copying and
> restoring the list values, it's preferred to avoid rewritting it by
> implementing sctp_copy_descendant().
> 
> Issue was found with a test application that kept flipping sysctl
> default_auto_asconf on and off, but one could trigger it by issuing
> simultaneous setsockopt() calls on multiple sockets or by
> creating/destroying sockets fast enough. This is only triggerable
> locally.
> 
> Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> Reported-by: Ji Jianwen <jiji@redhat.com>
> Suggested-by: Neil Horman <nhorman@tuxdriver.com>
> Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

Applied, thank you.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v3 2/2] sctp: fix ASCONF list handling
  2015-06-04 14:27                     ` Neil Horman
  2015-06-05 14:18                       ` Marcelo Ricardo Leitner
  2015-06-05 17:08                       ` [PATCH v3 " mleitner
@ 2015-06-05 17:08                       ` mleitner
  2015-06-08 20:09                         ` Hannes Frederic Sowa
  2 siblings, 1 reply; 34+ messages in thread
From: mleitner @ 2015-06-05 17:08 UTC (permalink / raw)
  To: Neil Horman, netdev
  Cc: linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

->auto_asconf_splist is per namespace and mangled by functions like
sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.

Also, the call to inet_sk_copy_descendant() was backuping
->auto_asconf_list through the copy but was not honoring
->do_auto_asconf, which could lead to list corruption if it was
different between both sockets.

This commit thus fixes the list handling by rcu-ifying it and using
->addr_wq_lock spinlock to protect against multiple writers.

Instead of taking the lock on sctp_sock_migrate() for copying and
restoring the list values, it's preferred to avoid rewritting it by
implementing sctp_copy_descendant().

Issue was found with a test application that kept flipping sysctl
default_auto_asconf on and off, but one could trigger it by issuing
simultaneous setsockopt() calls on multiple sockets or by
creating/destroying sockets fast enough. This is only triggerable
locally.

Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
Reported-by: Ji Jianwen <jiji@redhat.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 include/net/netns/sctp.h   |  3 ++-
 include/net/sctp/structs.h |  4 ++++
 net/sctp/protocol.c        |  3 ++-
 net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
 4 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 9e53412c4ed829e8e45777a6d95406d490dbaa75..161de47076ebee2ffce2b777a61e673e576fd982 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -30,7 +30,8 @@ struct netns_sctp {
 	struct list_head local_addr_list;
 	struct list_head __rcu addr_waitq;
 	struct timer_list addr_wq_timer;
-	struct list_head auto_asconf_splist;
+	struct list_head __rcu auto_asconf_splist;
+	/* Lock that protects both addr_waitq and auto_asconf_splist */
 	spinlock_t addr_wq_lock;
 
 	/* Lock that protects the local_addr_list writers */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2bb2fcf5b11f0387c81b860ad2d3a6607da19a7d..495c87e367b3f2e8941807f56a77d2e14469bfed 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -223,6 +223,10 @@ struct sctp_sock {
 	atomic_t pd_mode;
 	/* Receive to here while partial delivery is in effect. */
 	struct sk_buff_head pd_lobby;
+
+	/* These must be the last fields, as they will skipped on copies,
+	 * like on accept and peeloff operations
+	 */
 	struct list_head auto_asconf_list;
 	int do_auto_asconf;
 };
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 9954fb8c9a9455d5ad7a627e2d7f9a1fef861fc2..233df013dc7ece3efb6739f3861e70f3324d26ab 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -664,7 +664,8 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
 			}
 		}
 #endif
-		list_for_each_entry(sp, &net->sctp.auto_asconf_splist, auto_asconf_list) {
+		list_for_each_entry_rcu(sp, &net->sctp.auto_asconf_splist,
+					auto_asconf_list) {
 			struct sock *sk;
 
 			sk = sctp_opt2sk(sp);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f09de7fac2e6acddad8b2e046dbf626e329cb674..475f84f80e5448f1f33d213dd9c3eae08365610f 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3580,14 +3580,16 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 	if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf))
 		return 0;
 
+	spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	if (val == 0 && sp->do_auto_asconf) {
-		list_del(&sp->auto_asconf_list);
 		sp->do_auto_asconf = 0;
+		list_del_rcu(&sp->auto_asconf_list);
 	} else if (val && !sp->do_auto_asconf) {
-		list_add_tail(&sp->auto_asconf_list,
-		    &sock_net(sk)->sctp.auto_asconf_splist);
 		sp->do_auto_asconf = 1;
+		list_add_tail_rcu(&sp->auto_asconf_list,
+				  &sock_net(sk)->sctp.auto_asconf_splist);
 	}
+	spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	return 0;
 }
 
@@ -4122,9 +4124,11 @@ static int sctp_init_sock(struct sock *sk)
 	percpu_counter_inc(&sctp_sockets_allocated);
 	sock_prot_inuse_add(net, sk->sk_prot, 1);
 	if (net->sctp.default_auto_asconf) {
-		list_add_tail(&sp->auto_asconf_list,
-		    &net->sctp.auto_asconf_splist);
+		spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 		sp->do_auto_asconf = 1;
+		list_add_tail_rcu(&sp->auto_asconf_list,
+				  &net->sctp.auto_asconf_splist);
+		spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	} else
 		sp->do_auto_asconf = 0;
 	local_bh_enable();
@@ -4148,8 +4152,10 @@ static void sctp_destroy_sock(struct sock *sk)
 		return;
 
 	if (sp->do_auto_asconf) {
+		spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 		sp->do_auto_asconf = 0;
-		list_del(&sp->auto_asconf_list);
+		list_del_rcu(&sp->auto_asconf_list);
+		spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	}
 	sctp_endpoint_free(sp->ep);
 	local_bh_disable();
@@ -7195,6 +7201,19 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newinet->mc_list = NULL;
 }
 
+static inline void sctp_copy_descendant(struct sock *sk_to,
+					const struct sock *sk_from)
+{
+	int ancestor_size = sizeof(struct inet_sock) +
+			    sizeof(struct sctp_sock) -
+			    offsetof(struct sctp_sock, auto_asconf_list);
+
+	if (sk_from->sk_family == PF_INET6)
+		ancestor_size += sizeof(struct ipv6_pinfo);
+
+	__inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
+}
+
 /* Populate the fields of the newsk from the oldsk and migrate the assoc
  * and its messages to the newsk.
  */
@@ -7209,7 +7228,6 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	struct sk_buff *skb, *tmp;
 	struct sctp_ulpevent *event;
 	struct sctp_bind_hashbucket *head;
-	struct list_head tmplist;
 
 	/* Migrate socket buffer sizes and all the socket level options to the
 	 * new socket.
@@ -7217,12 +7235,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	newsk->sk_sndbuf = oldsk->sk_sndbuf;
 	newsk->sk_rcvbuf = oldsk->sk_rcvbuf;
 	/* Brute force copy old sctp opt. */
-	if (oldsp->do_auto_asconf) {
-		memcpy(&tmplist, &newsp->auto_asconf_list, sizeof(tmplist));
-		inet_sk_copy_descendant(newsk, oldsk);
-		memcpy(&newsp->auto_asconf_list, &tmplist, sizeof(tmplist));
-	} else
-		inet_sk_copy_descendant(newsk, oldsk);
+	sctp_copy_descendant(newsk, oldsk);
 
 	/* Restore the ep value that was overwritten with the above structure
 	 * copy.
-- 
2.4.1

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 2/2] sctp: fix ASCONF list handling
  2015-06-05 17:08                       ` [PATCH v3 2/2] " mleitner
@ 2015-06-08 20:09                         ` Hannes Frederic Sowa
  2015-06-09 15:37                           ` Marcelo Ricardo Leitner
  0 siblings, 1 reply; 34+ messages in thread
From: Hannes Frederic Sowa @ 2015-06-08 20:09 UTC (permalink / raw)
  To: mleitner
  Cc: Neil Horman, netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich,
	Michio Honda

On Fr, 2015-06-05 at 14:08 -0300, mleitner@redhat.com wrote:
>       if (sp->do_auto_asconf) {
> +             spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
>               sp->do_auto_asconf = 0;
> -             list_del(&sp->auto_asconf_list);
> +             list_del_rcu(&sp->auto_asconf_list);
> +             spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
>       }

This also looks a bit unsafe to me:

My proposal would be to sock_hold/sock_put the sockets when pushing them
onto the auto_asconf_list and defer the modifications on the list until
we don't need to hold socket lock anymore (in syscalls we do have a reference
anyway).

addr_wq_lock then is only used either without lock_sock at all or only
in order addr_wq_lock -> lock_sock, which does not cause any locking
ordering issues.

Bye,
Hannes

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 2/2] sctp: fix ASCONF list handling
  2015-06-08 20:09                         ` Hannes Frederic Sowa
@ 2015-06-09 15:37                           ` Marcelo Ricardo Leitner
  2015-06-09 17:04                             ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Marcelo Ricardo Leitner @ 2015-06-09 15:37 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Neil Horman, netdev, linux-sctp, Daniel Borkmann, Vlad Yasevich,
	Michio Honda

On Mon, Jun 08, 2015 at 10:09:50PM +0200, Hannes Frederic Sowa wrote:
> On Fr, 2015-06-05 at 14:08 -0300, mleitner@redhat.com wrote:
> >       if (sp->do_auto_asconf) {
> > +             spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
> >               sp->do_auto_asconf = 0;
> > -             list_del(&sp->auto_asconf_list);
> > +             list_del_rcu(&sp->auto_asconf_list);
> > +             spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
> >       }
> 
> This also looks a bit unsafe to me:
> 
> My proposal would be to sock_hold/sock_put the sockets when pushing them
> onto the auto_asconf_list and defer the modifications on the list until
                                ^^^^^^^^^^^^--- you lost me here

> we don't need to hold socket lock anymore (in syscalls we do have a reference
> anyway).

Yup.. seems we have a use-after-free with this rcu usage on
auto_asconf_splist, because if the socket was destroyed by the time the
timeout handler is running, it may still see that socket and thus we
would need two additional procedures a) to take a sock_hold() when it is
inserted on that list, and release it via call_rcu() and b) to know how
to identify such dead sockets, most likely just by checking
sp->do_auto_asconf, and skip from acting on them.

Neil, WDYT?

> addr_wq_lock then is only used either without lock_sock at all or only
> in order addr_wq_lock -> lock_sock, which does not cause any locking
> ordering issues.

No because we have to update this list on sctp_destroy_sock(), which is
called with lock_sock() held. If we add the precautions above, I think
it will be fine.

Thanks,
Marcelo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 2/2] sctp: fix ASCONF list handling
  2015-06-09 15:37                           ` Marcelo Ricardo Leitner
@ 2015-06-09 17:04                             ` Neil Horman
  0 siblings, 0 replies; 34+ messages in thread
From: Neil Horman @ 2015-06-09 17:04 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: Hannes Frederic Sowa, netdev, linux-sctp, Daniel Borkmann,
	Vlad Yasevich, Michio Honda

On Tue, Jun 09, 2015 at 12:37:21PM -0300, Marcelo Ricardo Leitner wrote:
> On Mon, Jun 08, 2015 at 10:09:50PM +0200, Hannes Frederic Sowa wrote:
> > On Fr, 2015-06-05 at 14:08 -0300, mleitner@redhat.com wrote:
> > >       if (sp->do_auto_asconf) {
> > > +             spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
> > >               sp->do_auto_asconf = 0;
> > > -             list_del(&sp->auto_asconf_list);
> > > +             list_del_rcu(&sp->auto_asconf_list);
> > > +             spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
> > >       }
> > 
> > This also looks a bit unsafe to me:
> > 
> > My proposal would be to sock_hold/sock_put the sockets when pushing them
> > onto the auto_asconf_list and defer the modifications on the list until
>                                 ^^^^^^^^^^^^--- you lost me here
> 
> > we don't need to hold socket lock anymore (in syscalls we do have a reference
> > anyway).
> 
> Yup.. seems we have a use-after-free with this rcu usage on
> auto_asconf_splist, because if the socket was destroyed by the time the
> timeout handler is running, it may still see that socket and thus we
> would need two additional procedures a) to take a sock_hold() when it is
> inserted on that list, and release it via call_rcu() and b) to know how
> to identify such dead sockets, most likely just by checking
> sp->do_auto_asconf, and skip from acting on them.
> 
> Neil, WDYT?
> 
That seems like a reasonable approach.

> > addr_wq_lock then is only used either without lock_sock at all or only
> > in order addr_wq_lock -> lock_sock, which does not cause any locking
> > ordering issues.
> 
> No because we have to update this list on sctp_destroy_sock(), which is
> called with lock_sock() held. If we add the precautions above, I think
> it will be fine.
> 
Agreed.

> Thanks,
> Marcelo
> 
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v2 2/2] sctp: fix ASCONF list handling
  2015-06-02 13:48                 ` Neil Horman
  2015-06-03 16:54                   ` [PATCH v2 1/2] sctp: rcu-ify addr_waitq mleitner
@ 2015-06-03 16:54                   ` mleitner
  1 sibling, 0 replies; 34+ messages in thread
From: mleitner @ 2015-06-03 16:54 UTC (permalink / raw)
  To: Neil Horman, netdev
  Cc: linux-sctp, Daniel Borkmann, Vlad Yasevich, Michio Honda

From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

->auto_asconf_splist is per namespace and mangled by functions like
sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.

Also, the call to inet_sk_copy_descendant() was backuping
->auto_asconf_list through the copy but was not honoring
->do_auto_asconf, which could lead to list corruption if it was
different between both sockets.

This commit thus fixes the list handling by rcu-ifying it and using
->addr_wq_lock spinlock to protect against multiple writers.

Instead of taking the lock on sctp_sock_migrate() for copying and
restoring the list values, it's preferred to avoid rewritting it by
implementing sctp_copy_descendant().

Issue was found with a test application that kept flipping sysctl
default_auto_asconf on and off, but one could trigger it by issuing
simultaneous setsockopt() calls on multiple sockets or by
creating/destroying sockets fast enough. This is only triggerable
locally.

Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
Reported-by: Ji Jianwen <jiji@redhat.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 include/net/netns/sctp.h   |  3 ++-
 include/net/sctp/structs.h |  4 ++++
 net/sctp/protocol.c        |  3 ++-
 net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
 4 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 9e53412c4ed829e8e45777a6d95406d490dbaa75..161de47076ebee2ffce2b777a61e673e576fd982 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -30,7 +30,8 @@ struct netns_sctp {
 	struct list_head local_addr_list;
 	struct list_head __rcu addr_waitq;
 	struct timer_list addr_wq_timer;
-	struct list_head auto_asconf_splist;
+	struct list_head __rcu auto_asconf_splist;
+	/* Lock that protects both addr_waitq and auto_asconf_splist */
 	spinlock_t addr_wq_lock;
 
 	/* Lock that protects the local_addr_list writers */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2bb2fcf5b11f0387c81b860ad2d3a6607da19a7d..495c87e367b3f2e8941807f56a77d2e14469bfed 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -223,6 +223,10 @@ struct sctp_sock {
 	atomic_t pd_mode;
 	/* Receive to here while partial delivery is in effect. */
 	struct sk_buff_head pd_lobby;
+
+	/* These must be the last fields, as they will skipped on copies,
+	 * like on accept and peeloff operations
+	 */
 	struct list_head auto_asconf_list;
 	int do_auto_asconf;
 };
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index a5089883b28195f3aef69ef35b5397322a01126f..e4b24ef9340da210de0b0076f8a42080e991527e 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -665,7 +665,8 @@ static void sctp_addr_wq_timeout_handler(unsigned long arg)
 			}
 		}
 #endif
-		list_for_each_entry(sp, &net->sctp.auto_asconf_splist, auto_asconf_list) {
+		list_for_each_entry_rcu(sp, &net->sctp.auto_asconf_splist,
+					auto_asconf_list) {
 			struct sock *sk;
 
 			sk = sctp_opt2sk(sp);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f09de7fac2e6acddad8b2e046dbf626e329cb674..475f84f80e5448f1f33d213dd9c3eae08365610f 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3580,14 +3580,16 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 	if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf))
 		return 0;
 
+	spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	if (val == 0 && sp->do_auto_asconf) {
-		list_del(&sp->auto_asconf_list);
 		sp->do_auto_asconf = 0;
+		list_del_rcu(&sp->auto_asconf_list);
 	} else if (val && !sp->do_auto_asconf) {
-		list_add_tail(&sp->auto_asconf_list,
-		    &sock_net(sk)->sctp.auto_asconf_splist);
 		sp->do_auto_asconf = 1;
+		list_add_tail_rcu(&sp->auto_asconf_list,
+				  &sock_net(sk)->sctp.auto_asconf_splist);
 	}
+	spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	return 0;
 }
 
@@ -4122,9 +4124,11 @@ static int sctp_init_sock(struct sock *sk)
 	percpu_counter_inc(&sctp_sockets_allocated);
 	sock_prot_inuse_add(net, sk->sk_prot, 1);
 	if (net->sctp.default_auto_asconf) {
-		list_add_tail(&sp->auto_asconf_list,
-		    &net->sctp.auto_asconf_splist);
+		spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 		sp->do_auto_asconf = 1;
+		list_add_tail_rcu(&sp->auto_asconf_list,
+				  &net->sctp.auto_asconf_splist);
+		spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	} else
 		sp->do_auto_asconf = 0;
 	local_bh_enable();
@@ -4148,8 +4152,10 @@ static void sctp_destroy_sock(struct sock *sk)
 		return;
 
 	if (sp->do_auto_asconf) {
+		spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 		sp->do_auto_asconf = 0;
-		list_del(&sp->auto_asconf_list);
+		list_del_rcu(&sp->auto_asconf_list);
+		spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	}
 	sctp_endpoint_free(sp->ep);
 	local_bh_disable();
@@ -7195,6 +7201,19 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newinet->mc_list = NULL;
 }
 
+static inline void sctp_copy_descendant(struct sock *sk_to,
+					const struct sock *sk_from)
+{
+	int ancestor_size = sizeof(struct inet_sock) +
+			    sizeof(struct sctp_sock) -
+			    offsetof(struct sctp_sock, auto_asconf_list);
+
+	if (sk_from->sk_family == PF_INET6)
+		ancestor_size += sizeof(struct ipv6_pinfo);
+
+	__inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
+}
+
 /* Populate the fields of the newsk from the oldsk and migrate the assoc
  * and its messages to the newsk.
  */
@@ -7209,7 +7228,6 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	struct sk_buff *skb, *tmp;
 	struct sctp_ulpevent *event;
 	struct sctp_bind_hashbucket *head;
-	struct list_head tmplist;
 
 	/* Migrate socket buffer sizes and all the socket level options to the
 	 * new socket.
@@ -7217,12 +7235,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	newsk->sk_sndbuf = oldsk->sk_sndbuf;
 	newsk->sk_rcvbuf = oldsk->sk_rcvbuf;
 	/* Brute force copy old sctp opt. */
-	if (oldsp->do_auto_asconf) {
-		memcpy(&tmplist, &newsp->auto_asconf_list, sizeof(tmplist));
-		inet_sk_copy_descendant(newsk, oldsk);
-		memcpy(&newsp->auto_asconf_list, &tmplist, sizeof(tmplist));
-	} else
-		inet_sk_copy_descendant(newsk, oldsk);
+	sctp_copy_descendant(newsk, oldsk);
 
 	/* Restore the ep value that was overwritten with the above structure
 	 * copy.
-- 
2.4.1

^ permalink raw reply related	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2015-06-14 19:56 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-05-28  0:52 [PATCH] sctp: fix ASCONF list handling mleitner
2015-05-28 10:15 ` Neil Horman
2015-05-28 11:17   ` Marcelo Ricardo Leitner
2015-05-28 13:27     ` Marcelo Ricardo Leitner
2015-05-28 14:31       ` Neil Horman
2015-05-28 14:46       ` Marcelo Ricardo Leitner
2015-05-29 13:17         ` Neil Horman
2015-05-29 16:50           ` Marcelo Ricardo Leitner
2015-06-01 14:00             ` Neil Horman
2015-06-01 22:28               ` Marcelo Ricardo Leitner
2015-06-02 13:48                 ` Neil Horman
2015-06-03 16:54                   ` [PATCH v2 1/2] sctp: rcu-ify addr_waitq mleitner
2015-06-03 17:19                     ` Marcelo Ricardo Leitner
2015-06-04 14:27                     ` Neil Horman
2015-06-05 14:18                       ` Marcelo Ricardo Leitner
2015-06-05 17:08                       ` [PATCH v3 " mleitner
2015-06-08 13:58                         ` Neil Horman
2015-06-08 14:46                         ` Hannes Frederic Sowa
2015-06-08 14:59                           ` Hannes Frederic Sowa
2015-06-08 15:19                             ` Neil Horman
2015-06-08 15:37                               ` Hannes Frederic Sowa
2015-06-09 11:36                                 ` Neil Horman
2015-06-09 19:32                                   ` Marcelo Ricardo Leitner
2015-06-10 13:31                                     ` Marcelo Ricardo Leitner
2015-06-10 19:14                                       ` Neil Horman
2015-06-11 14:30                                         ` [PATCH v4] sctp: fix ASCONF list handling mleitner
2015-06-11 23:31                                           ` David Miller
2015-06-12 13:16                                             ` [PATCH v5] " marcelo.leitner
2015-06-14 19:56                                               ` David Miller
2015-06-05 17:08                       ` [PATCH v3 2/2] " mleitner
2015-06-08 20:09                         ` Hannes Frederic Sowa
2015-06-09 15:37                           ` Marcelo Ricardo Leitner
2015-06-09 17:04                             ` Neil Horman
2015-06-03 16:54                   ` [PATCH v2 " mleitner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).