[PATCH v4 1/1] IB/ipoib: fix for rare multicast join race condition

linux-rdma.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v4 1/1] IB/ipoib: fix for rare multicast join race condition
@ 2016-02-11 16:10 Alex Estrin
       [not found] ` <20160211161052.22025.1263.stgit-u2TXY/5TJkdZ7WVY1cDZ9q2pdiUAq4bhAL8bYrjMMd8@public.gmane.org>
  0 siblings, 1 reply; 2+ messages in thread
From: Alex Estrin @ 2016-02-11 16:10 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Doug Ledford
  Cc: erezsh-VPRAkNaXOzVWk0Htik3J/w

A narrow window for race condition still exist between
multicast join thread and *dev_flush workers.
A kernel crash caused by prolong erratic link state changes
was observed (most likely a faulty cabling):

[167275.656270] BUG: unable to handle kernel NULL pointer dereference at
0000000000000020
[167275.665973] IP: [<ffffffffa05f8f2e>] ipoib_mcast_join+0xae/0x1d0 [ib_ipoib]
[167275.674443] PGD 0
[167275.677373] Oops: 0000 [#1] SMP
...
[167275.977530] Call Trace:
[167275.982225]  [<ffffffffa05f92f0>] ? ipoib_mcast_free+0x200/0x200 [ib_ipoib]
[167275.992024]  [<ffffffffa05fa1b7>] ipoib_mcast_join_task+0x2a7/0x490
[ib_ipoib]
[167276.002149]  [<ffffffff8109d5fb>] process_one_work+0x17b/0x470
[167276.010754]  [<ffffffff8109e3cb>] worker_thread+0x11b/0x400
[167276.019088]  [<ffffffff8109e2b0>] ? rescuer_thread+0x400/0x400
[167276.027737]  [<ffffffff810a5aef>] kthread+0xcf/0xe0
Here was a hit spot:
ipoib_mcast_join() {
..............
      rec.qkey      = priv->broadcast->mcmember.qkey;
                                       ^^^^^^^
.....
 }
Proposed patch should prevent multicast join task to continue
if link state change is detected.

Signed-off-by: Alex Estrin <alex.estrin-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

Changes from v3:
- sync with priv->lock before flag check.
Chages from v2:
- Move check for OPER_UP flag state to mcast_join() to
ensure no event worker is in progress.
- minor style fixes.
Changes from v1:
- No need to lock again if error detected.
---
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   15 +++++++++++++--
 1 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 050dfa1..938e5d5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -456,7 +456,7 @@ out_locked:
 	return status;
 }
 
-static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
+static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_sa_multicast *multicast;
@@ -464,8 +464,16 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 		.join_state = 1
 	};
 	ib_sa_comp_mask comp_mask;
+	unsigned long flags;
 	int ret = 0;
 
+	spin_lock_irqsave(&priv->lock, flags);
+	if (!priv->broadcast ||
+	    !test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+		spin_unlock_irqrestore(&priv->lock, flags);
+		return -EINVAL;
+	}
+
 	ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw);
 
 	rec.mgid     = mcast->mcmember.mgid;
@@ -525,6 +533,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 			rec.join_state = 4;
 #endif
 	}
+	spin_unlock_irqrestore(&priv->lock, flags);
 
 	multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
 					 &rec, comp_mask, GFP_KERNEL,
@@ -539,6 +548,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 		spin_unlock_irq(&priv->lock);
 		complete(&mcast->done);
 	}
+	return 0;
 }
 
 void ipoib_mcast_join_task(struct work_struct *work)
@@ -621,7 +631,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
 				init_completion(&mcast->done);
 				set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
 				spin_unlock_irq(&priv->lock);
-				ipoib_mcast_join(dev, mcast);
+				if (!ipoib_mcast_join(dev, mcast))
+					return;
 				spin_lock_irq(&priv->lock);
 			} else if (!delay_until ||
 				 time_before(mcast->delay_until, delay_until))

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 2+ messages in thread

[parent not found: <20160211161052.22025.1263.stgit-u2TXY/5TJkdZ7WVY1cDZ9q2pdiUAq4bhAL8bYrjMMd8@public.gmane.org>]

* Re: [PATCH v4 1/1] IB/ipoib: fix for rare multicast join race condition
       [not found] ` <20160211161052.22025.1263.stgit-u2TXY/5TJkdZ7WVY1cDZ9q2pdiUAq4bhAL8bYrjMMd8@public.gmane.org>
@ 2016-02-11 16:24   ` Doug Ledford
  0 siblings, 0 replies; 2+ messages in thread
From: Doug Ledford @ 2016-02-11 16:24 UTC (permalink / raw)
  To: Alex Estrin, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: erezsh-VPRAkNaXOzVWk0Htik3J/w

[-- Attachment #1: Type: text/plain, Size: 5467 bytes --]

On 02/11/2016 11:10 AM, Alex Estrin wrote:
> A narrow window for race condition still exist between
> multicast join thread and *dev_flush workers.
> A kernel crash caused by prolong erratic link state changes
> was observed (most likely a faulty cabling):
> 
> [167275.656270] BUG: unable to handle kernel NULL pointer dereference at
> 0000000000000020
> [167275.665973] IP: [<ffffffffa05f8f2e>] ipoib_mcast_join+0xae/0x1d0 [ib_ipoib]
> [167275.674443] PGD 0
> [167275.677373] Oops: 0000 [#1] SMP
> ...
> [167275.977530] Call Trace:
> [167275.982225]  [<ffffffffa05f92f0>] ? ipoib_mcast_free+0x200/0x200 [ib_ipoib]
> [167275.992024]  [<ffffffffa05fa1b7>] ipoib_mcast_join_task+0x2a7/0x490
> [ib_ipoib]
> [167276.002149]  [<ffffffff8109d5fb>] process_one_work+0x17b/0x470
> [167276.010754]  [<ffffffff8109e3cb>] worker_thread+0x11b/0x400
> [167276.019088]  [<ffffffff8109e2b0>] ? rescuer_thread+0x400/0x400
> [167276.027737]  [<ffffffff810a5aef>] kthread+0xcf/0xe0
> Here was a hit spot:
> ipoib_mcast_join() {
> ..............
>       rec.qkey      = priv->broadcast->mcmember.qkey;
>                                        ^^^^^^^
> .....
>  }
> Proposed patch should prevent multicast join task to continue
> if link state change is detected.
> 
> Signed-off-by: Alex Estrin <alex.estrin-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> 
> Changes from v3:
> - sync with priv->lock before flag check.
> Chages from v2:
> - Move check for OPER_UP flag state to mcast_join() to
> ensure no event worker is in progress.
> - minor style fixes.
> Changes from v1:
> - No need to lock again if error detected.
> ---
>  drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   15 +++++++++++++--
>  1 files changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> index 050dfa1..938e5d5 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> @@ -456,7 +456,7 @@ out_locked:
>  	return status;
>  }
>  
> -static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
> +static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
>  	struct ib_sa_multicast *multicast;
> @@ -464,8 +464,16 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>  		.join_state = 1
>  	};
>  	ib_sa_comp_mask comp_mask;
> +	unsigned long flags;
>  	int ret = 0;
>  
> +	spin_lock_irqsave(&priv->lock, flags);

There's no sense using flags here.  We already know this function is
broken if it's ever called from changing contexts that require you to
save flags because we take/release the priv->lock in the error handling
for the ib_sa_join_multicast call without flags.

But, that raises a larger issue.  If we are now going to have the
majority of the function under the priv->lock, then I think we need to
rethink our entry into this function.  All these locks/releases are
expensive in terms of locked memory cycles.  I would rather see us
change this function so that it's called explicitly with the priv->lock
held, keep it held through our work, only release it to call
ib_sa_join_multicast, and then retake it before returning.

Also, a person might be tempted to move the test below to right before
the call to ipoib_mcast_join in ipoib_mcast_join_task as an
optimization.  I think I would prefer not to do that for the sake of
clarity of test/action pair that must remain under lock.  It's clearer
to people reading the code later that we hold the lock because the only
guarantee that mcast->mcmember remains valid and referenceable is that
we passed our "interface is still up test" and help the lock through the
accesses since the interface could go down in another context asoon as
we release the lock.

You'll obviously have to adjust the locking in the task thread too.

Please respin.

> +	if (!priv->broadcast ||
> +	    !test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
> +		spin_unlock_irqrestore(&priv->lock, flags);
> +		return -EINVAL;
> +	}
> +
>  	ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw);
>  
>  	rec.mgid     = mcast->mcmember.mgid;
> @@ -525,6 +533,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>  			rec.join_state = 4;
>  #endif
>  	}
> +	spin_unlock_irqrestore(&priv->lock, flags);
>  
>  	multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
>  					 &rec, comp_mask, GFP_KERNEL,
> @@ -539,6 +548,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
>  		spin_unlock_irq(&priv->lock);
>  		complete(&mcast->done);
>  	}
> +	return 0;
>  }
>  
>  void ipoib_mcast_join_task(struct work_struct *work)
> @@ -621,7 +631,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
>  				init_completion(&mcast->done);
>  				set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
>  				spin_unlock_irq(&priv->lock);
> -				ipoib_mcast_join(dev, mcast);
> +				if (!ipoib_mcast_join(dev, mcast))
> +					return;
>  				spin_lock_irq(&priv->lock);
>  			} else if (!delay_until ||
>  				 time_before(mcast->delay_until, delay_until))
> 


-- 
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
              GPG KeyID: 0E572FDD



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 884 bytes --]

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2016-02-11 16:24 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-02-11 16:10 [PATCH v4 1/1] IB/ipoib: fix for rare multicast join race condition Alex Estrin
     [not found] ` <20160211161052.22025.1263.stgit-u2TXY/5TJkdZ7WVY1cDZ9q2pdiUAq4bhAL8bYrjMMd8@public.gmane.org>
2016-02-11 16:24   ` Doug Ledford

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).