[PATCH v2] avoid race condition between start_xmit and cm_rep

public inbox for linux-rdma@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
@ 2018-08-17 21:31 Aaron Knister
  2018-08-19 17:28 ` Leon Romanovsky
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Aaron Knister @ 2018-08-17 21:31 UTC (permalink / raw)
  To: linux-rdma; +Cc: stable, Ira Weiny, John Fleck

Inside of start_xmit() the call to check if the connection is up and the
queueing of the packets for later transmission is not atomic which
leaves a window where cm_rep_handler can run, set the connection up,
dequeue pending packets and leave the subsequently queued packets by
start_xmit() sitting on neigh->queue until they're dropped when the
connection is torn down. This only applies to connected mode. These
dropped packets can really upset TCP, for example,  and cause
multi-minute delays in transmission for open connections.

Here's the code in start_xmit where we check to see if the connection
is up:

       if (ipoib_cm_get(neigh)) {
               if (ipoib_cm_up(neigh)) {
                       ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
                       goto unref;
               }
       }

The race occurs if cm_rep_handler execution occurs after the above
connection check (specifically if it gets to the point where it acquires
priv->lock to dequeue pending skb's) but before the below code snippet
in start_xmit where packets are queued.

       if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
               push_pseudo_header(skb, phdr->hwaddr);
               spin_lock_irqsave(&priv->lock, flags);
               __skb_queue_tail(&neigh->queue, skb);
               spin_unlock_irqrestore(&priv->lock, flags);
       } else {
               ++dev->stats.tx_dropped;
               dev_kfree_skb_any(skb);
       }

The patch re-checks ipoib_cm_up with priv->lock held to avoid this
race condition. Since odds are the conn should be up most of the time
(and thus the connection *not* down most of the time) we don't hold the
lock for the first check attempt to avoid a slowdown from unecessary
locking for the majority of the packets transmitted during the
connection's life.

Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 46 ++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 26cde95b..a950c916 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1093,6 +1093,21 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
+static bool defer_neigh_skb(struct sk_buff *skb,
+			    struct net_device *dev,
+			    struct ipoib_neigh *neigh,
+			    struct ipoib_pseudo_header *phdr)
+{
+	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+		push_pseudo_header(skb, phdr->hwaddr);
+		__skb_queue_tail(&neigh->queue, skb);
+		return true;
+	}
+
+	return false;
+}
+
+
 static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -1101,6 +1116,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ipoib_pseudo_header *phdr;
 	struct ipoib_header *header;
 	unsigned long flags;
+	bool deferred_pkt = true;
 
 	phdr = (struct ipoib_pseudo_header *) skb->data;
 	skb_pull(skb, sizeof(*phdr));
@@ -1160,6 +1176,23 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
 			goto unref;
 		}
+		/*
+		 * Re-check ipoib_cm_up with priv->lock held to avoid
+		 * race condition between start_xmit and skb_dequeue in
+		 * cm_rep_handler. Since odds are the conn should be up
+		 * most of the time, we don't hold the lock for the
+		 * first check above
+		 */
+		spin_lock_irqsave(&priv->lock, flags);
+		if (ipoib_cm_up(neigh)) {
+			spin_unlock_irqrestore(&priv->lock, flags);
+			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+		} else {
+			deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
+			spin_unlock_irqrestore(&priv->lock, flags);
+		}
+
+		goto unref;
 	} else if (neigh->ah && neigh->ah->valid) {
 		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
 						IPOIB_QPN(phdr->hwaddr));
@@ -1168,17 +1201,16 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		neigh_refresh_path(neigh, phdr->hwaddr, dev);
 	}
 
-	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
-		push_pseudo_header(skb, phdr->hwaddr);
-		spin_lock_irqsave(&priv->lock, flags);
-		__skb_queue_tail(&neigh->queue, skb);
-		spin_unlock_irqrestore(&priv->lock, flags);
-	} else {
+	spin_lock_irqsave(&priv->lock, flags);
+	deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+unref:
+	if (!deferred_pkt) {
 		++dev->stats.tx_dropped;
 		dev_kfree_skb_any(skb);
 	}
 
-unref:
 	ipoib_neigh_put(neigh);
 
 	return NETDEV_TX_OK;
-- 
2.12.3

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
  2018-08-17 21:31 [PATCH v2] avoid race condition between start_xmit and cm_rep_handler Aaron Knister
@ 2018-08-19 17:28 ` Leon Romanovsky
  2018-08-20  4:38   ` Weiny, Ira
  2018-08-20  4:34 ` Weiny, Ira
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 11+ messages in thread
From: Leon Romanovsky @ 2018-08-19 17:28 UTC (permalink / raw)
  To: Aaron Knister; +Cc: linux-rdma, stable, Ira Weiny, John Fleck

[-- Attachment #1: Type: text/plain, Size: 2193 bytes --]

On Fri, Aug 17, 2018 at 05:31:26PM -0400, Aaron Knister wrote:
> Inside of start_xmit() the call to check if the connection is up and the
> queueing of the packets for later transmission is not atomic which
> leaves a window where cm_rep_handler can run, set the connection up,
> dequeue pending packets and leave the subsequently queued packets by
> start_xmit() sitting on neigh->queue until they're dropped when the
> connection is torn down. This only applies to connected mode. These
> dropped packets can really upset TCP, for example,  and cause
> multi-minute delays in transmission for open connections.
>
> Here's the code in start_xmit where we check to see if the connection
> is up:
>
>        if (ipoib_cm_get(neigh)) {
>                if (ipoib_cm_up(neigh)) {
>                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>                        goto unref;
>                }
>        }
>
> The race occurs if cm_rep_handler execution occurs after the above
> connection check (specifically if it gets to the point where it acquires
> priv->lock to dequeue pending skb's) but before the below code snippet
> in start_xmit where packets are queued.
>
>        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
>                push_pseudo_header(skb, phdr->hwaddr);
>                spin_lock_irqsave(&priv->lock, flags);
>                __skb_queue_tail(&neigh->queue, skb);
>                spin_unlock_irqrestore(&priv->lock, flags);
>        } else {
>                ++dev->stats.tx_dropped;
>                dev_kfree_skb_any(skb);
>        }
>
> The patch re-checks ipoib_cm_up with priv->lock held to avoid this
> race condition. Since odds are the conn should be up most of the time
> (and thus the connection *not* down most of the time) we don't hold the
> lock for the first check attempt to avoid a slowdown from unecessary
> locking for the majority of the packets transmitted during the
> connection's life.
>
> Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>

If you want this fix in stable@, you will need to provide Fixes line
here and don't add stable@ in CC-list.

Thanks,
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
  2018-08-19 17:28 ` Leon Romanovsky
@ 2018-08-20  4:38   ` Weiny, Ira
  0 siblings, 0 replies; 11+ messages in thread
From: Weiny, Ira @ 2018-08-20  4:38 UTC (permalink / raw)
  To: Leon Romanovsky, Aaron Knister
  Cc: linux-rdma@vger.kernel.org, stable@vger.kernel.org, Fleck, John

> -----Original Message-----
> From: Leon Romanovsky [mailto:leon@kernel.org]
> Sent: Sunday, August 19, 2018 10:29 AM
> To: Aaron Knister <aaron.s.knister@nasa.gov>
> Cc: linux-rdma@vger.kernel.org; stable@vger.kernel.org; Weiny, Ira
> <ira.weiny@intel.com>; Fleck, John <john.fleck@intel.com>
> Subject: Re: [PATCH v2] avoid race condition between start_xmit and
> cm_rep_handler
> 
> On Fri, Aug 17, 2018 at 05:31:26PM -0400, Aaron Knister wrote:
> > Inside of start_xmit() the call to check if the connection is up and
> > the queueing of the packets for later transmission is not atomic which
> > leaves a window where cm_rep_handler can run, set the connection up,
> > dequeue pending packets and leave the subsequently queued packets by
> > start_xmit() sitting on neigh->queue until they're dropped when the
> > connection is torn down. This only applies to connected mode. These
> > dropped packets can really upset TCP, for example,  and cause
> > multi-minute delays in transmission for open connections.
> >
> > Here's the code in start_xmit where we check to see if the connection
> > is up:
> >
> >        if (ipoib_cm_get(neigh)) {
> >                if (ipoib_cm_up(neigh)) {
> >                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
> >                        goto unref;
> >                }
> >        }
> >
> > The race occurs if cm_rep_handler execution occurs after the above
> > connection check (specifically if it gets to the point where it
> > acquires
> > priv->lock to dequeue pending skb's) but before the below code snippet
> > in start_xmit where packets are queued.
> >
> >        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
> {
> >                push_pseudo_header(skb, phdr->hwaddr);
> >                spin_lock_irqsave(&priv->lock, flags);
> >                __skb_queue_tail(&neigh->queue, skb);
> >                spin_unlock_irqrestore(&priv->lock, flags);
> >        } else {
> >                ++dev->stats.tx_dropped;
> >                dev_kfree_skb_any(skb);
> >        }
> >
> > The patch re-checks ipoib_cm_up with priv->lock held to avoid this
> > race condition. Since odds are the conn should be up most of the time
> > (and thus the connection *not* down most of the time) we don't hold
> > the lock for the first check attempt to avoid a slowdown from
> > unecessary locking for the majority of the packets transmitted during
> > the connection's life.
> >
> > Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
> 
> If you want this fix in stable@, you will need to provide Fixes line here and
> don't add stable@ in CC-list.


Fixes: b63b70d87741 ("IPoIB: Use a private hash table for path lookup in xmit path")
Cc: stable@vger.kernel.org


> 
> Thanks,
> Reviewed-by: Leon Romanovsky <leonro@mellanox.com>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
  2018-08-17 21:31 [PATCH v2] avoid race condition between start_xmit and cm_rep_handler Aaron Knister
  2018-08-19 17:28 ` Leon Romanovsky
@ 2018-08-20  4:34 ` Weiny, Ira
  2018-08-20  6:36 ` Erez Shitrit
  2018-08-20 11:49 ` Estrin, Alex
  3 siblings, 0 replies; 11+ messages in thread
From: Weiny, Ira @ 2018-08-20  4:34 UTC (permalink / raw)
  To: Aaron Knister, linux-rdma@vger.kernel.org
  Cc: stable@vger.kernel.org, Fleck, John

> -----Original Message-----
> From: Aaron Knister [mailto:aaron.s.knister@nasa.gov]
> Sent: Friday, August 17, 2018 2:31 PM
> To: linux-rdma@vger.kernel.org
> Cc: stable@vger.kernel.org; Weiny, Ira <ira.weiny@intel.com>; Fleck, John
> <john.fleck@intel.com>
> Subject: [PATCH v2] avoid race condition between start_xmit and
> cm_rep_handler
> 
> Inside of start_xmit() the call to check if the connection is up and the
> queueing of the packets for later transmission is not atomic which leaves a
> window where cm_rep_handler can run, set the connection up, dequeue
> pending packets and leave the subsequently queued packets by
> start_xmit() sitting on neigh->queue until they're dropped when the
> connection is torn down. This only applies to connected mode. These
> dropped packets can really upset TCP, for example,  and cause multi-minute
> delays in transmission for open connections.
> 
> Here's the code in start_xmit where we check to see if the connection is up:
> 
>        if (ipoib_cm_get(neigh)) {
>                if (ipoib_cm_up(neigh)) {
>                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>                        goto unref;
>                }
>        }
> 
> The race occurs if cm_rep_handler execution occurs after the above
> connection check (specifically if it gets to the point where it acquires
> priv->lock to dequeue pending skb's) but before the below code snippet
> in start_xmit where packets are queued.
> 
>        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
>                push_pseudo_header(skb, phdr->hwaddr);
>                spin_lock_irqsave(&priv->lock, flags);
>                __skb_queue_tail(&neigh->queue, skb);
>                spin_unlock_irqrestore(&priv->lock, flags);
>        } else {
>                ++dev->stats.tx_dropped;
>                dev_kfree_skb_any(skb);
>        }
> 
> The patch re-checks ipoib_cm_up with priv->lock held to avoid this race
> condition. Since odds are the conn should be up most of the time (and thus
> the connection *not* down most of the time) we don't hold the lock for the
> first check attempt to avoid a slowdown from unecessary locking for the
> majority of the packets transmitted during the connection's life.
> 
> Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>

Tested-by: Ira Weiny <ira.weiny@intel.com>

> ---
>  drivers/infiniband/ulp/ipoib/ipoib_main.c | 46
> ++++++++++++++++++++++++++-----
>  1 file changed, 39 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> index 26cde95b..a950c916 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -1093,6 +1093,21 @@ static void unicast_arp_send(struct sk_buff *skb,
> struct net_device *dev,
>  	spin_unlock_irqrestore(&priv->lock, flags);  }
> 
> +static bool defer_neigh_skb(struct sk_buff *skb,
> +			    struct net_device *dev,
> +			    struct ipoib_neigh *neigh,
> +			    struct ipoib_pseudo_header *phdr) {
> +	if (skb_queue_len(&neigh->queue) <
> IPOIB_MAX_PATH_REC_QUEUE) {
> +		push_pseudo_header(skb, phdr->hwaddr);
> +		__skb_queue_tail(&neigh->queue, skb);
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +
>  static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device
> *dev)  {
>  	struct ipoib_dev_priv *priv = ipoib_priv(dev); @@ -1101,6 +1116,7
> @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct
> net_device *dev)
>  	struct ipoib_pseudo_header *phdr;
>  	struct ipoib_header *header;
>  	unsigned long flags;
> +	bool deferred_pkt = true;
> 
>  	phdr = (struct ipoib_pseudo_header *) skb->data;
>  	skb_pull(skb, sizeof(*phdr));
> @@ -1160,6 +1176,23 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff
> *skb, struct net_device *dev)
>  			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>  			goto unref;
>  		}
> +		/*
> +		 * Re-check ipoib_cm_up with priv->lock held to avoid
> +		 * race condition between start_xmit and skb_dequeue in
> +		 * cm_rep_handler. Since odds are the conn should be up
> +		 * most of the time, we don't hold the lock for the
> +		 * first check above
> +		 */
> +		spin_lock_irqsave(&priv->lock, flags);
> +		if (ipoib_cm_up(neigh)) {
> +			spin_unlock_irqrestore(&priv->lock, flags);
> +			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
> +		} else {
> +			deferred_pkt = defer_neigh_skb(skb, dev, neigh,
> phdr);
> +			spin_unlock_irqrestore(&priv->lock, flags);
> +		}
> +
> +		goto unref;
>  	} else if (neigh->ah && neigh->ah->valid) {
>  		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
>  						IPOIB_QPN(phdr->hwaddr));
> @@ -1168,17 +1201,16 @@ static netdev_tx_t ipoib_start_xmit(struct
> sk_buff *skb, struct net_device *dev)
>  		neigh_refresh_path(neigh, phdr->hwaddr, dev);
>  	}
> 
> -	if (skb_queue_len(&neigh->queue) <
> IPOIB_MAX_PATH_REC_QUEUE) {
> -		push_pseudo_header(skb, phdr->hwaddr);
> -		spin_lock_irqsave(&priv->lock, flags);
> -		__skb_queue_tail(&neigh->queue, skb);
> -		spin_unlock_irqrestore(&priv->lock, flags);
> -	} else {
> +	spin_lock_irqsave(&priv->lock, flags);
> +	deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
> +	spin_unlock_irqrestore(&priv->lock, flags);
> +
> +unref:
> +	if (!deferred_pkt) {
>  		++dev->stats.tx_dropped;
>  		dev_kfree_skb_any(skb);
>  	}
> 
> -unref:
>  	ipoib_neigh_put(neigh);
> 
>  	return NETDEV_TX_OK;
> --
> 2.12.3

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
  2018-08-17 21:31 [PATCH v2] avoid race condition between start_xmit and cm_rep_handler Aaron Knister
  2018-08-19 17:28 ` Leon Romanovsky
  2018-08-20  4:34 ` Weiny, Ira
@ 2018-08-20  6:36 ` Erez Shitrit
  2018-08-20 16:28   ` Jason Gunthorpe
  2018-08-20 11:49 ` Estrin, Alex
  3 siblings, 1 reply; 11+ messages in thread
From: Erez Shitrit @ 2018-08-20  6:36 UTC (permalink / raw)
  To: Aaron Knister
  Cc: RDMA mailing list, stable, Ira Weiny, John Fleck, Feras Daoud

Hi,

Did you check the option to hold the netif_tx_lock_xxx() in
ipoib_cm_rep_handler function (over the line
set_bit(IPOIB_FLAG_OPER_UP)) instead of in the send data path flow?

Thanks,

On Sat, Aug 18, 2018 at 12:31 AM, Aaron Knister
<aaron.s.knister@nasa.gov> wrote:
> Inside of start_xmit() the call to check if the connection is up and the
> queueing of the packets for later transmission is not atomic which
> leaves a window where cm_rep_handler can run, set the connection up,
> dequeue pending packets and leave the subsequently queued packets by
> start_xmit() sitting on neigh->queue until they're dropped when the
> connection is torn down. This only applies to connected mode. These
> dropped packets can really upset TCP, for example,  and cause
> multi-minute delays in transmission for open connections.
>
> Here's the code in start_xmit where we check to see if the connection
> is up:
>
>        if (ipoib_cm_get(neigh)) {
>                if (ipoib_cm_up(neigh)) {
>                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>                        goto unref;
>                }
>        }
>
> The race occurs if cm_rep_handler execution occurs after the above
> connection check (specifically if it gets to the point where it acquires
> priv->lock to dequeue pending skb's) but before the below code snippet
> in start_xmit where packets are queued.
>
>        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
>                push_pseudo_header(skb, phdr->hwaddr);
>                spin_lock_irqsave(&priv->lock, flags);
>                __skb_queue_tail(&neigh->queue, skb);
>                spin_unlock_irqrestore(&priv->lock, flags);
>        } else {
>                ++dev->stats.tx_dropped;
>                dev_kfree_skb_any(skb);
>        }
>
> The patch re-checks ipoib_cm_up with priv->lock held to avoid this
> race condition. Since odds are the conn should be up most of the time
> (and thus the connection *not* down most of the time) we don't hold the
> lock for the first check attempt to avoid a slowdown from unecessary
> locking for the majority of the packets transmitted during the
> connection's life.
>
> Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
> ---
>  drivers/infiniband/ulp/ipoib/ipoib_main.c | 46 ++++++++++++++++++++++++++-----
>  1 file changed, 39 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> index 26cde95b..a950c916 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -1093,6 +1093,21 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
>         spin_unlock_irqrestore(&priv->lock, flags);
>  }
>
> +static bool defer_neigh_skb(struct sk_buff *skb,
> +                           struct net_device *dev,
> +                           struct ipoib_neigh *neigh,
> +                           struct ipoib_pseudo_header *phdr)
> +{
> +       if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
> +               push_pseudo_header(skb, phdr->hwaddr);
> +               __skb_queue_tail(&neigh->queue, skb);
> +               return true;
> +       }
> +
> +       return false;
> +}
> +
> +
>  static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>         struct ipoib_dev_priv *priv = ipoib_priv(dev);
> @@ -1101,6 +1116,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>         struct ipoib_pseudo_header *phdr;
>         struct ipoib_header *header;
>         unsigned long flags;
> +       bool deferred_pkt = true;
>
>         phdr = (struct ipoib_pseudo_header *) skb->data;
>         skb_pull(skb, sizeof(*phdr));
> @@ -1160,6 +1176,23 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>                         ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>                         goto unref;
>                 }
> +               /*
> +                * Re-check ipoib_cm_up with priv->lock held to avoid
> +                * race condition between start_xmit and skb_dequeue in
> +                * cm_rep_handler. Since odds are the conn should be up
> +                * most of the time, we don't hold the lock for the
> +                * first check above
> +                */
> +               spin_lock_irqsave(&priv->lock, flags);
> +               if (ipoib_cm_up(neigh)) {
> +                       spin_unlock_irqrestore(&priv->lock, flags);
> +                       ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
> +               } else {
> +                       deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
> +                       spin_unlock_irqrestore(&priv->lock, flags);
> +               }
> +
> +               goto unref;
>         } else if (neigh->ah && neigh->ah->valid) {
>                 neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
>                                                 IPOIB_QPN(phdr->hwaddr));
> @@ -1168,17 +1201,16 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>                 neigh_refresh_path(neigh, phdr->hwaddr, dev);
>         }
>
> -       if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
> -               push_pseudo_header(skb, phdr->hwaddr);
> -               spin_lock_irqsave(&priv->lock, flags);
> -               __skb_queue_tail(&neigh->queue, skb);
> -               spin_unlock_irqrestore(&priv->lock, flags);
> -       } else {
> +       spin_lock_irqsave(&priv->lock, flags);
> +       deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
> +       spin_unlock_irqrestore(&priv->lock, flags);
> +
> +unref:
> +       if (!deferred_pkt) {
>                 ++dev->stats.tx_dropped;
>                 dev_kfree_skb_any(skb);
>         }
>
> -unref:
>         ipoib_neigh_put(neigh);
>
>         return NETDEV_TX_OK;
> --
> 2.12.3
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
  2018-08-20  6:36 ` Erez Shitrit
@ 2018-08-20 16:28   ` Jason Gunthorpe
  2018-08-20 17:40     ` Aaron Knister
  0 siblings, 1 reply; 11+ messages in thread
From: Jason Gunthorpe @ 2018-08-20 16:28 UTC (permalink / raw)
  To: Erez Shitrit
  Cc: Aaron Knister, RDMA mailing list, stable, Ira Weiny, John Fleck,
	Feras Daoud

On Mon, Aug 20, 2018 at 09:36:53AM +0300, Erez Shitrit wrote:
> Hi,
> 
> Did you check the option to hold the netif_tx_lock_xxx() in
> ipoib_cm_rep_handler function (over the line
> set_bit(IPOIB_FLAG_OPER_UP)) instead of in the send data path flow?

That does seem better, then the test_bit in the datapath could become
non-atomic too :)

Jason

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
  2018-08-20 16:28   ` Jason Gunthorpe
@ 2018-08-20 17:40     ` Aaron Knister
  2018-08-20 19:20       ` [non-nasa source] " Aaron Knister
  0 siblings, 1 reply; 11+ messages in thread
From: Aaron Knister @ 2018-08-20 17:40 UTC (permalink / raw)
  To: Jason Gunthorpe, Erez Shitrit
  Cc: RDMA mailing list, stable, Ira Weiny, John Fleck, Feras Daoud

On 8/20/18 12:28 PM, Jason Gunthorpe wrote:
> On Mon, Aug 20, 2018 at 09:36:53AM +0300, Erez Shitrit wrote:
>> Hi,
>>
>> Did you check the option to hold the netif_tx_lock_xxx() in
>> ipoib_cm_rep_handler function (over the line
>> set_bit(IPOIB_FLAG_OPER_UP)) instead of in the send data path flow?
> 
> That does seem better, then the test_bit in the datapath could become
> non-atomic too :)
> 
> Jason

Thanks for the feedback! I've not tried that but I certainly can.

-Aaron

-- 
Aaron Knister
NASA Center for Climate Simulation (Code 606.2)
Goddard Space Flight Center
(301) 286-2776

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [non-nasa source] Re: [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
  2018-08-20 17:40     ` Aaron Knister
@ 2018-08-20 19:20       ` Aaron Knister
  2018-08-23 23:25         ` Aaron Knister
  0 siblings, 1 reply; 11+ messages in thread
From: Aaron Knister @ 2018-08-20 19:20 UTC (permalink / raw)
  To: Jason Gunthorpe, Erez Shitrit
  Cc: RDMA mailing list, stable, Ira Weiny, John Fleck, Feras Daoud

It seems to fix the issue based on tests with my synthetic reproducer which consists of two carefully placed sleeps-- one in start_xmit and another before cm_rep_handler to force open the race window. In order to test it organically I need to commandeer many hundreds of nodes in our cluster which can be fairly disruptive to our user community. I'll send over v3 as an RFC and it would be great to get feedback about whether or not the patch is acceptable. If it is, I'll work on scheduling an at-scale test after which I'll submit v3. I think the odds are very low the synthetic reproducer would indicate this problem as fixed but the real world test would still experience the problem. I try to be thorough, though.

-Aaron

On 8/20/18 1:40 PM, Aaron Knister wrote:
> On 8/20/18 12:28 PM, Jason Gunthorpe wrote:
>> On Mon, Aug 20, 2018 at 09:36:53AM +0300, Erez Shitrit wrote:
>>> Hi,
>>>
>>> Did you check the option to hold the netif_tx_lock_xxx() in
>>> ipoib_cm_rep_handler function (over the line
>>> set_bit(IPOIB_FLAG_OPER_UP)) instead of in the send data path flow?
>>
>> That does seem better, then the test_bit in the datapath could become
>> non-atomic too :)
>>
>> Jason
> 
> Thanks for the feedback! I've not tried that but I certainly can.
> 
> -Aaron
> 

-- 
Aaron Knister
NASA Center for Climate Simulation (Code 606.2)
Goddard Space Flight Center
(301) 286-2776

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
  2018-08-20 19:20       ` [non-nasa source] " Aaron Knister
@ 2018-08-23 23:25         ` Aaron Knister
  2018-08-24 12:34           ` Aaron Knister
  0 siblings, 1 reply; 11+ messages in thread
From: Aaron Knister @ 2018-08-23 23:25 UTC (permalink / raw)
  To: Jason Gunthorpe, Erez Shitrit
  Cc: RDMA mailing list, stable, Ira Weiny, John Fleck, Feras Daoud

I had a small window to test the v3 patch (applied to the 4.4.120 
kernel) at scale and it does appear to fix the issue, so that's good. 
One area of concern, though, is the testing data suggests (and this 
isn't really scientific at all since there are many variables) the v3 
patch appears to be 10% slower than the v2 patch. The "organic" test 
involves creating a large number of empty files in a GPFS filesystem and 
exacerbating the race condition with GPFS RPC traffic. My sample sizes 
aren't large, though (26 iterations for the v2 patch and 8 for the v3 
patch due to time constraints). I don't really have a preference for 
either approach. What do folks think?

-Aaron

On 8/20/18 3:20 PM, Aaron Knister wrote:
> It seems to fix the issue based on tests with my synthetic reproducer 
> which consists of two carefully placed sleeps-- one in start_xmit and 
> another before cm_rep_handler to force open the race window. In order to 
> test it organically I need to commandeer many hundreds of nodes in our 
> cluster which can be fairly disruptive to our user community. I'll send 
> over v3 as an RFC and it would be great to get feedback about whether or 
> not the patch is acceptable. If it is, I'll work on scheduling an 
> at-scale test after which I'll submit v3. I think the odds are very low 
> the synthetic reproducer would indicate this problem as fixed but the 
> real world test would still experience the problem. I try to be 
> thorough, though.
> 
> -Aaron
> 
> On 8/20/18 1:40 PM, Aaron Knister wrote:
>> On 8/20/18 12:28 PM, Jason Gunthorpe wrote:
>>> On Mon, Aug 20, 2018 at 09:36:53AM +0300, Erez Shitrit wrote:
>>>> Hi,
>>>>
>>>> Did you check the option to hold the netif_tx_lock_xxx() in
>>>> ipoib_cm_rep_handler function (over the line
>>>> set_bit(IPOIB_FLAG_OPER_UP)) instead of in the send data path flow?
>>>
>>> That does seem better, then the test_bit in the datapath could become
>>> non-atomic too :)
>>>
>>> Jason
>>
>> Thanks for the feedback! I've not tried that but I certainly can.
>>
>> -Aaron
>>
> 

-- 
Aaron Knister
NASA Center for Climate Simulation (Code 606.2)
Goddard Space Flight Center
(301) 286-2776

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
  2018-08-23 23:25         ` Aaron Knister
@ 2018-08-24 12:34           ` Aaron Knister
  0 siblings, 0 replies; 11+ messages in thread
From: Aaron Knister @ 2018-08-24 12:34 UTC (permalink / raw)
  To: Jason Gunthorpe, Erez Shitrit
  Cc: RDMA mailing list, stable, Ira Weiny, John Fleck, Feras Daoud

I just had a chance to do additional testing of the v3 patch. Giving it 
30 new test iterations showed it's performance (based on the "organic" 
reproducer) to be on par with and marginally better than the v2 patch. 
With that said, I'll formally submit the v3 patch.

-Aaron

On 8/23/18 7:25 PM, Aaron Knister wrote:
> I had a small window to test the v3 patch (applied to the 4.4.120 
> kernel) at scale and it does appear to fix the issue, so that's good. 
> One area of concern, though, is the testing data suggests (and this 
> isn't really scientific at all since there are many variables) the v3 
> patch appears to be 10% slower than the v2 patch. The "organic" test 
> involves creating a large number of empty files in a GPFS filesystem and 
> exacerbating the race condition with GPFS RPC traffic. My sample sizes 
> aren't large, though (26 iterations for the v2 patch and 8 for the v3 
> patch due to time constraints). I don't really have a preference for 
> either approach. What do folks think?
> 
> -Aaron
> 
> On 8/20/18 3:20 PM, Aaron Knister wrote:
>> It seems to fix the issue based on tests with my synthetic reproducer 
>> which consists of two carefully placed sleeps-- one in start_xmit and 
>> another before cm_rep_handler to force open the race window. In order 
>> to test it organically I need to commandeer many hundreds of nodes in 
>> our cluster which can be fairly disruptive to our user community. I'll 
>> send over v3 as an RFC and it would be great to get feedback about 
>> whether or not the patch is acceptable. If it is, I'll work on 
>> scheduling an at-scale test after which I'll submit v3. I think the 
>> odds are very low the synthetic reproducer would indicate this problem 
>> as fixed but the real world test would still experience the problem. I 
>> try to be thorough, though.
>>
>> -Aaron
>>
>> On 8/20/18 1:40 PM, Aaron Knister wrote:
>>> On 8/20/18 12:28 PM, Jason Gunthorpe wrote:
>>>> On Mon, Aug 20, 2018 at 09:36:53AM +0300, Erez Shitrit wrote:
>>>>> Hi,
>>>>>
>>>>> Did you check the option to hold the netif_tx_lock_xxx() in
>>>>> ipoib_cm_rep_handler function (over the line
>>>>> set_bit(IPOIB_FLAG_OPER_UP)) instead of in the send data path flow?
>>>>
>>>> That does seem better, then the test_bit in the datapath could become
>>>> non-atomic too :)
>>>>
>>>> Jason
>>>
>>> Thanks for the feedback! I've not tried that but I certainly can.
>>>
>>> -Aaron
>>>
>>
> 

-- 
Aaron Knister
NASA Center for Climate Simulation (Code 606.2)
Goddard Space Flight Center
(301) 286-2776

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
  2018-08-17 21:31 [PATCH v2] avoid race condition between start_xmit and cm_rep_handler Aaron Knister
                   ` (2 preceding siblings ...)
  2018-08-20  6:36 ` Erez Shitrit
@ 2018-08-20 11:49 ` Estrin, Alex
  3 siblings, 0 replies; 11+ messages in thread
From: Estrin, Alex @ 2018-08-20 11:49 UTC (permalink / raw)
  To: Aaron Knister, linux-rdma@vger.kernel.org
  Cc: stable@vger.kernel.org, Weiny, Ira, Fleck, John

> 
> Inside of start_xmit() the call to check if the connection is up and the
> queueing of the packets for later transmission is not atomic which
> leaves a window where cm_rep_handler can run, set the connection up,
> dequeue pending packets and leave the subsequently queued packets by
> start_xmit() sitting on neigh->queue until they're dropped when the
> connection is torn down. This only applies to connected mode. These
> dropped packets can really upset TCP, for example,  and cause
> multi-minute delays in transmission for open connections.
> 
> Here's the code in start_xmit where we check to see if the connection
> is up:
> 
>        if (ipoib_cm_get(neigh)) {
>                if (ipoib_cm_up(neigh)) {
>                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>                        goto unref;
>                }
>        }
> 
> The race occurs if cm_rep_handler execution occurs after the above
> connection check (specifically if it gets to the point where it acquires
> priv->lock to dequeue pending skb's) but before the below code snippet
> in start_xmit where packets are queued.
> 
>        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
>                push_pseudo_header(skb, phdr->hwaddr);
>                spin_lock_irqsave(&priv->lock, flags);
>                __skb_queue_tail(&neigh->queue, skb);
>                spin_unlock_irqrestore(&priv->lock, flags);
>        } else {
>                ++dev->stats.tx_dropped;
>                dev_kfree_skb_any(skb);
>        }
> 
> The patch re-checks ipoib_cm_up with priv->lock held to avoid this
> race condition. Since odds are the conn should be up most of the time
> (and thus the connection *not* down most of the time) we don't hold the
> lock for the first check attempt to avoid a slowdown from unecessary
> locking for the majority of the packets transmitted during the
> connection's life.
> 
> Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>

Reviewed-by: Alex Estrin <alex.estrin@intel.com>

> ---
>  drivers/infiniband/ulp/ipoib/ipoib_main.c | 46 ++++++++++++++++++++++++++-
> ----
>  1 file changed, 39 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> index 26cde95b..a950c916 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -1093,6 +1093,21 @@ static void unicast_arp_send(struct sk_buff *skb,
> struct net_device *dev,
>  	spin_unlock_irqrestore(&priv->lock, flags);
>  }
> 
> +static bool defer_neigh_skb(struct sk_buff *skb,
> +			    struct net_device *dev,
> +			    struct ipoib_neigh *neigh,
> +			    struct ipoib_pseudo_header *phdr)
> +{
> +	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
> +		push_pseudo_header(skb, phdr->hwaddr);
> +		__skb_queue_tail(&neigh->queue, skb);
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +
>  static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>  	struct ipoib_dev_priv *priv = ipoib_priv(dev);
> @@ -1101,6 +1116,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb,
> struct net_device *dev)
>  	struct ipoib_pseudo_header *phdr;
>  	struct ipoib_header *header;
>  	unsigned long flags;
> +	bool deferred_pkt = true;
> 
>  	phdr = (struct ipoib_pseudo_header *) skb->data;
>  	skb_pull(skb, sizeof(*phdr));
> @@ -1160,6 +1176,23 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff
> *skb, struct net_device *dev)
>  			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>  			goto unref;
>  		}
> +		/*
> +		 * Re-check ipoib_cm_up with priv->lock held to avoid
> +		 * race condition between start_xmit and skb_dequeue in
> +		 * cm_rep_handler. Since odds are the conn should be up
> +		 * most of the time, we don't hold the lock for the
> +		 * first check above
> +		 */
> +		spin_lock_irqsave(&priv->lock, flags);
> +		if (ipoib_cm_up(neigh)) {
> +			spin_unlock_irqrestore(&priv->lock, flags);
> +			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
> +		} else {
> +			deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
> +			spin_unlock_irqrestore(&priv->lock, flags);
> +		}
> +
> +		goto unref;
>  	} else if (neigh->ah && neigh->ah->valid) {
>  		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
>  						IPOIB_QPN(phdr->hwaddr));
> @@ -1168,17 +1201,16 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff
> *skb, struct net_device *dev)
>  		neigh_refresh_path(neigh, phdr->hwaddr, dev);
>  	}
> 
> -	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
> -		push_pseudo_header(skb, phdr->hwaddr);
> -		spin_lock_irqsave(&priv->lock, flags);
> -		__skb_queue_tail(&neigh->queue, skb);
> -		spin_unlock_irqrestore(&priv->lock, flags);
> -	} else {
> +	spin_lock_irqsave(&priv->lock, flags);
> +	deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
> +	spin_unlock_irqrestore(&priv->lock, flags);
> +
> +unref:
> +	if (!deferred_pkt) {
>  		++dev->stats.tx_dropped;
>  		dev_kfree_skb_any(skb);
>  	}
> 
> -unref:
>  	ipoib_neigh_put(neigh);
> 
>  	return NETDEV_TX_OK;
> --
> 2.12.3

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2018-08-24 12:34 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-08-17 21:31 [PATCH v2] avoid race condition between start_xmit and cm_rep_handler Aaron Knister
2018-08-19 17:28 ` Leon Romanovsky
2018-08-20  4:38   ` Weiny, Ira
2018-08-20  4:34 ` Weiny, Ira
2018-08-20  6:36 ` Erez Shitrit
2018-08-20 16:28   ` Jason Gunthorpe
2018-08-20 17:40     ` Aaron Knister
2018-08-20 19:20       ` [non-nasa source] " Aaron Knister
2018-08-23 23:25         ` Aaron Knister
2018-08-24 12:34           ` Aaron Knister
2018-08-20 11:49 ` Estrin, Alex

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox