All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] NVMe: Fix possible scheduling while atomic error
@ 2016-05-17 21:37 Keith Busch
  2016-05-23 10:58 ` Christoph Hellwig
  0 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2016-05-17 21:37 UTC (permalink / raw)


Stopping a h/w queue could have scheduled the task to wait on cancelling
work. This patch removes stopping the the delayed work so we can safely
hold rcu_read_lock() while stopping h/w queues.

Since blk-mq requeue work may restart the h/w queues during a reset,
the nvme driver's queue_rq will stop them again if the driver receives
a command during the reset window.

Reported-by: Ming Lin <mlin at kernel.org>
[fixes 0bf77e9 nvme: switch to RCU freeing the namespace]
Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 drivers/nvme/host/core.c | 6 ++++--
 drivers/nvme/host/pci.c  | 6 ++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1a51584..1e05abf 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1823,7 +1823,6 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
 		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
 		spin_unlock_irq(ns->queue->queue_lock);
 
-		blk_mq_cancel_requeue_work(ns->queue);
 		blk_mq_stop_hw_queues(ns->queue);
 	}
 	rcu_read_unlock();
@@ -1836,7 +1835,10 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
-		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
+		spin_lock_irq(ns->queue->queue_lock);
+		queue_flag_clear(QUEUE_FLAG_STOPPED, ns->queue);
+		spin_unlock_irq(ns->queue->queue_lock);
+
 		blk_mq_start_stopped_hw_queues(ns->queue, true);
 		blk_mq_kick_requeue_list(ns->queue);
 	}
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8356813..9693629 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -609,6 +609,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	spin_unlock_irq(&nvmeq->q_lock);
 	return BLK_MQ_RQ_QUEUE_OK;
 out:
+	if (ret == BLK_MQ_RQ_QUEUE_BUSY) {
+		spin_lock_irq(ns->queue->queue_lock);
+		if (blk_queue_stopped(req->q))
+			blk_mq_stop_hw_queues(ns->queue);
+		spin_unlock_irq(ns->queue->queue_lock);
+	}
 	nvme_free_iod(dev, req);
 	return ret;
 }
-- 
2.7.2

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] NVMe: Fix possible scheduling while atomic error
  2016-05-17 21:37 [PATCH] NVMe: Fix possible scheduling while atomic error Keith Busch
@ 2016-05-23 10:58 ` Christoph Hellwig
  2016-05-23 13:41   ` Keith Busch
  0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2016-05-23 10:58 UTC (permalink / raw)


On Tue, May 17, 2016@03:37:42PM -0600, Keith Busch wrote:
>  		spin_unlock_irq(ns->queue->queue_lock);
>  
> -		blk_mq_cancel_requeue_work(ns->queue);
>  		blk_mq_stop_hw_queues(ns->queue);
>  	}
>  	rcu_read_unlock();
> @@ -1836,7 +1835,10 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
>  
>  	rcu_read_lock();
>  	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
> -		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
> +		spin_lock_irq(ns->queue->queue_lock);
> +		queue_flag_clear(QUEUE_FLAG_STOPPED, ns->queue);
> +		spin_unlock_irq(ns->queue->queue_lock);

What's the rationale for this change? 

> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -609,6 +609,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	spin_unlock_irq(&nvmeq->q_lock);
>  	return BLK_MQ_RQ_QUEUE_OK;
>  out:
> +	if (ret == BLK_MQ_RQ_QUEUE_BUSY) {
> +		spin_lock_irq(ns->queue->queue_lock);
> +		if (blk_queue_stopped(req->q))
> +			blk_mq_stop_hw_queues(ns->queue);
> +		spin_unlock_irq(ns->queue->queue_lock);

Shouldn't we do this where set the stopped flag on the queue instead?

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] NVMe: Fix possible scheduling while atomic error
  2016-05-23 10:58 ` Christoph Hellwig
@ 2016-05-23 13:41   ` Keith Busch
  2016-05-23 14:36     ` Christoph Hellwig
  0 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2016-05-23 13:41 UTC (permalink / raw)


On Mon, May 23, 2016@03:58:07AM -0700, Christoph Hellwig wrote:
> On Tue, May 17, 2016@03:37:42PM -0600, Keith Busch wrote:
> >  	rcu_read_lock();
> >  	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
> > -		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
> > +		spin_lock_irq(ns->queue->queue_lock);
> > +		queue_flag_clear(QUEUE_FLAG_STOPPED, ns->queue);
> > +		spin_unlock_irq(ns->queue->queue_lock);
> 
> What's the rationale for this change? 

That was to make it so nvme_queue_rq wouldn't see a stopped flag just
before it was starting.
 
> > @@ -609,6 +609,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
> >  	spin_unlock_irq(&nvmeq->q_lock);
> >  	return BLK_MQ_RQ_QUEUE_OK;
> >  out:
> > +	if (ret == BLK_MQ_RQ_QUEUE_BUSY) {
> > +		spin_lock_irq(ns->queue->queue_lock);
> > +		if (blk_queue_stopped(req->q))
> > +			blk_mq_stop_hw_queues(ns->queue);
> > +		spin_unlock_irq(ns->queue->queue_lock);
> 
> Shouldn't we do this where set the stopped flag on the queue instead?

It's moved out of that function holding the rcu read lock since
blk_mq_stop_hw_queues potentially sleeps.

But the above is also broken because it's holding a spinlock, so this
patch is no good.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] NVMe: Fix possible scheduling while atomic error
  2016-05-23 13:41   ` Keith Busch
@ 2016-05-23 14:36     ` Christoph Hellwig
  2016-05-23 14:55       ` Keith Busch
  0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2016-05-23 14:36 UTC (permalink / raw)


On Mon, May 23, 2016@09:41:16AM -0400, Keith Busch wrote:
> On Mon, May 23, 2016@03:58:07AM -0700, Christoph Hellwig wrote:
> > On Tue, May 17, 2016@03:37:42PM -0600, Keith Busch wrote:
> > >  	rcu_read_lock();
> > >  	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
> > > -		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
> > > +		spin_lock_irq(ns->queue->queue_lock);
> > > +		queue_flag_clear(QUEUE_FLAG_STOPPED, ns->queue);
> > > +		spin_unlock_irq(ns->queue->queue_lock);
> > 
> > What's the rationale for this change? 
> 
> That was to make it so nvme_queue_rq wouldn't see a stopped flag just
> before it was starting.

I don't really see how that helps us with any race..

> > > @@ -609,6 +609,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
> > >  	spin_unlock_irq(&nvmeq->q_lock);
> > >  	return BLK_MQ_RQ_QUEUE_OK;
> > >  out:
> > > +	if (ret == BLK_MQ_RQ_QUEUE_BUSY) {
> > > +		spin_lock_irq(ns->queue->queue_lock);
> > > +		if (blk_queue_stopped(req->q))
> > > +			blk_mq_stop_hw_queues(ns->queue);
> > > +		spin_unlock_irq(ns->queue->queue_lock);
> > 
> > Shouldn't we do this where set the stopped flag on the queue instead?
> 
> It's moved out of that function holding the rcu read lock since
> blk_mq_stop_hw_queues potentially sleeps.

blk_mq_stop_hw_queues is a loop around blk_mq_stop_hw_queue,
which itself does two calls to cancel_delayed_work and a set_bit.
None of them can block.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] NVMe: Fix possible scheduling while atomic error
  2016-05-23 14:36     ` Christoph Hellwig
@ 2016-05-23 14:55       ` Keith Busch
  2016-05-24 18:43         ` Ming Lin
  0 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2016-05-23 14:55 UTC (permalink / raw)


On Mon, May 23, 2016@07:36:31AM -0700, Christoph Hellwig wrote:
> On Mon, May 23, 2016@09:41:16AM -0400, Keith Busch wrote:
> > On Mon, May 23, 2016@03:58:07AM -0700, Christoph Hellwig wrote:
> > > On Tue, May 17, 2016@03:37:42PM -0600, Keith Busch wrote:
> > > >  	rcu_read_lock();
> > > >  	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
> > > > -		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
> > > > +		spin_lock_irq(ns->queue->queue_lock);
> > > > +		queue_flag_clear(QUEUE_FLAG_STOPPED, ns->queue);
> > > > +		spin_unlock_irq(ns->queue->queue_lock);
> > > 
> > > What's the rationale for this change? 
> > 
> > That was to make it so nvme_queue_rq wouldn't see a stopped flag just
> > before it was starting.
> 
> I don't really see how that helps us with any race..

Without the additional spin lock protecting QUEUE_FLAG_STOPPED:

  CPU A                                      CPU B
  -----                                      -----
  nvme_start_queues                          nvme_queue_rq
                                              if (nvmeq->cq_vector < 0) <--  queue is disabled during reset, returning BLK_MQ_RQ_QUEUE_BUSY
                                                 goto out;
                                              if (blk_queue_stopped())  <--  returns true
    queue_flag_clear
    blk_mq_start_stopped_hw_queues
                                                blk_mq_stop_hw_queues() <--  incorrectly stops h/w queues

Locking the queue in nvme_start_queues before clearing the flag fixes
that up.


> > It's moved out of that function holding the rcu read lock since
> > blk_mq_stop_hw_queues potentially sleeps.
> 
> blk_mq_stop_hw_queues is a loop around blk_mq_stop_hw_queue,
> which itself does two calls to cancel_delayed_work and a set_bit.
> None of them can block.

Oops, I'm crossing wires and mixing up what API's do what ... need to
wake up before starting work.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] NVMe: Fix possible scheduling while atomic error
  2016-05-23 14:55       ` Keith Busch
@ 2016-05-24 18:43         ` Ming Lin
  2016-05-24 19:59           ` Keith Busch
  0 siblings, 1 reply; 12+ messages in thread
From: Ming Lin @ 2016-05-24 18:43 UTC (permalink / raw)


On Mon, May 23, 2016@7:55 AM, Keith Busch <keith.busch@intel.com> wrote:
>
>> > It's moved out of that function holding the rcu read lock since
>> > blk_mq_stop_hw_queues potentially sleeps.
>>
>> blk_mq_stop_hw_queues is a loop around blk_mq_stop_hw_queue,
>> which itself does two calls to cancel_delayed_work and a set_bit.
>> None of them can block.

It's blk_mq_cancel_requeue_work() that potentially sleeps.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] NVMe: Fix possible scheduling while atomic error
  2016-05-24 18:43         ` Ming Lin
@ 2016-05-24 19:59           ` Keith Busch
  2016-05-25  8:18             ` Christoph Hellwig
  0 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2016-05-24 19:59 UTC (permalink / raw)


On Tue, May 24, 2016@11:43:21AM -0700, Ming Lin wrote:
> 
> It's blk_mq_cancel_requeue_work() that potentially sleeps.

Right, I don't know what I was thinking... 

AFAICT, the patch is fine as-is if Christoph is okay with it.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] NVMe: Fix possible scheduling while atomic error
  2016-05-24 19:59           ` Keith Busch
@ 2016-05-25  8:18             ` Christoph Hellwig
  2016-05-25 17:57               ` Keith Busch
  0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2016-05-25  8:18 UTC (permalink / raw)


On Tue, May 24, 2016@03:59:21PM -0400, Keith Busch wrote:
> On Tue, May 24, 2016@11:43:21AM -0700, Ming Lin wrote:
> > 
> > It's blk_mq_cancel_requeue_work() that potentially sleeps.
> 
> Right, I don't know what I was thinking... 
> 
> AFAICT, the patch is fine as-is if Christoph is okay with it.

I still don't why we need to delay the call to blk_mq_stop_hw_queues
to ->queue_rq.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] NVMe: Fix possible scheduling while atomic error
  2016-05-25  8:18             ` Christoph Hellwig
@ 2016-05-25 17:57               ` Keith Busch
  2016-05-27  7:40                 ` Christoph Hellwig
  0 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2016-05-25 17:57 UTC (permalink / raw)


On Wed, May 25, 2016@01:18:42AM -0700, Christoph Hellwig wrote:
> On Tue, May 24, 2016@03:59:21PM -0400, Keith Busch wrote:
> > On Tue, May 24, 2016@11:43:21AM -0700, Ming Lin wrote:
> > > 
> > > It's blk_mq_cancel_requeue_work() that potentially sleeps.
> > 
> > Right, I don't know what I was thinking... 
> > 
> > AFAICT, the patch is fine as-is if Christoph is okay with it.
> 
> I still don't why we need to delay the call to blk_mq_stop_hw_queues
> to ->queue_rq.

Eh? It's still stopped in the same place as always, so it's not
delayed. We just call it again from queue_rq because requeue_work can
start h/w queues on its own. This is very similar to scsi's queue_rq
in scsi_lib.c.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] NVMe: Fix possible scheduling while atomic error
  2016-05-25 17:57               ` Keith Busch
@ 2016-05-27  7:40                 ` Christoph Hellwig
  2016-06-08 11:17                   ` Sagi Grimberg
  0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2016-05-27  7:40 UTC (permalink / raw)


On Wed, May 25, 2016@01:57:37PM -0400, Keith Busch wrote:
> > > On Tue, May 24, 2016@11:43:21AM -0700, Ming Lin wrote:
> > > > 
> > > > It's blk_mq_cancel_requeue_work() that potentially sleeps.
> > > 
> > > Right, I don't know what I was thinking... 
> > > 
> > > AFAICT, the patch is fine as-is if Christoph is okay with it.
> > 
> > I still don't why we need to delay the call to blk_mq_stop_hw_queues
> > to ->queue_rq.
> 
> Eh? It's still stopped in the same place as always, so it's not
> delayed. We just call it again from queue_rq because requeue_work can
> start h/w queues on its own. This is very similar to scsi's queue_rq
> in scsi_lib.c.

The rationale in SCSI is that we stop the queue when we are above
the hosts or targets queuing limit, and only I/O ocmpletions will
wake it up again.  Kicking of the sotpped queues in blk_mq_requeue_work
is SCSI specific behavior that leaked into the core, and it's my fault..

Well' let's get the fix in as-is for now, but I really need to sort
this out properly, so maybe add some comments.

> 
> _______________________________________________
> Linux-nvme mailing list
> Linux-nvme at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme
---end quoted text---

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] NVMe: Fix possible scheduling while atomic error
  2016-05-27  7:40                 ` Christoph Hellwig
@ 2016-06-08 11:17                   ` Sagi Grimberg
  2016-06-08 14:43                     ` Keith Busch
  0 siblings, 1 reply; 12+ messages in thread
From: Sagi Grimberg @ 2016-06-08 11:17 UTC (permalink / raw)



>>>> AFAICT, the patch is fine as-is if Christoph is okay with it.
>>>
>>> I still don't why we need to delay the call to blk_mq_stop_hw_queues
>>> to ->queue_rq.
>>
>> Eh? It's still stopped in the same place as always, so it's not
>> delayed. We just call it again from queue_rq because requeue_work can
>> start h/w queues on its own. This is very similar to scsi's queue_rq
>> in scsi_lib.c.
>
> The rationale in SCSI is that we stop the queue when we are above
> the hosts or targets queuing limit, and only I/O ocmpletions will
> wake it up again.  Kicking of the sotpped queues in blk_mq_requeue_work
> is SCSI specific behavior that leaked into the core, and it's my fault..
>
> Well' let's get the fix in as-is for now, but I really need to sort
> this out properly, so maybe add some comments.

I really don't like this patch (sorry), having queue_rq being aware
of what requeue_work *might* do looks backwards to me...
now I'm thinking what I need to do in fabrics rdma and loop. This is
why I didn't like the NVME_NS_DEAD check in queue_rq as well.

I'd really prefer to not propagate this backwards logic into
other transports...

But, if this is mandatory for now, does this patch makes sense for
rdma?
-- 
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 938e7d55c4a8..60aaa1b4d021 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1450,6 +1450,12 @@ static int nvme_rdma_queue_rq(struct 
blk_mq_hw_ctx *hctx,
                 goto err;
         }

+       /* XXX: This is really not the correct layer to check this */
+       if (ns && !test_bit(NVME_NS_DEAD, &ns->flags)) {
+               ret = -EAGAIN;
+               goto err;
+       }
+
         ib_dma_sync_single_for_device(dev, sqe->dma,
                         sizeof(struct nvme_command), DMA_TO_DEVICE);

@@ -1464,8 +1470,14 @@ static int nvme_rdma_queue_rq(struct 
blk_mq_hw_ctx *hctx,

         return BLK_MQ_RQ_QUEUE_OK;
  err:
-       return (ret == -ENOMEM || ret == -EAGAIN) ?
-               BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR;
+       if (ret == -ENOMEM || ret == -EAGAIN) {
+               spin_lock_irq(ns->queue->queue_lock);
+               if (blk_queue_stopped(rq->q))
+                       blk_mq_stop_hw_queues(ns->queue);
+               spin_unlock_irq(ns->queue->queue_lock);
+               return BLK_MQ_RQ_QUEUE_BUSY;
+       }
+       return BLK_MQ_RQ_QUEUE_ERROR;
  }

  static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
-- 

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] NVMe: Fix possible scheduling while atomic error
  2016-06-08 11:17                   ` Sagi Grimberg
@ 2016-06-08 14:43                     ` Keith Busch
  0 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2016-06-08 14:43 UTC (permalink / raw)


On Wed, Jun 08, 2016@02:17:03PM +0300, Sagi Grimberg wrote:
> I really don't like this patch (sorry), having queue_rq being aware
> of what requeue_work *might* do looks backwards to me...
> now I'm thinking what I need to do in fabrics rdma and loop. This is
> why I didn't like the NVME_NS_DEAD check in queue_rq as well.

I agree, but I didn't find another card we can play to get the right
sequence.
 
> I'd really prefer to not propagate this backwards logic into
> other transports...
> 
> But, if this is mandatory for now, does this patch makes sense for
> rdma?

Looks right to me.


> -- 
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 938e7d55c4a8..60aaa1b4d021 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -1450,6 +1450,12 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx
> *hctx,
>                 goto err;
>         }
> 
> +       /* XXX: This is really not the correct layer to check this */
> +       if (ns && !test_bit(NVME_NS_DEAD, &ns->flags)) {
> +               ret = -EAGAIN;
> +               goto err;
> +       }
> +
>         ib_dma_sync_single_for_device(dev, sqe->dma,
>                         sizeof(struct nvme_command), DMA_TO_DEVICE);
> 
> @@ -1464,8 +1470,14 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx
> *hctx,
> 
>         return BLK_MQ_RQ_QUEUE_OK;
>  err:
> -       return (ret == -ENOMEM || ret == -EAGAIN) ?
> -               BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR;
> +       if (ret == -ENOMEM || ret == -EAGAIN) {
> +               spin_lock_irq(ns->queue->queue_lock);
> +               if (blk_queue_stopped(rq->q))
> +                       blk_mq_stop_hw_queues(ns->queue);
> +               spin_unlock_irq(ns->queue->queue_lock);
> +               return BLK_MQ_RQ_QUEUE_BUSY;
> +       }
> +       return BLK_MQ_RQ_QUEUE_ERROR;
>  }
> 
>  static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
> -- 

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2016-06-08 14:43 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-05-17 21:37 [PATCH] NVMe: Fix possible scheduling while atomic error Keith Busch
2016-05-23 10:58 ` Christoph Hellwig
2016-05-23 13:41   ` Keith Busch
2016-05-23 14:36     ` Christoph Hellwig
2016-05-23 14:55       ` Keith Busch
2016-05-24 18:43         ` Ming Lin
2016-05-24 19:59           ` Keith Busch
2016-05-25  8:18             ` Christoph Hellwig
2016-05-25 17:57               ` Keith Busch
2016-05-27  7:40                 ` Christoph Hellwig
2016-06-08 11:17                   ` Sagi Grimberg
2016-06-08 14:43                     ` Keith Busch

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.