Linux block layer

Linux block layer
 help / color / mirror / Atom feed

* [PATCH v4 3/5] blk-mq: export helpers
From: Omar Sandoval @ 2017-04-14  8:00 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: kernel-team
In-Reply-To: <cover.1492156558.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

blk_mq_finish_request() is required for schedulers that define their own
put_request(). blk_mq_run_hw_queue() is required for schedulers that
hold back requests to be run later.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 block/blk-mq.c         | 2 ++
 include/linux/blk-mq.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index e536dacfae4c..7138cd98146e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -368,6 +368,7 @@ void blk_mq_finish_request(struct request *rq)
 {
 	blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
 }
+EXPORT_SYMBOL_GPL(blk_mq_finish_request);
 
 void blk_mq_free_request(struct request *rq)
 {
@@ -1183,6 +1184,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
 	__blk_mq_delay_run_hw_queue(hctx, async, 0);
 }
+EXPORT_SYMBOL(blk_mq_run_hw_queue);
 
 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b90c3d5766cd..d75de612845d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -238,6 +238,7 @@ void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
-- 
2.12.2

^ permalink raw reply related

* [PATCH v4 1/5] sbitmap: add sbitmap_get_shallow() operation
From: Omar Sandoval @ 2017-04-14  7:59 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: kernel-team
In-Reply-To: <cover.1492156558.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

This operation supports the use case of limiting the number of bits that
can be allocated for a given operation. Rather than setting aside some
bits at the end of the bitmap, we can set aside bits in each word of the
bitmap. This means we can keep the allocation hints spread out and
support sbitmap_resize() nicely at the cost of lower granularity for the
allowed depth.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 include/linux/sbitmap.h | 55 ++++++++++++++++++++++++++++++++++++
 lib/sbitmap.c           | 75 ++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 123 insertions(+), 7 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index d4e0a204c118..a1904aadbc45 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -176,6 +176,25 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
 int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin);
 
 /**
+ * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
+ * limiting the depth used from each word.
+ * @sb: Bitmap to allocate from.
+ * @alloc_hint: Hint for where to start searching for a free bit.
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ *
+ * This rather specific operation allows for having multiple users with
+ * different allocation limits. E.g., there can be a high-priority class that
+ * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
+ * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
+ * class can only allocate half of the total bits in the bitmap, preventing it
+ * from starving out the high-priority class.
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
+			unsigned long shallow_depth);
+
+/**
  * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
  * @sb: Bitmap to check.
  *
@@ -326,6 +345,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
 int __sbitmap_queue_get(struct sbitmap_queue *sbq);
 
 /**
+ * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
+ * sbitmap_queue, limiting the depth used from each word, with preemption
+ * already disabled.
+ * @sbq: Bitmap queue to allocate from.
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ * See sbitmap_get_shallow().
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+				unsigned int shallow_depth);
+
+/**
  * sbitmap_queue_get() - Try to allocate a free bit from a &struct
  * sbitmap_queue.
  * @sbq: Bitmap queue to allocate from.
@@ -346,6 +378,29 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
 }
 
 /**
+ * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
+ * sbitmap_queue, limiting the depth used from each word.
+ * @sbq: Bitmap queue to allocate from.
+ * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
+ *       sbitmap_queue_clear()).
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ * See sbitmap_get_shallow().
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+					    unsigned int *cpu,
+					    unsigned int shallow_depth)
+{
+	int nr;
+
+	*cpu = get_cpu();
+	nr = __sbitmap_queue_get_shallow(sbq, shallow_depth);
+	put_cpu();
+	return nr;
+}
+
+/**
  * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
  * &struct sbitmap_queue.
  * @sbq: Bitmap to free from.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 60e800e0b5a0..80aa8d5463fa 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -79,15 +79,15 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
 }
 EXPORT_SYMBOL_GPL(sbitmap_resize);
 
-static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
-			      bool wrap)
+static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
+			      unsigned int hint, bool wrap)
 {
 	unsigned int orig_hint = hint;
 	int nr;
 
 	while (1) {
-		nr = find_next_zero_bit(&word->word, word->depth, hint);
-		if (unlikely(nr >= word->depth)) {
+		nr = find_next_zero_bit(word, depth, hint);
+		if (unlikely(nr >= depth)) {
 			/*
 			 * We started with an offset, and we didn't reset the
 			 * offset to 0 in a failure case, so start from 0 to
@@ -100,11 +100,11 @@ static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
 			return -1;
 		}
 
-		if (!test_and_set_bit(nr, &word->word))
+		if (!test_and_set_bit(nr, word))
 			break;
 
 		hint = nr + 1;
-		if (hint >= word->depth - 1)
+		if (hint >= depth - 1)
 			hint = 0;
 	}
 
@@ -119,7 +119,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 	index = SB_NR_TO_INDEX(sb, alloc_hint);
 
 	for (i = 0; i < sb->map_nr; i++) {
-		nr = __sbitmap_get_word(&sb->map[index],
+		nr = __sbitmap_get_word(&sb->map[index].word,
+					sb->map[index].depth,
 					SB_NR_TO_BIT(sb, alloc_hint),
 					!round_robin);
 		if (nr != -1) {
@@ -141,6 +142,37 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 }
 EXPORT_SYMBOL_GPL(sbitmap_get);
 
+int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
+			unsigned long shallow_depth)
+{
+	unsigned int i, index;
+	int nr = -1;
+
+	index = SB_NR_TO_INDEX(sb, alloc_hint);
+
+	for (i = 0; i < sb->map_nr; i++) {
+		nr = __sbitmap_get_word(&sb->map[index].word,
+					min(sb->map[index].depth, shallow_depth),
+					SB_NR_TO_BIT(sb, alloc_hint), true);
+		if (nr != -1) {
+			nr += index << sb->shift;
+			break;
+		}
+
+		/* Jump to next index. */
+		index++;
+		alloc_hint = index << sb->shift;
+
+		if (index >= sb->map_nr) {
+			index = 0;
+			alloc_hint = 0;
+		}
+	}
+
+	return nr;
+}
+EXPORT_SYMBOL_GPL(sbitmap_get_shallow);
+
 bool sbitmap_any_bit_set(const struct sbitmap *sb)
 {
 	unsigned int i;
@@ -342,6 +374,35 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq)
 }
 EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
 
+int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+				unsigned int shallow_depth)
+{
+	unsigned int hint, depth;
+	int nr;
+
+	hint = this_cpu_read(*sbq->alloc_hint);
+	depth = READ_ONCE(sbq->sb.depth);
+	if (unlikely(hint >= depth)) {
+		hint = depth ? prandom_u32() % depth : 0;
+		this_cpu_write(*sbq->alloc_hint, hint);
+	}
+	nr = sbitmap_get_shallow(&sbq->sb, hint, shallow_depth);
+
+	if (nr == -1) {
+		/* If the map is full, a hint won't do us much good. */
+		this_cpu_write(*sbq->alloc_hint, 0);
+	} else if (nr == hint || unlikely(sbq->round_robin)) {
+		/* Only update the hint if we used it. */
+		hint = nr + 1;
+		if (hint >= depth - 1)
+			hint = 0;
+		this_cpu_write(*sbq->alloc_hint, hint);
+	}
+
+	return nr;
+}
+EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);
+
 static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 {
 	int i, wake_index;
-- 
2.12.2

^ permalink raw reply related

* [PATCH v4 2/5] blk-mq: add shallow depth option for blk_mq_get_tag()
From: Omar Sandoval @ 2017-04-14  7:59 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: kernel-team
In-Reply-To: <cover.1492156558.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

Wire up the sbitmap_get_shallow() operation to the tag code so that a
caller can limit the number of tags available to it.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 block/blk-mq-tag.c | 5 ++++-
 block/blk-mq.h     | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9d97bfc4d465..d0be72ccb091 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -96,7 +96,10 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
 	if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
 	    !hctx_may_queue(data->hctx, bt))
 		return -1;
-	return __sbitmap_queue_get(bt);
+	if (data->shallow_depth)
+		return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
+	else
+		return __sbitmap_queue_get(bt);
 }
 
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 7e6f2e467696..524f44742816 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -141,6 +141,7 @@ struct blk_mq_alloc_data {
 	/* input parameter */
 	struct request_queue *q;
 	unsigned int flags;
+	unsigned int shallow_depth;
 
 	/* input & output parameter */
 	struct blk_mq_ctx *ctx;
-- 
2.12.2

^ permalink raw reply related

* [PATCH v4 0/5] blk-mq: Kyber multiqueue I/O scheduler
From: Omar Sandoval @ 2017-04-14  7:59 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: kernel-team

From: Omar Sandoval <osandov@fb.com>

This is v4 of Kyber, an I/O scheduler for multiqueue devices combining
several techniques: the scalable bitmap library, the new blk-stats API,
and queue depth throttling similar to blk-wbt. v1 was here [1], v2 was
here [2], v3 was here [3].

v4 fixes a hang in v3 caused by a race condition in the wait queue
handling in kyber_get_domain_token().

This series is based on block/for-next. Patches 1 and 2 implement a new
sbitmap operation. Patch 3 exports a couple of helpers. Patch 4 moves a
scheduler callback to somewhere more useful. Patch 5 implements the new
scheduler.

Thanks!

1: http://marc.info/?l=linux-block&m=148978871820916&w=2
2: http://marc.info/?l=linux-block&m=149132467510945&w=2
3: http://marc.info/?l=linux-block&m=149180713919341&w=2

Omar Sandoval (5):
  sbitmap: add sbitmap_get_shallow() operation
  blk-mq: add shallow depth option for blk_mq_get_tag()
  blk-mq: export helpers
  blk-mq-sched: make completed_request() callback more useful
  blk-mq: introduce Kyber multiqueue I/O scheduler

 Documentation/block/kyber-iosched.txt |  14 +
 block/Kconfig.iosched                 |   9 +
 block/Makefile                        |   1 +
 block/blk-mq-sched.h                  |  11 +-
 block/blk-mq-tag.c                    |   5 +-
 block/blk-mq.c                        |   7 +-
 block/blk-mq.h                        |   1 +
 block/kyber-iosched.c                 | 719 ++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h                |   1 +
 include/linux/elevator.h              |   2 +-
 include/linux/sbitmap.h               |  55 +++
 lib/sbitmap.c                         |  75 +++-
 12 files changed, 882 insertions(+), 18 deletions(-)
 create mode 100644 Documentation/block/kyber-iosched.txt
 create mode 100644 block/kyber-iosched.c

-- 
2.12.2

^ permalink raw reply

* Re: [PATCH 1/6] blk-mq: Do not invoke queue operations on a dead queue
From: Omar Sandoval @ 2017-04-14  7:40 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: hare@suse.com, linux-block@vger.kernel.org, osandov@fb.com,
	axboe@kernel.dk
In-Reply-To: <1492124731.2723.1.camel@sandisk.com>

On Thu, Apr 13, 2017 at 11:05:32PM +0000, Bart Van Assche wrote:
> On Thu, 2017-04-13 at 16:01 -0700, Omar Sandoval wrote:
> > On Tue, Apr 11, 2017 at 01:58:37PM -0700, Bart Van Assche wrote:
> > > The blk-mq debugfs attributes are removed after blk_cleanup_queue()
> > > has finished. Since running a queue after a queue has entered the
> > > "dead" state is not allowed, disallow this. This patch avoids that
> > > an attempt to run a dead queue triggers a kernel crash.
> > > 
> > > Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> > > Cc: Omar Sandoval <osandov@fb.com>
> > > Cc: Hannes Reinecke <hare@suse.com>
> > > ---
> > >  block/blk-mq-debugfs.c | 8 ++++++++
> > >  1 file changed, 8 insertions(+)
> > > 
> > > diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> > > index df9b688b877c..a1ce823578c7 100644
> > > --- a/block/blk-mq-debugfs.c
> > > +++ b/block/blk-mq-debugfs.c
> > > @@ -111,6 +111,14 @@ static ssize_t blk_queue_flags_store(struct file *file, const char __user *ubuf,
> > >  	struct request_queue *q = file_inode(file)->i_private;
> > >  	char op[16] = { }, *s;
> > >  
> > > +	/*
> > > +	 * The debugfs attributes are removed after blk_cleanup_queue() has
> > > +	 * called blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set
> > > +	 * to avoid triggering a use-after-free.
> > > +	 */
> > > +	if (blk_queue_dead(q))
> > > +		return -ENOENT;
> > > +
> > >  	len = min(len, sizeof(op) - 1);
> > >  	if (copy_from_user(op, ubuf, len))
> > >  		return -EFAULT;
> > 
> > Looking at this, I think we have similar issues with most of the other
> > debugfs files. Should we move the debugfs cleanup earlier?
> 
> Hello Omar,
> 
> That's a good question. However, while I was debugging it was very convenient
> to be able to access the queue state after it had reached the "dead" state.
> Performing the cleanup earlier would be an alternative solution but would
> make debugging a bit harder ...
> 
> Bart.

What useful information were you getting out of debugfs once the queue
was already dead? Wasn't the interesting stuff freed at that point?

^ permalink raw reply

* Re: RFC: drop the T10 OSD code and its users
From: Martin K. Petersen @ 2017-04-14  2:27 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: ooo, martin.petersen, trond.myklebust, axboe, osd-dev, linux-nfs,
	linux-scsi, linux-block, linux-kernel
In-Reply-To: <20170412160109.10598-1-hch@lst.de>


Christoph,

> The only real user of the T10 OSD protocol, the pNFS object layout
> driver never went to the point of having shipping products, and the
> other two users (osdblk and exofs) were simple example of it's usage.
>
> The code has been mostly unmaintained for years and is getting in the
> way of block / SCSI changes, so I think it's finally time to drop it.

No particular objections from me.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: remove REQ_OP_WRITE_SAME
From: Martin K. Petersen @ 2017-04-14  2:23 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, martin.petersen, philipp.reisner, lars.ellenberg,
	target-devel, linux-block, linux-scsi, drbd-dev, dm-devel
In-Reply-To: <20170412084809.8245-1-hch@lst.de>

Christoph,

> Now that we are using REQ_OP_WRITE_ZEROES for all zeroing needs in the
> kernel there is very little use left for REQ_OP_WRITE_SAME.  We only
> have two callers left, and both just export optional protocol features
> to remote systems: DRBD and the target code.

While I'm not particularly married to WRITE SAME, I do think it's a
shame that the RAID5/6 code never started using it. It does make a
difference when initializing RAID sets.

The other thing that keeps me a bit on the fence is that a bunch of the
plumbing to handle a bio with a payload different from bi_size is needed
for the copy offload token. I'm hoping to have those patches ready for
4.13. Right now there are a bunch of places where handling of
REQ_OP_COPY_IN and REQ_OP_COPY_OUT share conditionals with WRITE SAME.

So I suggest postponing the decision about whether to rip out WRITE SAME
until I finish the token stuff.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH] block: fix bio_will_gap()
From: Ming Lei @ 2017-04-14  2:04 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: axboe@fb.com, hch@infradead.org, linux-block@vger.kernel.org,
	jthumshirn@suse.de, jth@kernel.org
In-Reply-To: <1492100540.5176.6.camel@sandisk.com>

On Thu, Apr 13, 2017 at 04:22:22PM +0000, Bart Van Assche wrote:
> On Fri, 2017-04-14 at 00:06 +0800, Ming Lei wrote:
> > But if one rq starts with non-aligned buffer(the 1st bvec's
> > bv_offset isn't zero) and if we allow to merge, it is quite
> > difficult to respect sg gap limit, especially the segment
> > can't be at maximum segment size, otherwise the segment
> > ends in unaligned virt boundary. This patch trys to avoid the
> > issue by not allowing to merge if the req starts with non-aligned
> > buffer.
> 
> Hello Ming,
> 
> Why is it considered difficult to detect whether or not a gap exists if
> the offset of the first bio is non-zero? Please note that a thoroughly
> tested version of gap detection code that supports non-zero offsets for
> the first element is already upstream. See also ib_map_mr_sg() and
> ib_sg_to_pages() in drivers/infiniband/core/verbs.c.

I don't know infiniband much, but from the code, it is just about
sg, and looks not same with block's sg gap limit.

The sg gap limit in block layer actually means:
	- except for last segment, every segment has to end in aligned virt boundary
	- from the 2nd segment, offset of the segment has to be zero

But we don't store main segment info(start, size) in bio/req,
all the segments are basically computed in the flight, so it isn't
easy to check/handle sg gap. Also IMO segment handling of block layer is
quite fragile, and hope multi-page bvec can improve the case much.

We relax check for sg gap for allowing merging tons of small bios, if
these bios are physically contiguous. But if offset of the 1st bio
in the merged req isn't zero, current segment compution in
__blk_segment_map_sg() doesn't consider sg gap, so one segment may
ends in unaligned virt boundary.

Given it is late in v4.11 cycle, this patch doesn't want to touch
__blk_segment_map_sg() for avoiding regression risk, instead just
changing bio_will_gap() for avoding the case with minimized risk,
because the change is quite simple.

Thanks,
Ming

^ permalink raw reply

* Re: [PATCH v4 6/6] dm rq: Avoid that request processing stalls sporadically
From: Ming Lei @ 2017-04-14  1:13 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-scsi@vger.kernel.org, dm-devel@redhat.com,
	linux-block@vger.kernel.org, snitzer@redhat.com, axboe@kernel.dk
In-Reply-To: <25ebc19f-f6e0-2f71-f590-8416359d2fd1@sandisk.com>

On Thu, Apr 13, 2017 at 09:59:57AM -0700, Bart Van Assche wrote:
> On 04/12/17 19:20, Ming Lei wrote:
> > On Wed, Apr 12, 2017 at 06:38:07PM +0000, Bart Van Assche wrote:
> >> If the blk-mq core would always rerun a hardware queue if a block driver
> >> returns BLK_MQ_RQ_QUEUE_BUSY then that would cause 100% of a single CPU core
> > 
> > It won't casue 100% CPU utilization since we restart queue in completion
> > path and at that time at least one tag is available, then progress can be
> > made.
> 
> Hello Ming,
> 
> Sorry but you are wrong. If .queue_rq() returns BLK_MQ_RQ_QUEUE_BUSY
> then it's likely that calling .queue_rq() again after only a few
> microseconds will cause it to return BLK_MQ_RQ_QUEUE_BUSY again. If you
> don't believe me, change "if (!blk_mq_sched_needs_restart(hctx) &&
> !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) blk_mq_run_hw_queue(hctx,
> true);" into "blk_mq_run_hw_queue(hctx, true);", trigger a busy

Yes, that can be true, but I mean it is still OK to run the queue again
with

	if (!blk_mq_sched_needs_restart(hctx) &&
	    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
			blk_mq_run_hw_queue(hctx, true);

and restarting queue in __blk_mq_finish_request() when
BLK_MQ_RQ_QUEUE_BUSY is returned from .queue_rq(). And both are in current
blk-mq implementation.

Then why do we need blk_mq_delay_run_hw_queue(hctx, 100/*ms*/) in dm?

Thanks,
Ming

^ permalink raw reply

* Re: [PATCH 5/6] blk-mq: Add blk_mq_ops.show_rq()
From: Omar Sandoval @ 2017-04-13 23:21 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: Jens Axboe, linux-block, Omar Sandoval, Hannes Reinecke
In-Reply-To: <20170411205842.28137-6-bart.vanassche@sandisk.com>

On Tue, Apr 11, 2017 at 01:58:41PM -0700, Bart Van Assche wrote:
> This new callback function will be used in the next patch to show
> more information about SCSI requests.
> 
> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> Cc: Omar Sandoval <osandov@fb.com>
> Cc: Hannes Reinecke <hare@suse.com>
> ---
>  block/blk-mq-debugfs.c | 10 ++++++++--
>  include/linux/blk-mq.h |  6 ++++++
>  2 files changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 161f30fc236f..6b28d92d4c0e 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -316,7 +316,9 @@ static const char *const rqf_name[] = {
>  static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
>  {
>  	struct request *rq = list_entry_rq(v);
> +	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
>  	const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
> +	char drv_info[200];
>  
>  	seq_printf(m, "%p {.op=", rq);
>  	if (op < ARRAY_SIZE(op_name) && op_name[op])
> @@ -329,8 +331,12 @@ static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
>  	seq_puts(m, ", .rq_flags=");
>  	blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
>  		       ARRAY_SIZE(rqf_name));
> -	seq_printf(m, ", .tag=%d, .internal_tag=%d}\n", rq->tag,
> -		   rq->internal_tag);
> +	if (mq_ops->show_rq)
> +		mq_ops->show_rq(rq, drv_info, sizeof(drv_info));

How about passing the seq_file to the callback instead of this
arbitrarily-sized on-stack buffer?

^ permalink raw reply

* Re: [PATCH 4/6] blk-mq: Show operation, cmd_flags and rq_flags names
From: Omar Sandoval @ 2017-04-13 23:17 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: Jens Axboe, linux-block, Omar Sandoval, Hannes Reinecke
In-Reply-To: <20170411205842.28137-5-bart.vanassche@sandisk.com>

On Tue, Apr 11, 2017 at 01:58:40PM -0700, Bart Van Assche wrote:
> Show the operation name, .cmd_flags and .rq_flags as names instead
> of numbers.
> 
> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> Cc: Omar Sandoval <osandov@fb.com>
> Cc: Hannes Reinecke <hare@suse.com>

I like this :) one minor nit below.

Reviewed-by: Omar Sandoval <osandov@fb.com>

> ---
>  block/blk-mq-debugfs.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 69 insertions(+), 3 deletions(-)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index aae4b7c7b7b0..161f30fc236f 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
[snip]
>  static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
>  {
>  	struct request *rq = list_entry_rq(v);
> +	const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
>  
> -	seq_printf(m, "%p {.cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n",
> -		   rq, rq->cmd_flags, (__force unsigned int)rq->rq_flags,
> -		   rq->tag, rq->internal_tag);
> +	seq_printf(m, "%p {.op=", rq);
> +	if (op < ARRAY_SIZE(op_name) && op_name[op])
> +		seq_printf(m, "%s", op_name[op]);
> +	else
> +		seq_printf(m, "%d", op);
> +	seq_puts(m, ", .cmd_flags=");
> +	blk_flags_show(m, rq->cmd_flags ^ op, cmd_flag_name,
                          ^^^^^^^^^^^^^^^^^^
I think rq->cmd_flags & ~REQ_OP_MASK is slightly clearer here, but I
don't feel that strongly about it, it's up to you.

> +		       ARRAY_SIZE(cmd_flag_name));
> +	seq_puts(m, ", .rq_flags=");
> +	blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
> +		       ARRAY_SIZE(rqf_name));
> +	seq_printf(m, ", .tag=%d, .internal_tag=%d}\n", rq->tag,
> +		   rq->internal_tag);
>  	return 0;
>  }
>  
> -- 
> 2.12.0
> 

^ permalink raw reply

* RE: [PATCH] block-mq: set both block queue and hardware queue restart bit for restart
From: Long Li @ 2017-04-13 23:12 UTC (permalink / raw)
  To: Bart Van Assche, linux-kernel@vger.kernel.org,
	linux-block@vger.kernel.org, KY Srinivasan, axboe@kernel.dk
  Cc: Stephen Hemminger
In-Reply-To: <1491868058.4199.28.camel@sandisk.com>



> -----Original Message-----
> From: Bart Van Assche [mailto:Bart.VanAssche@sandisk.com]
> Sent: Monday, April 10, 2017 4:48 PM
> To: linux-kernel@vger.kernel.org; linux-block@vger.kernel.org; KY Sriniva=
san
> <kys@microsoft.com>; Long Li <longli@microsoft.com>; axboe@kernel.dk
> Cc: Stephen Hemminger <sthemmin@microsoft.com>
> Subject: Re: [PATCH] block-mq: set both block queue and hardware queue
> restart bit for restart
>=20
> On Thu, 2017-04-06 at 04:21 +0000, KY Srinivasan wrote:
> > > -----Original Message-----
> > > From: Bart Van Assche [mailto:Bart.VanAssche@sandisk.com]
> > > Sent: Wednesday, April 5, 2017 8:46 PM
> > > To: linux-kernel@vger.kernel.org; linux-block@vger.kernel.org; Long
> > > Li <longli@microsoft.com>; axboe@kernel.dk
> > > Cc: Stephen Hemminger <sthemmin@microsoft.com>; KY Srinivasan
> > > <kys@microsoft.com>
> > > Subject: Re: [PATCH] block-mq: set both block queue and hardware
> > > queue restart bit for restart
> > >
> > > On Thu, 2017-04-06 at 03:38 +0000, Long Li wrote:
> > > > > -----Original Message-----
> > > > > From: Bart Van Assche [mailto:Bart.VanAssche@sandisk.com]
> > > > >
> > > > > Please drop this patch. I'm working on a better solution.
> > > >
> > > > Thank you. Looking forward to your patch.
> > >
> > > Hello Long,
> > >
> > > It would help if you could share the name of the block or SCSI
> > > driver with which you ran into that lockup and also if you could
> > > share the name of the I/O scheduler used in your test.
> >
> > The tests that indicated the issue were run Hyper-V. The driver is
> > storvsc_drv.c The I/O scheduler was I think noop.
>=20
> Hello Long and K.Y.,
>=20
> Thank you for the feedback. Can you repeat your test with kernel v4.11-rc=
6?
> The patches that went into the block layer for v4.11-rc6 should be suffic=
ient
> to fix
> this:
>=20
> $ PAGER=3D git log --format=3Dshort v4.11-rc5..v4.11-rc6 block include/li=
nux/blk*
> commit 6d8c6c0f97ad8a3517c42b179c1dc8e77397d0e2
> Author: Bart Van Assche <bart.vanassche@sandisk.com>
>=20
> =A0=A0=A0=A0blk-mq: Restart a single queue if tag sets are shared
>=20
> commit 7587a5ae7eef0439f7be31f1b5959af062bbc5ec
> Author: Bart Van Assche <bart.vanassche@sandisk.com>
>=20
> =A0=A0=A0=A0blk-mq: Introduce blk_mq_delay_run_hw_queue()
>=20
> commit ebe8bddb6e30d7a02775b9972099271fc5910f37
> Author: Omar Sandoval <osandov@fb.com>
>=20
> =A0=A0=A0=A0blk-mq: remap queues when adding/removing hardware queues
>=20
> commit 54d5329d425650fafaf90660a139c771d2d49cae
> Author: Omar Sandoval <osandov@fb.com>
>=20
> =A0=A0=A0=A0blk-mq-sched: fix crash in switch error path
>=20
> commit 93252632e828da3e90241a1c0e766556abf71598
> Author: Omar Sandoval <osandov@fb.com>
>=20
> =A0=A0=A0=A0blk-mq-sched: set up scheduler tags when bringing up new queu=
es
>=20
> commit 6917ff0b5bd4139e08a3f3146529dcb3b95ba7a6
> Author: Omar Sandoval <osandov@fb.com>
>=20
> =A0=A0=A0=A0blk-mq-sched: refactor scheduler initialization
>=20
> commit 81380ca10778b99dce98940cfc993214712df335
> Author: Omar Sandoval <osandov@fb.com>
>=20
> =A0=A0=A0=A0blk-mq: use the right hctx when getting a driver tag fails
>=20
> commit ac77a0c463c1d7d659861f7b6d1261970dd3282a
> Author: Minchan Kim <minchan@kernel.org>
>=20
> =A0=A0=A0=A0block: do not put mq context in blk_mq_alloc_request_hctx
>=20
> Bart.

Thank you. We are doing tests. I will update on the results.

Long

^ permalink raw reply

* Re: [PATCH 3/6] blk-mq: Make blk_flags_show() callers append a newline character
From: Omar Sandoval @ 2017-04-13 23:08 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: Jens Axboe, linux-block, Omar Sandoval, Hannes Reinecke
In-Reply-To: <20170411205842.28137-4-bart.vanassche@sandisk.com>

On Tue, Apr 11, 2017 at 01:58:39PM -0700, Bart Van Assche wrote:
> This patch does not change any functionality but makes it possible
> to produce a single line of output with multiple flag-to-name
> translations.
> 
> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> Cc: Omar Sandoval <osandov@fb.com>
> Cc: Hannes Reinecke <hare@suse.com>

Reviewed-by: Omar Sandoval <osandov@fb.com>

^ permalink raw reply

* Re: [PATCH 1/6] blk-mq: Do not invoke queue operations on a dead queue
From: Bart Van Assche @ 2017-04-13 23:05 UTC (permalink / raw)
  To: osandov@osandov.com
  Cc: hare@suse.com, linux-block@vger.kernel.org, osandov@fb.com,
	axboe@kernel.dk
In-Reply-To: <20170413230102.GA1550@vader.DHCP.thefacebook.com>

On Thu, 2017-04-13 at 16:01 -0700, Omar Sandoval wrote:
> On Tue, Apr 11, 2017 at 01:58:37PM -0700, Bart Van Assche wrote:
> > The blk-mq debugfs attributes are removed after blk_cleanup_queue()
> > has finished. Since running a queue after a queue has entered the
> > "dead" state is not allowed, disallow this. This patch avoids that
> > an attempt to run a dead queue triggers a kernel crash.
> >=20
> > Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> > Cc: Omar Sandoval <osandov@fb.com>
> > Cc: Hannes Reinecke <hare@suse.com>
> > ---
> >  block/blk-mq-debugfs.c | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> >=20
> > diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> > index df9b688b877c..a1ce823578c7 100644
> > --- a/block/blk-mq-debugfs.c
> > +++ b/block/blk-mq-debugfs.c
> > @@ -111,6 +111,14 @@ static ssize_t blk_queue_flags_store(struct file *=
file, const char __user *ubuf,
> >  	struct request_queue *q =3D file_inode(file)->i_private;
> >  	char op[16] =3D { }, *s;
> > =20
> > +	/*
> > +	 * The debugfs attributes are removed after blk_cleanup_queue() has
> > +	 * called blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set
> > +	 * to avoid triggering a use-after-free.
> > +	 */
> > +	if (blk_queue_dead(q))
> > +		return -ENOENT;
> > +
> >  	len =3D min(len, sizeof(op) - 1);
> >  	if (copy_from_user(op, ubuf, len))
> >  		return -EFAULT;
>=20
> Looking at this, I think we have similar issues with most of the other
> debugfs files. Should we move the debugfs cleanup earlier?

Hello Omar,

That's a good question. However, while I was debugging it was very convenie=
nt
to be able to access the queue state after it had reached the "dead" state.
Performing the cleanup earlier would be an alternative solution but would
make debugging a bit harder ...

Bart.=

^ permalink raw reply

* Re: [PATCH 1/6] blk-mq: Do not invoke queue operations on a dead queue
From: Omar Sandoval @ 2017-04-13 23:03 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: Jens Axboe, linux-block, Omar Sandoval, Hannes Reinecke
In-Reply-To: <20170413230102.GA1550@vader.DHCP.thefacebook.com>

On Thu, Apr 13, 2017 at 04:01:02PM -0700, Omar Sandoval wrote:
> On Tue, Apr 11, 2017 at 01:58:37PM -0700, Bart Van Assche wrote:
> > The blk-mq debugfs attributes are removed after blk_cleanup_queue()
> > has finished. Since running a queue after a queue has entered the
> > "dead" state is not allowed, disallow this. This patch avoids that
> > an attempt to run a dead queue triggers a kernel crash.
> > 
> > Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> > Cc: Omar Sandoval <osandov@fb.com>
> > Cc: Hannes Reinecke <hare@suse.com>
> > ---
> >  block/blk-mq-debugfs.c | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> > 
> > diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> > index df9b688b877c..a1ce823578c7 100644
> > --- a/block/blk-mq-debugfs.c
> > +++ b/block/blk-mq-debugfs.c
> > @@ -111,6 +111,14 @@ static ssize_t blk_queue_flags_store(struct file *file, const char __user *ubuf,
> >  	struct request_queue *q = file_inode(file)->i_private;
> >  	char op[16] = { }, *s;
> >  
> > +	/*
> > +	 * The debugfs attributes are removed after blk_cleanup_queue() has
> > +	 * called blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set
> > +	 * to avoid triggering a use-after-free.
> > +	 */
> > +	if (blk_queue_dead(q))
> > +		return -ENOENT;
> > +
> >  	len = min(len, sizeof(op) - 1);
> >  	if (copy_from_user(op, ubuf, len))
> >  		return -EFAULT;
> > -- 
> > 2.12.0
> > 
> 
> Hi, Bart,
> 
> Looking at this, I think we have similar issues with most of the other
> debugfs files. Should we move the debugfs cleanup earlier?

In particular, I think we can call blk_mq_debugfs_unregister_hctxs()
(which is somewhat poorly named, as it removes the whole mq directory)
before we call blk_mq_free_queue(). I was under the impression that
that's what it already did, but I think I was wrong.

^ permalink raw reply

* Re: [PATCH 2/6] blk-mq: Move the "state" debugfs attribute one level down
From: Omar Sandoval @ 2017-04-13 23:01 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: Jens Axboe, linux-block, Omar Sandoval, Hannes Reinecke
In-Reply-To: <20170411205842.28137-3-bart.vanassche@sandisk.com>

On Tue, Apr 11, 2017 at 01:58:38PM -0700, Bart Van Assche wrote:
> Move the "state" attribute from the top level to the "mq" directory
> as requested by Omar.
> 
> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> Cc: Omar Sandoval <osandov@fb.com>
> Cc: Hannes Reinecke <hare@suse.com>

Thanks!

Reviewed-by: Omar Sandoval <osandov@fb.com>

> ---
>  block/blk-mq-debugfs.c | 9 +--------
>  1 file changed, 1 insertion(+), 8 deletions(-)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index a1ce823578c7..564470d4af52 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -149,11 +149,6 @@ static const struct file_operations blk_queue_flags_fops = {
>  	.write		= blk_queue_flags_store,
>  };
>  
> -static const struct blk_mq_debugfs_attr blk_queue_attrs[] = {
> -	{"state", 0600, &blk_queue_flags_fops},
> -	{},
> -};
> -
>  static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
>  {
>  	if (stat->nr_samples) {
> @@ -762,6 +757,7 @@ static const struct file_operations ctx_completed_fops = {
>  
>  static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
>  	{"poll_stat", 0400, &queue_poll_stat_fops},
> +	{"state", 0600, &blk_queue_flags_fops},
>  	{},
>  };
>  
> @@ -877,9 +873,6 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q)
>  	if (!q->debugfs_dir)
>  		return -ENOENT;
>  
> -	if (!debugfs_create_files(q->debugfs_dir, q, blk_queue_attrs))
> -		goto err;
> -
>  	q->mq_debugfs_dir = debugfs_create_dir("mq", q->debugfs_dir);
>  	if (!q->mq_debugfs_dir)
>  		goto err;
> -- 
> 2.12.0
> 

^ permalink raw reply

* Re: [PATCH 1/6] blk-mq: Do not invoke queue operations on a dead queue
From: Omar Sandoval @ 2017-04-13 23:01 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: Jens Axboe, linux-block, Omar Sandoval, Hannes Reinecke
In-Reply-To: <20170411205842.28137-2-bart.vanassche@sandisk.com>

On Tue, Apr 11, 2017 at 01:58:37PM -0700, Bart Van Assche wrote:
> The blk-mq debugfs attributes are removed after blk_cleanup_queue()
> has finished. Since running a queue after a queue has entered the
> "dead" state is not allowed, disallow this. This patch avoids that
> an attempt to run a dead queue triggers a kernel crash.
> 
> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> Cc: Omar Sandoval <osandov@fb.com>
> Cc: Hannes Reinecke <hare@suse.com>
> ---
>  block/blk-mq-debugfs.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index df9b688b877c..a1ce823578c7 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -111,6 +111,14 @@ static ssize_t blk_queue_flags_store(struct file *file, const char __user *ubuf,
>  	struct request_queue *q = file_inode(file)->i_private;
>  	char op[16] = { }, *s;
>  
> +	/*
> +	 * The debugfs attributes are removed after blk_cleanup_queue() has
> +	 * called blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set
> +	 * to avoid triggering a use-after-free.
> +	 */
> +	if (blk_queue_dead(q))
> +		return -ENOENT;
> +
>  	len = min(len, sizeof(op) - 1);
>  	if (copy_from_user(op, ubuf, len))
>  		return -EFAULT;
> -- 
> 2.12.0
> 

Hi, Bart,

Looking at this, I think we have similar issues with most of the other
debugfs files. Should we move the debugfs cleanup earlier?

^ permalink raw reply

* Re: [PATCH 08/25] scsi: fix fast-fail for non-passthrough requests
From: Bart Van Assche @ 2017-04-13 20:41 UTC (permalink / raw)
  To: hch@lst.de, axboe@kernel.dk
  Cc: linux-block@vger.kernel.org, konrad.wilk@oracle.com,
	roger.pau@citrix.com, linux-scsi@vger.kernel.org,
	linux-nvme@lists.infradead.org, jbacik@fb.com,
	james.smart@broadcom.com, dm-devel@redhat.com
In-Reply-To: <20170406153944.10058-9-hch@lst.de>

On Thu, 2017-04-06 at 17:39 +0200, Christoph Hellwig wrote:
> Currently error is always 0 for non-passthrough requests when reaching th=
e
> scsi_noretry_cmd check in scsi_io_completion, which effectively disables
> all fastfail logic.  Fix this by having a single call to
> __scsi_error_from_host_byte at the beginning of the function and always
> having a valid error value.
>=20
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/scsi/scsi_lib.c | 28 +++++++---------------------
>  1 file changed, 7 insertions(+), 21 deletions(-)
>=20
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 11972d1075f1..89b4d9e69866 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -779,21 +779,17 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsi=
gned int good_bytes)
>  		sense_valid =3D scsi_command_normalize_sense(cmd, &sshdr);
>  		if (sense_valid)
>  			sense_deferred =3D scsi_sense_is_deferred(&sshdr);
> +
> +		if (!sense_deferred)
> +			error =3D __scsi_error_from_host_byte(cmd, result);
>  	}

Hello Christoph,

Sorry but this doesn't look correct to me. Further down a "error =3D
__scsi_error_from_host_byte(cmd, result)" statement is removed that does no=
t
depend on "if (!sense_deferred)" so I think that assignment should happen
independent of the value of "sense_deferred". Additionally, how can it make
sense to call __scsi_error_from_host_byte() only if sense_deferred =3D=3D f=
alse?
As you know the SCSI command result is generated by the LLD so I don't thin=
k
that it depends on whether or not the sense data has been deferred.

Bart.=

^ permalink raw reply

* Re: [PATCH 06/25] virtio: fix spelling of virtblk_scsi_request_done
From: Bart Van Assche @ 2017-04-13 20:05 UTC (permalink / raw)
  To: hch@lst.de, axboe@kernel.dk
  Cc: linux-block@vger.kernel.org, konrad.wilk@oracle.com,
	roger.pau@citrix.com, linux-scsi@vger.kernel.org,
	linux-nvme@lists.infradead.org, jbacik@fb.com,
	james.smart@broadcom.com, dm-devel@redhat.com
In-Reply-To: <20170406153944.10058-7-hch@lst.de>

On Thu, 2017-04-06 at 17:39 +0200, Christoph Hellwig wrote:
> [ ... ]

Reviewed-by: Bart Van Assche <Bart.VanAssche@sandisk.com>=

^ permalink raw reply

* Re: [PATCH 02/25] block: remove the blk_execute_rq return value
From: Bart Van Assche @ 2017-04-13 20:03 UTC (permalink / raw)
  To: hch@lst.de, axboe@kernel.dk
  Cc: linux-block@vger.kernel.org, konrad.wilk@oracle.com,
	roger.pau@citrix.com, linux-scsi@vger.kernel.org,
	linux-nvme@lists.infradead.org, jbacik@fb.com,
	james.smart@broadcom.com, dm-devel@redhat.com
In-Reply-To: <20170406153944.10058-3-hch@lst.de>

On Thu, 2017-04-06 at 17:39 +0200, Christoph Hellwig wrote:
> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
> index 92b4b41d19d2..4b72fdf67548 100644
> --- a/fs/nfsd/blocklayout.c
> +++ b/fs/nfsd/blocklayout.c
> @@ -242,8 +242,8 @@ static int nfsd4_scsi_identify_device(struct block_de=
vice *bdev,
>  	req->cmd[4] =3D bufflen & 0xff;
>  	req->cmd_len =3D COMMAND_SIZE(INQUIRY);
> =20
> -	error =3D blk_execute_rq(rq->q, NULL, rq, 1);
> -	if (error) {
> +	blk_execute_rq(rq->q, NULL, rq, 1);
> +	if (rq->errors) {
>  		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
>  			rq->errors);
>  		goto out_put_request;

Hello Christoph,

That blk_execute_rq() call can only be reached if a few lines above 0 was
assigned to the "error" variable. Since nfsd4_scsi_identify_device() return=
s
the value of the "error" variable I think -EIO should be assigned to that
variable before the "goto out_put_request" statement is reached.

Bart.=

^ permalink raw reply

* Re: [PATCH 01/25] remove the mg_disk driver
From: Bart Van Assche @ 2017-04-13 19:58 UTC (permalink / raw)
  To: hch@lst.de, axboe@kernel.dk
  Cc: linux-block@vger.kernel.org, konrad.wilk@oracle.com,
	roger.pau@citrix.com, linux-scsi@vger.kernel.org,
	linux-nvme@lists.infradead.org, jbacik@fb.com,
	james.smart@broadcom.com, dm-devel@redhat.com
In-Reply-To: <20170406153944.10058-2-hch@lst.de>

On Thu, 2017-04-06 at 17:39 +0200, Christoph Hellwig wrote:
> This drivers was added in 2008, but as far as a I can tell we never had a
> single platform that actually registered resources for the platform drive=
r.
>=20
> It's also been unmaintained for a long time and apparently has a ATA mode
> that can be driven using the IDE/libata subsystem.

Hello Christoph,

Should the person who submitted this driver be CC-ed for this patch (unsik
Kim <donari75@gmail.com>)?

Bart.=

^ permalink raw reply

* Re: [PATCH] lightnvm: fix some WARN() messages
From: Matias Bjørling @ 2017-04-13 19:44 UTC (permalink / raw)
  To: Dan Carpenter, Javier González; +Cc: linux-block, kernel-janitors
In-Reply-To: <20170413193636.GC591@mwanda>



On 04/13/2017 09:36 PM, Dan Carpenter wrote:
> WARN_ON() takes a condition, not an error message.  I slightly tweaked
> some conditions so hopefully it's more clear.
>
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
>
> diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
> index eff0982c076f..bce7ed5fc73f 100644
> --- a/drivers/lightnvm/pblk-read.c
> +++ b/drivers/lightnvm/pblk-read.c
> @@ -49,8 +49,8 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
>  	int i, j = 0;
>
>  	/* logic error: lba out-of-bounds. Ignore read request */
> -	if (!(blba + nr_secs < pblk->rl.nr_secs)) {
> -		WARN_ON("pblk: read lbas out of bounds\n");
> +	if (blba + nr_secs >= pblk->rl.nr_secs) {
> +		WARN(1, "pblk: read lbas out of bounds\n");
>  		return;
>  	}
>
> @@ -254,8 +254,8 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
>  	sector_t lba = pblk_get_lba(bio);
>
>  	/* logic error: lba out-of-bounds. Ignore read request */
> -	if (!(lba < pblk->rl.nr_secs)) {
> -		WARN_ON("pblk: read lba out of bounds\n");
> +	if (lba >= pblk->rl.nr_secs) {
> +		WARN(1, "pblk: read lba out of bounds\n");
>  		return;
>  	}
>
> @@ -411,8 +411,8 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
>  	int valid_secs = 0;
>
>  	/* logic error: lba out-of-bounds */
> -	if (!(lba < pblk->rl.nr_secs)) {
> -		WARN_ON("pblk: read lba out of bounds\n");
> +	if (lba >= pblk->rl.nr_secs) {
> +		WARN(1, "pblk: read lba out of bounds\n");
>  		goto out;
>  	}
>
> diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
> index 02c415b957d4..56547ca87926 100644
> --- a/drivers/lightnvm/pblk-write.c
> +++ b/drivers/lightnvm/pblk-write.c
> @@ -141,7 +141,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
>
>  		/* Logic error */
>  		if (bit > c_ctx->nr_valid) {
> -			WARN_ON_ONCE("pblk: corrupted write request\n");
> +			WARN_ONCE(1, "pblk: corrupted write request\n");
>  			goto out;
>  		}
>
> diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
> index 0d50f415cfde..f8f85087cd3c 100644
> --- a/drivers/lightnvm/pblk-recovery.c
> +++ b/drivers/lightnvm/pblk-recovery.c
> @@ -167,7 +167,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
>  		if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) {
>  			spin_lock(&line->lock);
>  			if (test_and_set_bit(i, line->invalid_bitmap))
> -				WARN_ON_ONCE("pblk: rec. double invalidate:\n");
> +				WARN_ONCE(1, "pblk: rec. double invalidate:\n");
>  			else
>  				line->vsc--;
>  			spin_unlock(&line->lock);
>

Hehe, yep. That was slightly misused. Thanks for pointing it out. I've 
applied it for 4.12.

^ permalink raw reply

* Re: [PATCH] lightnvm: fix some error code in pblk-init.c
From: Matias Bjørling @ 2017-04-13 19:43 UTC (permalink / raw)
  To: Dan Carpenter, Javier González; +Cc: linux-block, kernel-janitors
In-Reply-To: <20170413193512.GB591@mwanda>



On 04/13/2017 09:35 PM, Dan Carpenter wrote:
> There were a bunch of places in pblk_lines_init() where we didn't set an
> error code.  And in pblk_writer_init() we accidentally return 1 instead
> of a correct error code, which would result in a Oops later.
>
> Fixes: 11a5d6fdf919 ("lightnvm: physical block device (pblk) target")
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
>
> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
> index 94653b1f1300..3996e4b8fb0e 100644
> --- a/drivers/lightnvm/pblk-init.c
> +++ b/drivers/lightnvm/pblk-init.c
> @@ -543,7 +543,7 @@ static int pblk_lines_init(struct pblk *pblk)
>  	long nr_bad_blks, nr_meta_blks, nr_free_blks;
>  	int bb_distance;
>  	int i;
> -	int ret = 0;
> +	int ret;
>
>  	lm->sec_per_line = geo->sec_per_blk * geo->nr_luns;
>  	lm->blk_per_line = geo->nr_luns;
> @@ -638,12 +638,16 @@ static int pblk_lines_init(struct pblk *pblk)
>  	}
>
>  	l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
> -	if (!l_mg->bb_template)
> +	if (!l_mg->bb_template) {
> +		ret = -ENOMEM;
>  		goto fail_free_meta;
> +	}
>
>  	l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
> -	if (!l_mg->bb_aux)
> +	if (!l_mg->bb_aux) {
> +		ret = -ENOMEM;
>  		goto fail_free_bb_template;
> +	}
>
>  	bb_distance = (geo->nr_luns) * geo->sec_per_pl;
>  	for (i = 0; i < lm->sec_per_line; i += bb_distance)
> @@ -667,8 +671,10 @@ static int pblk_lines_init(struct pblk *pblk)
>
>  	pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
>  								GFP_KERNEL);
> -	if (!pblk->lines)
> +	if (!pblk->lines) {
> +		ret = -ENOMEM;
>  		goto fail_free_bb_aux;
> +	}
>
>  	nr_free_blks = 0;
>  	for (i = 0; i < l_mg->nr_lines; i++) {
> @@ -682,8 +688,10 @@ static int pblk_lines_init(struct pblk *pblk)
>  		spin_lock_init(&line->lock);
>
>  		nr_bad_blks = pblk_bb_line(pblk, line);
> -		if (nr_bad_blks < 0 || nr_bad_blks > lm->blk_per_line)
> +		if (nr_bad_blks < 0 || nr_bad_blks > lm->blk_per_line) {
> +			ret = -EINVAL;
>  			goto fail_free_lines;
> +		}
>
>  		line->blk_in_line = lm->blk_per_line - nr_bad_blks;
>  		if (line->blk_in_line < lm->min_blk_line) {
> @@ -733,7 +741,7 @@ static int pblk_writer_init(struct pblk *pblk)
>  	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
>  	if (IS_ERR(pblk->writer_ts)) {
>  		pr_err("pblk: could not allocate writer kthread\n");
> -		return 1;
> +		return PTR_ERR(pblk->writer_ts);
>  	}
>
>  	return 0;
>

Thanks Dan. Applied for 4.12.

^ permalink raw reply

* [PATCH block-tree] net: off by one in inet6_pton()
From: Dan Carpenter @ 2017-04-13 19:42 UTC (permalink / raw)
  To: David S. Miller, Jens Axboe, Sagi Grimberg
  Cc: Wei Tang, Alexey Dobriyan, netdev, linux-block, kernel-janitors

If "scope_len" is sizeof(scope_id) then we would put the NUL terminator
one space beyond the end of the buffer.

Fixes: b1a951fe469e ("net/utils: generic inet_pton_with_scope helper")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
---
This one goes through Jens' tree not through net-dev.

diff --git a/net/core/utils.c b/net/core/utils.c
index da1089ea5389..93066bd0305a 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -339,7 +339,7 @@ static int inet6_pton(struct net *net, const char *src, u16 port_num,
 	    src + srclen != scope_delim && *scope_delim == '%') {
 		struct net_device *dev;
 		char scope_id[16];
-		size_t scope_len = min_t(size_t, sizeof(scope_id),
+		size_t scope_len = min_t(size_t, sizeof(scope_id) - 1,
 					 src + srclen - scope_delim - 1);

 		memcpy(scope_id, scope_delim + 1, scope_len);

^ permalink raw reply related

* [PATCH] lightnvm: fix some WARN() messages
From: Dan Carpenter @ 2017-04-13 19:36 UTC (permalink / raw)
  To: Matias Bjorling, Javier González; +Cc: linux-block, kernel-janitors

WARN_ON() takes a condition, not an error message.  I slightly tweaked
some conditions so hopefully it's more clear.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index eff0982c076f..bce7ed5fc73f 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -49,8 +49,8 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
 	int i, j = 0;
 
 	/* logic error: lba out-of-bounds. Ignore read request */
-	if (!(blba + nr_secs < pblk->rl.nr_secs)) {
-		WARN_ON("pblk: read lbas out of bounds\n");
+	if (blba + nr_secs >= pblk->rl.nr_secs) {
+		WARN(1, "pblk: read lbas out of bounds\n");
 		return;
 	}
 
@@ -254,8 +254,8 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
 	sector_t lba = pblk_get_lba(bio);
 
 	/* logic error: lba out-of-bounds. Ignore read request */
-	if (!(lba < pblk->rl.nr_secs)) {
-		WARN_ON("pblk: read lba out of bounds\n");
+	if (lba >= pblk->rl.nr_secs) {
+		WARN(1, "pblk: read lba out of bounds\n");
 		return;
 	}
 
@@ -411,8 +411,8 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
 	int valid_secs = 0;
 
 	/* logic error: lba out-of-bounds */
-	if (!(lba < pblk->rl.nr_secs)) {
-		WARN_ON("pblk: read lba out of bounds\n");
+	if (lba >= pblk->rl.nr_secs) {
+		WARN(1, "pblk: read lba out of bounds\n");
 		goto out;
 	}
 
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 02c415b957d4..56547ca87926 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -141,7 +141,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
 
 		/* Logic error */
 		if (bit > c_ctx->nr_valid) {
-			WARN_ON_ONCE("pblk: corrupted write request\n");
+			WARN_ONCE(1, "pblk: corrupted write request\n");
 			goto out;
 		}
 
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 0d50f415cfde..f8f85087cd3c 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -167,7 +167,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
 		if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) {
 			spin_lock(&line->lock);
 			if (test_and_set_bit(i, line->invalid_bitmap))
-				WARN_ON_ONCE("pblk: rec. double invalidate:\n");
+				WARN_ONCE(1, "pblk: rec. double invalidate:\n");
 			else
 				line->vsc--;
 			spin_unlock(&line->lock);

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox