Linux block layer

Linux block layer
 help / color / mirror / Atom feed

* [PATCH 07/25] 9p: Convert to separately allocated bdi
From: Jan Kara @ 2017-03-29 10:56 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-block, Christoph Hellwig, Jan Kara, Eric Van Hensbergen,
	Ron Minnich, Latchesar Ionkov, v9fs-developer
In-Reply-To: <20170329105623.18241-1-jack@suse.cz>

Allocate struct backing_dev_info separately instead of embedding it
inside session. This unifies handling of bdi among users.

CC: Eric Van Hensbergen <ericvh@gmail.com>
CC: Ron Minnich <rminnich@sandia.gov>
CC: Latchesar Ionkov <lucho@ionkov.net>
CC: v9fs-developer@lists.sourceforge.net
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/9p/v9fs.c      | 10 +---------
 fs/9p/v9fs.h      |  1 -
 fs/9p/vfs_super.c | 15 ++++++++++++---
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index a89f3cfe3c7d..c202930086ed 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -333,10 +333,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		goto err_names;
 	init_rwsem(&v9ses->rename_sem);
 
-	rc = bdi_setup_and_register(&v9ses->bdi, "9p");
-	if (rc)
-		goto err_names;
-
 	v9ses->uid = INVALID_UID;
 	v9ses->dfltuid = V9FS_DEFUID;
 	v9ses->dfltgid = V9FS_DEFGID;
@@ -345,7 +341,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	if (IS_ERR(v9ses->clnt)) {
 		rc = PTR_ERR(v9ses->clnt);
 		p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
-		goto err_bdi;
+		goto err_names;
 	}
 
 	v9ses->flags = V9FS_ACCESS_USER;
@@ -415,8 +411,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 
 err_clnt:
 	p9_client_destroy(v9ses->clnt);
-err_bdi:
-	bdi_destroy(&v9ses->bdi);
 err_names:
 	kfree(v9ses->uname);
 	kfree(v9ses->aname);
@@ -445,8 +439,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 	kfree(v9ses->uname);
 	kfree(v9ses->aname);
 
-	bdi_destroy(&v9ses->bdi);
-
 	spin_lock(&v9fs_sessionlist_lock);
 	list_del(&v9ses->slist);
 	spin_unlock(&v9fs_sessionlist_lock);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 443d12e02043..76eaf49abd3a 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -114,7 +114,6 @@ struct v9fs_session_info {
 	kuid_t uid;		/* if ACCESS_SINGLE, the uid that has access */
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist; /* list of sessions registered with v9fs */
-	struct backing_dev_info bdi;
 	struct rw_semaphore rename_sem;
 };
 
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index de3ed8629196..a0965fb587a5 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -72,10 +72,12 @@ static int v9fs_set_super(struct super_block *s, void *data)
  *
  */
 
-static void
+static int
 v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 		int flags, void *data)
 {
+	int ret;
+
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
 	sb->s_blocksize = 1 << sb->s_blocksize_bits;
@@ -85,7 +87,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 		sb->s_xattr = v9fs_xattr_handlers;
 	} else
 		sb->s_op = &v9fs_super_ops;
-	sb->s_bdi = &v9ses->bdi;
+
+	ret = super_setup_bdi(sb);
+	if (ret)
+		return ret;
+
 	if (v9ses->cache)
 		sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
 
@@ -99,6 +105,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 #endif
 
 	save_mount_options(sb, data);
+	return 0;
 }
 
 /**
@@ -138,7 +145,9 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 		retval = PTR_ERR(sb);
 		goto clunk_fid;
 	}
-	v9fs_fill_super(sb, v9ses, flags, data);
+	retval = v9fs_fill_super(sb, v9ses, flags, data);
+	if (retval)
+		goto release_sb;
 
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		sb->s_d_op = &v9fs_cached_dentry_operations;
-- 
2.10.2

^ permalink raw reply related

* [PATCH 06/25] lustre: Convert to separately allocated bdi
From: Jan Kara @ 2017-03-29 10:56 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-block, Christoph Hellwig, Jan Kara, Oleg Drokin,
	Andreas Dilger, James Simmons, lustre-devel
In-Reply-To: <20170329105623.18241-1-jack@suse.cz>

Allocate struct backing_dev_info separately instead of embedding it
inside superblock. This unifies handling of bdi among users.

CC: Oleg Drokin <oleg.drokin@intel.com>
CC: Andreas Dilger <andreas.dilger@intel.com>
CC: James Simmons <jsimmons@infradead.org>
CC: lustre-devel@lists.lustre.org
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 .../staging/lustre/lustre/include/lustre_disk.h    |  4 ----
 drivers/staging/lustre/lustre/llite/llite_lib.c    | 24 +++-------------------
 2 files changed, 3 insertions(+), 25 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre_disk.h b/drivers/staging/lustre/lustre/include/lustre_disk.h
index 8886458748c1..a676bccabd43 100644
--- a/drivers/staging/lustre/lustre/include/lustre_disk.h
+++ b/drivers/staging/lustre/lustre/include/lustre_disk.h
@@ -133,13 +133,9 @@ struct lustre_sb_info {
 	struct obd_export	 *lsi_osd_exp;
 	char			  lsi_osd_type[16];
 	char			  lsi_fstype[16];
-	struct backing_dev_info   lsi_bdi;     /* each client mountpoint needs
-						* own backing_dev_info
-						*/
 };
 
 #define LSI_UMOUNT_FAILOVER	      0x00200000
-#define LSI_BDI_INITIALIZED	      0x00400000
 
 #define     s2lsi(sb)	((struct lustre_sb_info *)((sb)->s_fs_info))
 #define     s2lsi_nocast(sb) ((sb)->s_fs_info)
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
index b229cbc7bb33..d483c44aafe5 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -863,15 +863,6 @@ void ll_lli_init(struct ll_inode_info *lli)
 	mutex_init(&lli->lli_layout_mutex);
 }
 
-static inline int ll_bdi_register(struct backing_dev_info *bdi)
-{
-	static atomic_t ll_bdi_num = ATOMIC_INIT(0);
-
-	bdi->name = "lustre";
-	return bdi_register(bdi, NULL, "lustre-%d",
-			    atomic_inc_return(&ll_bdi_num));
-}
-
 int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
 {
 	struct lustre_profile *lprof = NULL;
@@ -881,6 +872,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
 	char  *profilenm = get_profile_name(sb);
 	struct config_llog_instance *cfg;
 	int    err;
+	static atomic_t ll_bdi_num = ATOMIC_INIT(0);
 
 	CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
 
@@ -903,16 +895,11 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
 	if (err)
 		goto out_free;
 
-	err = bdi_init(&lsi->lsi_bdi);
-	if (err)
-		goto out_free;
-	lsi->lsi_flags |= LSI_BDI_INITIALIZED;
-	lsi->lsi_bdi.capabilities = 0;
-	err = ll_bdi_register(&lsi->lsi_bdi);
+	err = super_setup_bdi_name(sb, "lustre-%d",
+				   atomic_inc_return(&ll_bdi_num));
 	if (err)
 		goto out_free;
 
-	sb->s_bdi = &lsi->lsi_bdi;
 	/* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
 	sb->s_d_op = &ll_d_ops;
 
@@ -1033,11 +1020,6 @@ void ll_put_super(struct super_block *sb)
 	if (profilenm)
 		class_del_profile(profilenm);
 
-	if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
-		bdi_destroy(&lsi->lsi_bdi);
-		lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
-	}
-
 	ll_free_sbi(sb);
 	lsi->lsi_llsbi = NULL;
 
-- 
2.10.2

^ permalink raw reply related

* [PATCH 05/25] fs: Get proper reference for s_bdi
From: Jan Kara @ 2017-03-29 10:56 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-block, Christoph Hellwig, Jan Kara
In-Reply-To: <20170329105623.18241-1-jack@suse.cz>

So far we just relied on block device to hold a bdi reference for us
while the filesystem is mounted. While that works perfectly fine, it is
a bit awkward that we have a pointer to a refcounted structure in the
superblock without proper reference. So make s_bdi hold a proper
reference to block device's BDI. No filesystem using mount_bdev()
actually changes s_bdi so this is safe and will make bdev filesystems
work the same way as filesystems needing to set up their private bdi.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/super.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 0f51a437c269..e267d3a00144 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1054,12 +1054,9 @@ static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_bdev = data;
 	s->s_dev = s->s_bdev->bd_dev;
+	s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
+	s->s_iflags |= SB_I_DYNBDI;

-	/*
-	 * We set the bdi here to the queue backing, file systems can
-	 * overwrite this in ->fill_super()
-	 */
-	s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
 	return 0;
 }

-- 
2.10.2

^ permalink raw reply related

* [PATCH 01/25] bdi: Provide bdi_register_va() and bdi_alloc()
From: Jan Kara @ 2017-03-29 10:55 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-block, Christoph Hellwig, Jan Kara
In-Reply-To: <20170329105623.18241-1-jack@suse.cz>

Add function that registers bdi and takes va_list instead of variable
number of arguments.

Add bdi_alloc() as simple wrapper for NUMA-unaware users allocating BDI.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/backing-dev.h |  6 ++++++
 mm/backing-dev.c            | 20 +++++++++++++++-----
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c52a48cb9a66..47a98e6e2a65 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -30,6 +30,8 @@ void bdi_put(struct backing_dev_info *bdi);
 __printf(3, 4)
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
+int bdi_register_va(struct backing_dev_info *bdi, struct device *parent,
+		const char *fmt, va_list args);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner);
 void bdi_unregister(struct backing_dev_info *bdi);
@@ -37,6 +39,10 @@ void bdi_unregister(struct backing_dev_info *bdi);
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
 void bdi_destroy(struct backing_dev_info *bdi);
 struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id);
+static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask)
+{
+	return bdi_alloc_node(gfp_mask, NUMA_NO_NODE);
+}
 
 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
 			bool range_cyclic, enum wb_reason reason);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3ea3bbd921d6..e5e0972bdd6f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -856,18 +856,15 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
 	return bdi;
 }
 
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
-		const char *fmt, ...)
+int bdi_register_va(struct backing_dev_info *bdi, struct device *parent,
+		const char *fmt, va_list args)
 {
-	va_list args;
 	struct device *dev;
 
 	if (bdi->dev)	/* The driver needs to use separate queues per device */
 		return 0;
 
-	va_start(args, fmt);
 	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
-	va_end(args);
 	if (IS_ERR(dev))
 		return PTR_ERR(dev);
 
@@ -884,6 +881,19 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 	trace_writeback_bdi_register(bdi);
 	return 0;
 }
+EXPORT_SYMBOL(bdi_register_va);
+
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+		const char *fmt, ...)
+{
+	va_list args;
+	int ret;
+
+	va_start(args, fmt);
+	ret = bdi_register_va(bdi, parent, fmt, args);
+	va_end(args);
+	return ret;
+}
 EXPORT_SYMBOL(bdi_register);
 
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
-- 
2.10.2

^ permalink raw reply related

* [PATCH 0/25 v2] fs: Convert all embedded bdis into separate ones
From: Jan Kara @ 2017-03-29 10:55 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-block, Christoph Hellwig, Jan Kara, linux-mtd, linux-nfs,
	Petr Vandrovec, linux-nilfs, cluster-devel, osd-dev, codalist,
	linux-afs, ecryptfs, linux-cifs, ceph-devel, linux-btrfs,
	v9fs-developer, lustre-devel

Hello,

this is the second revision of the patch series which converts all embedded
occurences of struct backing_dev_info to use standalone dynamically allocated
structures. This makes bdi handling unified across all bdi users and generally
removes some boilerplate code from filesystems setting up their own bdi. It
also allows us to remove some code from generic bdi implementation.

The patches were only compile-tested for most filesystems (I've tested
mounting only for NFS & btrfs) so fs maintainers please have a look whether
the changes look sound to you.

This series is based on top of bdi fixes that were merged into linux-block
git tree into for-next branch. I have pushed out the result as a branch to

git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git bdi

Changes since v1:
* Added some acks
* Added further FUSE cleanup patch
* Added removal of unused argument to bdi_register()
* Fixed up some compilation failures spotted by 0-day testing

								Honza

^ permalink raw reply

* Outstanding MQ questions from MMC
From: Linus Walleij @ 2017-03-29  3:09 UTC (permalink / raw)
  To: linux-mmc@vger.kernel.org, linux-block, Jens Axboe,
	Christoph Hellwig
  Cc: Ulf Hansson, Adrian Hunter, Paolo Valente

Hi folks,

I earlier left some unanswered questions in my MMC to MQ conversion series
but I figured it is better if I collect them and ask the blk-mq
maintainers directly
how to deal with the following situations that occur in the MMC block layer:

1. The current MMC code locks the host when the first request comes in
from blk_fetch_request() and unlocks it when blk_fetch_request() returns
NULL twice in a row. Then the polling thread terminated and is not restarted
until we get called by the mmc_request_fn.

Host locking means that we will not send other commands to the MMC
card from i.e. userspace, which sometimes can send spurious stuff orthogonal
to the block layer. If the block layer has locked the host, userspace
has to wait
and vice versa. It is not a common contention point but it still happens.

In MQ, I have simply locked the host on the first request and then I never
release it. Clearly this does not work. I am uncertain on how to handle this
and whether MQ has a way to tell us that the queue is empty so we may release
the host. I toyed with the idea to just set up a timer, but a "queue
empty" callback
from the block layer is what would be ideal.

2. When MMC cards are ejected a serious error condition occurs. So for this
reason we spool out the queue with

req->rq_flags |= RQF_QUIET;
blk_end_request_all(req, -EIO);

This will shut up a huge amount of console errors for example.
I have no clue on how to manage this with MQ. I am currently using

blk_mq_complete_request(mq_rq->req, -EIO);

and nothing else, and it will hit all requests for the ejected card coming
in from this point. Is this the right solution? Or is there some medium
eject handling I'm not aware of inside the MQ layer? It seems like something
that can happen on pluggable harddrives and CDROMS and what not.

3. Sometimes a read or write gets partially completed. Say we read 12 out
of 15 sectors or somthing like that. I have no idea how often this occurs in
practice. With the old block layer we did this:

blk_end_request(req, 0, bytes_xfered);

It seems MQ cannot handle partially completed transfers. I currently do this:

blk_mq_requeue_request(req, false);

I do not feel that is the right thing to do either, and would like
recommendations
on how to proceed with this.

Yours,
Linus Walleij

^ permalink raw reply

* Re: [PATCH v3 0/4] block: misc changes
From: Ming Lei @ 2017-03-29  2:11 UTC (permalink / raw)
  To: Jens Axboe, linux-block, Christoph Hellwig
  Cc: Bart Van Assche, Hannes Reinecke
In-Reply-To: <20170327120658.29864-1-tom.leiming@gmail.com>

On Mon, Mar 27, 2017 at 08:06:54PM +0800, Ming Lei wrote:
> Hi,
> 
> The 1st  patch add comments on blk-mq races with timeout handler.
> 
> The other 3 patches improves handling for dying queue:
> 	- the 2nd one adds one barrier in blk_queue_enter() for
> 	avoiding hanging caused by out-of-order
> 	- the 3rd and 4th patches block new I/O entering queue
> 	after queue is set as dying
> 
> V3:
> 	- tweak comments as suggested by Bart Van Assche

Hi Jens,

This patchset has been posted several times without real code change
for a while, could you consider it for v4.12?

Thanks,
Ming

^ permalink raw reply

* Re: [PATCH] block-mq: don't re-queue if we get a queue error
From: Ming Lei @ 2017-03-29  2:04 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-block, kernel-team
In-Reply-To: <1490733472-3088-1-git-send-email-jbacik@fb.com>

On Tue, Mar 28, 2017 at 04:37:52PM -0400, Josef Bacik wrote:
> When try to issue a request directly and we fail we will requeue the
> request, but call blk_mq_end_request() as well.  This leads to the
> completed request being on a queuelist and getting ended twice, which
> causes list corruption in schedulers and other shenanigans.
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
>  block/blk-mq.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 08a49c6..c5a6985 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1457,8 +1457,6 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
>  		return;
>  	}
>  
> -	__blk_mq_requeue_request(rq);
> -
>  	if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
>  		*cookie = BLK_QC_T_NONE;
>  		rq->errors = -EIO;
> @@ -1466,6 +1464,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
>  		return;
>  	}
>  
> +	__blk_mq_requeue_request(rq);
>  insert:
>  	blk_mq_sched_insert_request(rq, false, true, false, may_sleep);
>  }

Reviewed-by: Ming Lei <tom.leiming@gmail.com>

-- 
Ming

^ permalink raw reply

* [PATCH v2 2/3] blk-mq: fix leak of q->stats
From: Omar Sandoval @ 2017-03-28 23:12 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: kernel-team
In-Reply-To: <91d938665e5386f10f8756ee8c7ef3e6496ea260.1490742717.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

blk_alloc_queue_node() already allocates q->stats, so
blk_mq_init_allocated_queue() is overwriting it with a new allocation.

Fixes: a83b576c9c25 ("block: fix stacked driver stats init and free")
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 block/blk-mq.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7df9dbfab022..00eeef31db30 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2212,10 +2212,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
-	q->stats = blk_alloc_queue_stats();
-	if (!q->stats)
-		goto err_exit;
-
 	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
 					     blk_stat_rq_ddir, 2, q);
 	if (!q->poll_cb)
-- 
2.12.1

^ permalink raw reply related

* [PATCH v2 1/3] block: warn if sharing request queue across gendisks
From: Omar Sandoval @ 2017-03-28 23:12 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: kernel-team

From: Omar Sandoval <osandov@fb.com>

Now that the remaining drivers have been converted to one request queue
per gendisk, let's warn if a request queue gets registered more than
once. This will catch future drivers which might do it inadvertently or
any old drivers that I may have missed.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 block/blk-sysfs.c      | 7 +++++++
 include/linux/blkdev.h | 1 +
 2 files changed, 8 insertions(+)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7f090dd15ca6..833fb7f9ce9d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -871,6 +871,11 @@ int blk_register_queue(struct gendisk *disk)
 	if (WARN_ON(!q))
 		return -ENXIO;
 
+	WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
+		  "%s is registering an already registered queue\n",
+		  kobject_name(&dev->kobj));
+	queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);
+
 	/*
 	 * SCSI probing may synchronously create and destroy a lot of
 	 * request_queues for non-existent devices.  Shutting down a fully
@@ -931,6 +936,8 @@ void blk_unregister_queue(struct gendisk *disk)
 	if (WARN_ON(!q))
 		return;
 
+	queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
+
 	if (q->mq_ops)
 		blk_mq_unregister_dev(disk_to_dev(disk), q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1a7dc42a8918..a2dc6b390d48 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -617,6 +617,7 @@ struct request_queue {
 #define QUEUE_FLAG_STATS       27	/* track rq completion times */
 #define QUEUE_FLAG_RESTART     28	/* queue needs restart at completion */
 #define QUEUE_FLAG_POLL_STATS  29	/* collecting stats for hybrid polling */
+#define QUEUE_FLAG_REGISTERED  30	/* queue has been registered to a disk */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
-- 
2.12.1

^ permalink raw reply related

* [PATCH v2 3/3] block: fix leak of q->rq_wb
From: Omar Sandoval @ 2017-03-28 23:12 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: kernel-team
In-Reply-To: <91d938665e5386f10f8756ee8c7ef3e6496ea260.1490742717.git.osandov@fb.com>

From: Omar Sandoval <osandov@fb.com>

CONFIG_DEBUG_TEST_DRIVER_REMOVE found a possible leak of q->rq_wb when a
request queue is reregistered. This has been a problem since wbt was
introduced, but the WARN_ON(!list_empty(&stats->callbacks)) in the
blk-stat rework exposed it. Fix it by cleaning up wbt when we unregister
the queue.

Fixes: 87760e5eef35 ("block: hook up writeback throttling")
Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 block/blk-sysfs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 833fb7f9ce9d..45854266e398 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -795,7 +795,6 @@ static void blk_release_queue(struct kobject *kobj)
 	struct request_queue *q =
 		container_of(kobj, struct request_queue, kobj);
 
-	wbt_exit(q);
 	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
 		blk_stat_remove_callback(q, q->poll_cb);
 	blk_stat_free_callback(q->poll_cb);
@@ -938,6 +937,9 @@ void blk_unregister_queue(struct gendisk *disk)
 
 	queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
 
+	wbt_exit(q);
+
+
 	if (q->mq_ops)
 		blk_mq_unregister_dev(disk_to_dev(disk), q);
 
-- 
2.12.1

^ permalink raw reply related

* Re: [PATCH v5] blkcg: allocate struct blkcg_gq outside request queue spinlock
From: Tahsin Erdogan @ 2017-03-28 22:01 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tejun Heo, linux-block, David Rientjes, linux-kernel
In-Reply-To: <835b6175-6f18-164c-2363-74023ae6cbcb@kernel.dk>

Thanks!

On Tue, Mar 28, 2017 at 2:59 PM, Jens Axboe <axboe@kernel.dk> wrote:
> On 03/09/2017 01:05 AM, Tahsin Erdogan wrote:
>> blkg_conf_prep() currently calls blkg_lookup_create() while holding
>> request queue spinlock. This means allocating memory for struct
>> blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM
>> failures in call paths like below:
>>
>>   pcpu_alloc+0x68f/0x710
>>   __alloc_percpu_gfp+0xd/0x10
>>   __percpu_counter_init+0x55/0xc0
>>   cfq_pd_alloc+0x3b2/0x4e0
>>   blkg_alloc+0x187/0x230
>>   blkg_create+0x489/0x670
>>   blkg_lookup_create+0x9a/0x230
>>   blkg_conf_prep+0x1fb/0x240
>>   __cfqg_set_weight_device.isra.105+0x5c/0x180
>>   cfq_set_weight_on_dfl+0x69/0xc0
>>   cgroup_file_write+0x39/0x1c0
>>   kernfs_fop_write+0x13f/0x1d0
>>   __vfs_write+0x23/0x120
>>   vfs_write+0xc2/0x1f0
>>   SyS_write+0x44/0xb0
>>   entry_SYSCALL_64_fastpath+0x18/0xad
>>
>> In the code path above, percpu allocator cannot call vmalloc() due to
>> queue spinlock.
>>
>> A failure in this call path gives grief to tools which are trying to
>> configure io weights. We see occasional failures happen shortly after
>> reboots even when system is not under any memory pressure. Machines
>> with a lot of cpus are more vulnerable to this condition.
>>
>> Update blkg_create() function to temporarily drop the rcu and queue
>> locks when it is allowed by gfp mask.
>
> Added for 4.12, thanks for your persistence in pulling this through.
>
> --
> Jens Axboe
>

^ permalink raw reply

* Re: [PATCH v5] blkcg: allocate struct blkcg_gq outside request queue spinlock
From: Jens Axboe @ 2017-03-28 21:59 UTC (permalink / raw)
  To: Tahsin Erdogan, Tejun Heo; +Cc: linux-block, David Rientjes, linux-kernel
In-Reply-To: <20170309080531.9048-1-tahsin@google.com>

On 03/09/2017 01:05 AM, Tahsin Erdogan wrote:
> blkg_conf_prep() currently calls blkg_lookup_create() while holding
> request queue spinlock. This means allocating memory for struct
> blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM
> failures in call paths like below:
> 
>   pcpu_alloc+0x68f/0x710
>   __alloc_percpu_gfp+0xd/0x10
>   __percpu_counter_init+0x55/0xc0
>   cfq_pd_alloc+0x3b2/0x4e0
>   blkg_alloc+0x187/0x230
>   blkg_create+0x489/0x670
>   blkg_lookup_create+0x9a/0x230
>   blkg_conf_prep+0x1fb/0x240
>   __cfqg_set_weight_device.isra.105+0x5c/0x180
>   cfq_set_weight_on_dfl+0x69/0xc0
>   cgroup_file_write+0x39/0x1c0
>   kernfs_fop_write+0x13f/0x1d0
>   __vfs_write+0x23/0x120
>   vfs_write+0xc2/0x1f0
>   SyS_write+0x44/0xb0
>   entry_SYSCALL_64_fastpath+0x18/0xad
> 
> In the code path above, percpu allocator cannot call vmalloc() due to
> queue spinlock.
> 
> A failure in this call path gives grief to tools which are trying to
> configure io weights. We see occasional failures happen shortly after
> reboots even when system is not under any memory pressure. Machines
> with a lot of cpus are more vulnerable to this condition.
> 
> Update blkg_create() function to temporarily drop the rcu and queue
> locks when it is allowed by gfp mask.

Added for 4.12, thanks for your persistence in pulling this through.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock
From: Tejun Heo @ 2017-03-28 21:53 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tahsin Erdogan, linux-block, David Rientjes, linux-kernel
In-Reply-To: <5b9ab59c-4121-d49a-1dc5-bd419f3ac94f@kernel.dk>

On Fri, Mar 24, 2017 at 04:04:32PM -0600, Jens Axboe wrote:
> On 03/24/2017 03:56 PM, Tahsin Erdogan wrote:
> > blkg_conf_prep() currently calls blkg_lookup_create() while holding
> > request queue spinlock. This means allocating memory for struct
> > blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM
> > failures in call paths like below:
> > 
> >   pcpu_alloc+0x68f/0x710
> >   __alloc_percpu_gfp+0xd/0x10
> >   __percpu_counter_init+0x55/0xc0
> >   cfq_pd_alloc+0x3b2/0x4e0
> >   blkg_alloc+0x187/0x230
> >   blkg_create+0x489/0x670
> >   blkg_lookup_create+0x9a/0x230
> >   blkg_conf_prep+0x1fb/0x240
> >   __cfqg_set_weight_device.isra.105+0x5c/0x180
> >   cfq_set_weight_on_dfl+0x69/0xc0
> >   cgroup_file_write+0x39/0x1c0
> >   kernfs_fop_write+0x13f/0x1d0
> >   __vfs_write+0x23/0x120
> >   vfs_write+0xc2/0x1f0
> >   SyS_write+0x44/0xb0
> >   entry_SYSCALL_64_fastpath+0x18/0xad
> > 
> > In the code path above, percpu allocator cannot call vmalloc() due to
> > queue spinlock.
> > 
> > A failure in this call path gives grief to tools which are trying to
> > configure io weights. We see occasional failures happen shortly after
> > reboots even when system is not under any memory pressure. Machines
> > with a lot of cpus are more vulnerable to this condition.
> > 
> > Do struct blkcg_gq allocations outside the queue spinlock to allow
> > blocking during memory allocations.
> 
> This looks much simpler/cleaner to me, compared to v5. Tejun, what do
> you think?

So, this patch in itself looks better but now we end up with two
separate mechanisms to handle non-atomic allocations.  This drop lock
/ alloc / relock / check invariants in the main path and preallocation
logic used in the init path.  Right now, both proposed implementations
aren't that satisfactory.  Personally, I'd prefer superficial ugliness
to structural duplications, but, ideally, we shouldn't have to make
this choice.  idk, it's a bug fix.  We can always clean things up
later.

Acked-by: Tejun Heo <tj@kernel.org>

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH 2/2] blk-stat: add a poll_size value to the request_queue struct
From: Stephen  Bates @ 2017-03-28 21:52 UTC (permalink / raw)
  To: Jens Axboe, Sagi Grimberg
  Cc: linux-block@vger.kernel.org, Damien.LeMoal@wdc.com,
	osandov@osandov.com, linux-nvme@lists.infradead.org
In-Reply-To: <fd49a447-153e-cad9-63ca-216ab6db7fb8@kernel.dk>

W1JldHJ5aW5nIGFzIG15IG5ldyBzZXR1cCBzZWNyZXRseSBjb252ZXJ0ZWQgdG8gaHRtbCBmb3Jt
YXQgd2l0aG91dCB0ZWxsaW5nIG1lLiBBcG9sb2dpZXMgZm9yIHRoZSByZXNlbmQuXQ0KDQo+Pj4g
DQo+Pj4gVGhhbmtzIGZvciB0aGUgcmV2aWV3IFNhZ2kuIEnigJlkIGJlIE9LIGdvaW5nIHdpdGgg
PD0wIGFzIHRoZSBleGFjdA0KPj4+IG1hdGNoIHdvdWxkIG5vcm1hbGx5IGJlIGZvciBtaW5pbWFs
IElPIHNpemVzICh3aGVyZSA8PSBhbmQgPSBhcmUgdGhlDQo+Pj4gc2FtZSB0aGluZykuIEkgd2ls
bCBzZWUgd2hhdCBvdGhlciBmZWVkYmFjayBJIGdldCBhbmQgYWltIHRvIGRvIGENCj4+PiByZXNw
aW4gc29vbuKApg0KPj4gDQo+PiBObyB0dW5hYmxlcyBmb3IgdGhpcywgcGxlYXNlLiBUaGVyZSdz
IGFic29sdXRlbHkgbm8gcmVhc29uIHdoeSB3ZSBzaG91bGQNCj4+IG5lZWQgaXQuDQo+DQo+IEpl
bnMg4oCTIGJ5IHRoaXMgeW91IG1lYW4geW91IHdhbnQgdG8gb25seSBidWNrZXQgSU8gdGhhdCBh
cmUgZXhhY3RseQ0KPiB0aGUgbWluaW11bSBibG9jayBzaXplIHN1cHBvcnRlZCBieSB0aGUgdW5k
ZXJseWluZyBibG9jayBkZXZpY2U/IEkgd2FzDQo+IGVudmlzaW9uaW5nIHdlIG1pZ2h0IHdhbnQg
dG8gcmVsYXggdGhhdCBpbiBjZXJ0YWluIGNhc2VzIChlLmcuIGJ1Y2tldA0KPiA0S0IgYW5kIGJl
bG93IGdvaW5nIHRvIGEgNTEyQiBkZXZpY2UpLg0KIA0KPiBTb3JyeSwgdGhlIGFib3ZlIHdhcyBh
IGJpdCB0ZXJzZS4gSSB0aGluayBhIG11Y2ggYmV0dGVyIHNvbHV0aW9uIHdvdWxkDQo+IGJlIHRv
IGNyZWF0ZSBhIG51bWJlciBvZiBidWNrZXRzIChwZXIgZGF0YSBkaXJlY3Rpb24pIGFuZCBkbyBz
dGF0cyBvbg0KPiBhbGwgb2YgdGhlbS4gVGhlIGJ1Y2tldHMgd291bGQgY292ZXIgYSByZWFzb25h
YmxlIHJhbmdlIG9mIHJlcXVlc3QNCj4gc2l6ZXMuIFRoZW4gd2hlbiB5b3UgcG9sbCBmb3IgYSBn
aXZlbiByZXF1ZXN0LCB3ZSBjYW4gYmFzZSBvdXIgdGltZWQNCj4gbnVtYmVyIG9uIHRoZSBkYXRh
IGRpcmVjdGlvbiBBTkQgc2l6ZSBvZiBpdC4gWW91IGNhbiBnZXQgcHJldHR5IGZhciB3aXRoDQo+
IGEgZmV3IGJ1Y2tldHM6DQo+IA0KPiA1MTJiDQo+IDRrDQo+IDhrDQo+IDE2aw0KPiAzMmsNCj4g
NjRrDQo+IDEyOGsNCj4gDQo+IGFuZCB5b3UgY291bGQgZXZlbiBoYXZlIHlvdXIgdGltZSBlc3Rp
bWF0aW9uIGZ1bmN0aW9uIHR1cm4gdGhlc2UgaW50bw0KPiBzb21ldGhpbmcgc2FuZS4gT3IganVz
dCB1c2UgYSBjb21wb3NpdGUgb2YgYnVja2V0cywgaWYgeW91IHdpc2guDQogDQpJIGRpZCBnbyBk
b3duIHRoaXMgcGF0aCBpbml0aWFsbHkgYnV0IHRoZW4gbW92ZWQgYXdheSBmcm9tIGl0IHNpbmNl
IHdlIHdlcmUgZm9jdXNpbmcgb25seSBvbiB0aGUgc21hbGxlciBJTyBzaXplLiBIb3dldmVyIEkg
Y2FuIGRlZmluaXRlbHkgdGFrZSBhIGxvb2sgYXQgdGhpcyBhZ2FpbiBhcyBJIGFncmVlIHRoYXQg
aXQgY291bGQgYmUgbW9yZSB1c2VmdWwgaW4gdGhlIGxvbmcgcnVuLg0KIA0KSSB3b3VsZCBsaWtl
IHRvIGtlZXAgbXkgZmlyc3QgcGF0Y2ggaW4gdGhpcyBzZXJpZXMgYWxpdmUgc2luY2UgSSBkbyB0
aGluayBoYXZpbmcgdGhlIG9wdGlvbiB0byBub3QgYnVja2V0IGFuIElPIGlzIGEgdXNlZnVsIHRo
aW5nIHRvIGhhdmUuDQogDQpJ4oCZbGwgdGFrZSBhbGwgdGhlIGZlZWRiYWNrIHRvIGRhdGUgYW5k
IHdvcmsgb24gYSB2Mi4gVGhhbmtzIQ0KIA0KU3RlcGhlbg0KDQoNCg0K

^ permalink raw reply

* Re: [PATCH 2/2] blk-stat: add a poll_size value to the request_queue struct
From: Stephen  Bates @ 2017-03-28 21:49 UTC (permalink / raw)
  To: Jens Axboe, Sagi Grimberg
  Cc: linux-block@vger.kernel.org, Damien.LeMoal@wdc.com,
	osandov@osandov.com, linux-nvme@lists.infradead.org
In-Reply-To: <fd49a447-153e-cad9-63ca-216ab6db7fb8@kernel.dk>

[-- Attachment #1: Type: text/plain, Size: 1711 bytes --]

>>>

>>> Thanks for the review Sagi. I’d be OK going with <=0 as the exact

>>> match would normally be for minimal IO sizes (where <= and = are the

>>> same thing). I will see what other feedback I get and aim to do a

>>> respin soon…

>>

>> No tunables for this, please. There's absolutely no reason why we should

>> need it.

>

> Jens – by this you mean you want to only bucket IO that are exactly

> the minimum block size supported by the underlying block device? I was

> envisioning we might want to relax that in certain cases (e.g. bucket

> 4KB and below going to a 512B device).



> Sorry, the above was a bit terse. I think a much better solution would

> be to create a number of buckets (per data direction) and do stats on

> all of them. The buckets would cover a reasonable range of request

> sizes. Then when you poll for a given request, we can base our timed

> number on the data direction AND size of it. You can get pretty far with

> a few buckets:

>

> 512b

> 4k

> 8k

> 16k

> 32k

> 64k

> 128k

>

> and you could even have your time estimation function turn these into

> something sane. Or just use a composite of buckets, if you wish.



I did go down this path initially but then moved away from it since we were focusing only on the smaller IO size. However I can definitely take a look at this again as I agree that it could be more useful in the long run.



I would like to keep my first patch in this series alive since I do think having the option to not bucket an IO is a useful thing to have.



I’ll take all the feedback to date and work on a v2. Thanks!



Stephen





[-- Attachment #2: Type: text/html, Size: 5137 bytes --]

^ permalink raw reply

* Re: [PATCH 0/6] block: convert remaining drivers which share a request queue
From: Jens Axboe @ 2017-03-28 21:07 UTC (permalink / raw)
  To: Omar Sandoval; +Cc: Vivek Goyal, linux-block, kernel-team
In-Reply-To: <cover.1490681910.git.osandov@fb.com>

On Mon, Mar 27 2017, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> This is clearly the pinnacle of my career: converting all remaining
> block drivers which share a request queue across gendisks to use a
> separate request queue per gendisk. These are all compile tested (but
> the last two platform-specific ones involved hacking the Kconfig and
> commenting out a bunch of arch-dependent code to get the rest to
> compile), no runtime testing at all.
> 
> Let me know if I missed any. Even better, let me know if we can just
> delete some of these entirely.
> 
It's all downhill from here, Omar, I don't see how you are ever going to
top this in terms of impact.

Applied for 4.12!

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH 2/2] blk-stat: add a poll_size value to the request_queue struct
From: Jens Axboe @ 2017-03-28 20:38 UTC (permalink / raw)
  To: Stephen Bates, Sagi Grimberg
  Cc: linux-block@vger.kernel.org, Damien.LeMoal@wdc.com,
	osandov@osandov.com, linux-nvme@lists.infradead.org
In-Reply-To: <C52E2CE3-0F92-40EA-902F-A9B70B145F1A@raithlin.com>

On 03/28/2017 01:58 PM, Stephen  Bates wrote:
>>>
>>> Thanks for the review Sagi. I’d be OK going with <=0 as the exact
>>> match would normally be for minimal IO sizes (where <= and = are the
>>> same thing). I will see what other feedback I get and aim to do a
>>> respin soon…
>>
>> No tunables for this, please. There's absolutely no reason why we should
>> need it.
> 
> Jens – by this you mean you want to only bucket IO that are exactly
> the minimum block size supported by the underlying block device? I was
> envisioning we might want to relax that in certain cases (e.g. bucket
> 4KB and below going to a 512B device).

Sorry, the above was a bit terse. I think a much better solution would
be to create a number of buckets (per data direction) and do stats on
all of them. The buckets would cover a reasonable range of request
sizes. Then when you poll for a given request, we can base our timed
number on the data direction AND size of it. You can get pretty far with
a few buckets:

512b
4k
8k
16k
32k
64k
128k

and you could even have your time estimation function turn these into
something sane. Or just use a composite of buckets, if you wish.

-- 
Jens Axboe

^ permalink raw reply

* [PATCH] block-mq: don't re-queue if we get a queue error
From: Josef Bacik @ 2017-03-28 20:37 UTC (permalink / raw)
  To: linux-block, kernel-team

When try to issue a request directly and we fail we will requeue the
request, but call blk_mq_end_request() as well.  This leads to the
completed request being on a queuelist and getting ended twice, which
causes list corruption in schedulers and other shenanigans.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 block/blk-mq.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 08a49c6..c5a6985 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1457,8 +1457,6 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
 		return;
 	}
 
-	__blk_mq_requeue_request(rq);
-
 	if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
 		*cookie = BLK_QC_T_NONE;
 		rq->errors = -EIO;
@@ -1466,6 +1464,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
 		return;
 	}
 
+	__blk_mq_requeue_request(rq);
 insert:
 	blk_mq_sched_insert_request(rq, false, true, false, may_sleep);
 }
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH 2/2] blk-stat: add a poll_size value to the request_queue struct
From: Stephen  Bates @ 2017-03-28 19:58 UTC (permalink / raw)
  To: Jens Axboe, Sagi Grimberg
  Cc: linux-block@vger.kernel.org, Damien.LeMoal@wdc.com,
	osandov@osandov.com, linux-nvme@lists.infradead.org
In-Reply-To: <88815dc6-04b2-0915-597d-1e335b6d2ba2@kernel.dk>

Pj4gDQo+PiBUaGFua3MgZm9yIHRoZSByZXZpZXcgU2FnaS4gSeKAmWQgYmUgT0sgZ29pbmcgd2l0
aCA8PTAgYXMgdGhlIGV4YWN0DQo+PiBtYXRjaCB3b3VsZCBub3JtYWxseSBiZSBmb3IgbWluaW1h
bCBJTyBzaXplcyAod2hlcmUgPD0gYW5kID0gYXJlIHRoZQ0KPj4gc2FtZSB0aGluZykuIEkgd2ls
bCBzZWUgd2hhdCBvdGhlciBmZWVkYmFjayBJIGdldCBhbmQgYWltIHRvIGRvIGENCj4+IHJlc3Bp
biBzb29u4oCmDQo+DQo+IE5vIHR1bmFibGVzIGZvciB0aGlzLCBwbGVhc2UuIFRoZXJlJ3MgYWJz
b2x1dGVseSBubyByZWFzb24gd2h5IHdlIHNob3VsZA0KPiBuZWVkIGl0Lg0KDQpKZW5zIOKAkyBi
eSB0aGlzIHlvdSBtZWFuIHlvdSB3YW50IHRvIG9ubHkgYnVja2V0IElPIHRoYXQgYXJlIGV4YWN0
bHkgdGhlIG1pbmltdW0gYmxvY2sgc2l6ZSBzdXBwb3J0ZWQgYnkgdGhlIHVuZGVybHlpbmcgYmxv
Y2sgZGV2aWNlPyBJIHdhcyBlbnZpc2lvbmluZyB3ZSBtaWdodCB3YW50IHRvIHJlbGF4IHRoYXQg
aW4gY2VydGFpbiBjYXNlcyAoZS5nLiBidWNrZXQgNEtCIGFuZCBiZWxvdyBnb2luZyB0byBhIDUx
MkIgZGV2aWNlKS4NCg0KU3RlcGhlbg0KDQoNCg==

^ permalink raw reply

* Re: [PATCH 2/2] blk-stat: add a poll_size value to the request_queue struct
From: Jens Axboe @ 2017-03-28 19:46 UTC (permalink / raw)
  To: Stephen Bates, Sagi Grimberg
  Cc: linux-block@vger.kernel.org, Damien.LeMoal@wdc.com,
	osandov@osandov.com, linux-nvme@lists.infradead.org
In-Reply-To: <25CE7F43-DD1B-42D6-A891-A3D916BD0877@raithlin.com>

On 03/28/2017 01:38 PM, Stephen  Bates wrote:
>>>
>>>  In order to bucket IO for the polling algorithm we use a sysfs entry
>>>  to set the filter value. It is signed and we will use that as follows:
>>>
>>>   0   : No filtering. All IO are considered in stat generation
>>>   > 0 : Filtering based on IO of exactly this size only.
>>>   < 0 : Filtering based on IO less than or equal to -1 time this value.>
>>
>> I'd say that this is a fairly non-trivial semantic meanning to this...
>>
>> Is there any use for the size exact match filter? If not then
>> I suggest we always make it (<=) in its semantics...
> 
> Thanks for the review Sagi. Iï¿½d be OK going with <=0 as the exact
> match would normally be for minimal IO sizes (where <= and = are the
> same thing). I will see what other feedback I get and aim to do a
> respin soonï¿½

No tunables for this, please. There's absolutely no reason why we should
need it.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH 2/2] blk-stat: add a poll_size value to the request_queue struct
From: Stephen  Bates @ 2017-03-28 19:38 UTC (permalink / raw)
  To: Sagi Grimberg, axboe@kernel.dk
  Cc: linux-block@vger.kernel.org, Damien.LeMoal@wdc.com,
	osandov@osandov.com, linux-nvme@lists.infradead.org
In-Reply-To: <44296e0a-fb04-888b-aa00-a9b01709e1b8@grimberg.me>

Pj4NCj4+ICBJbiBvcmRlciB0byBidWNrZXQgSU8gZm9yIHRoZSBwb2xsaW5nIGFsZ29yaXRobSB3
ZSB1c2UgYSBzeXNmcyBlbnRyeQ0KPj4gIHRvIHNldCB0aGUgZmlsdGVyIHZhbHVlLiBJdCBpcyBz
aWduZWQgYW5kIHdlIHdpbGwgdXNlIHRoYXQgYXMgZm9sbG93czoNCj4+DQo+PiAgIDAgICA6IE5v
IGZpbHRlcmluZy4gQWxsIElPIGFyZSBjb25zaWRlcmVkIGluIHN0YXQgZ2VuZXJhdGlvbg0KPj4g
ICA+IDAgOiBGaWx0ZXJpbmcgYmFzZWQgb24gSU8gb2YgZXhhY3RseSB0aGlzIHNpemUgb25seS4N
Cj4+ICAgPCAwIDogRmlsdGVyaW5nIGJhc2VkIG9uIElPIGxlc3MgdGhhbiBvciBlcXVhbCB0byAt
MSB0aW1lIHRoaXMgdmFsdWUuPg0KPg0KPiBJJ2Qgc2F5IHRoYXQgdGhpcyBpcyBhIGZhaXJseSBu
b24tdHJpdmlhbCBzZW1hbnRpYyBtZWFubmluZyB0byB0aGlzLi4uDQo+DQo+IElzIHRoZXJlIGFu
eSB1c2UgZm9yIHRoZSBzaXplIGV4YWN0IG1hdGNoIGZpbHRlcj8gSWYgbm90IHRoZW4NCj4gSSBz
dWdnZXN0IHdlIGFsd2F5cyBtYWtlIGl0ICg8PSkgaW4gaXRzIHNlbWFudGljcy4uLg0KDQpUaGFu
a3MgZm9yIHRoZSByZXZpZXcgU2FnaS4gSeKAmWQgYmUgT0sgZ29pbmcgd2l0aCA8PTAgYXMgdGhl
IGV4YWN0IG1hdGNoIHdvdWxkIG5vcm1hbGx5IGJlIGZvciBtaW5pbWFsIElPIHNpemVzICh3aGVy
ZSA8PSBhbmQgPSBhcmUgdGhlIHNhbWUgdGhpbmcpLiBJIHdpbGwgc2VlIHdoYXQgb3RoZXIgZmVl
ZGJhY2sgSSBnZXQgYW5kIGFpbSB0byBkbyBhIHJlc3BpbiBzb29u4oCmDQoNClN0ZXBoZW4NCg0K

^ permalink raw reply

* Re: [PATCH 03/23] sd: implement REQ_OP_WRITE_ZEROES
From: Mike Snitzer @ 2017-03-28 19:33 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: agk@redhat.com, lars.ellenberg@linbit.com, hch@lst.de,
	martin.petersen@oracle.com, philipp.reisner@linbit.com,
	axboe@kernel.dk, shli@kernel.org, linux-scsi@vger.kernel.org,
	dm-devel@redhat.com, drbd-dev@lists.linbit.com,
	linux-block@vger.kernel.org, linux-raid@vger.kernel.org
In-Reply-To: <1490726988.2573.16.camel@sandisk.com>

On Tue, Mar 28 2017 at  2:50pm -0400,
Bart Van Assche <Bart.VanAssche@sandisk.com> wrote:

> On Thu, 2017-03-23 at 10:33 -0400, Christoph Hellwig wrote:
> > diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> > index af632e350ab4..b6f70a09a301 100644
> > --- a/drivers/scsi/sd.c
> > +++ b/drivers/scsi/sd.c
> > @@ -748,7 +748,7 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
> >  	return scsi_init_io(cmd);
> >  }
> >  
> > -static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd)
> > +static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
> >  {
> >  	struct scsi_device *sdp = cmd->device;
> >  	struct request *rq = cmd->request;
> > @@ -765,13 +765,14 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd)
> >  
> >  	cmd->cmd_len = 16;
> >  	cmd->cmnd[0] = WRITE_SAME_16;
> > -	cmd->cmnd[1] = 0x8; /* UNMAP */
> > +	if (unmap)
> > +		cmd->cmnd[1] = 0x8; /* UNMAP */
> >  	put_unaligned_be64(sector, &cmd->cmnd[2]);
> >  	put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
> 
> Hello Christoph,
> 
> A quote from SBC: "An OPTIMAL UNMAP GRANULARITY field set to a non-zero value
> indicates the optimal granularity in logical blocks for unmap requests (e.g.,
> an UNMAP command or a WRITE SAME (16) command with the UNMAP bit set to one).
> An unmap request with a number of logical blocks that is not a multiple of
> this value may result in unmap operations on fewer LBAs than requested."
> 
> This means that just like the start and end of a discard must be aligned on a
> discard_granularity boundary, WRITE SAME commands with the UNMAP bit set must
> also respect that granularity. I think this means that either
> __blkdev_issue_zeroout() has to be modified such that it rejects unaligned
> REQ_OP_WRITE_ZEROES operations or that blk_bio_write_same_split() has to be
> modified such that it generates REQ_OP_WRITEs for the unaligned start and tail.

That'd get DM thinp off the hook from having to zero the unaligned start
and tail...

^ permalink raw reply

* Re: [PATCH 03/23] sd: implement REQ_OP_WRITE_ZEROES
From: Bart Van Assche @ 2017-03-28 18:50 UTC (permalink / raw)
  To: agk@redhat.com, lars.ellenberg@linbit.com, snitzer@redhat.com,
	hch@lst.de, martin.petersen@oracle.com,
	philipp.reisner@linbit.com, axboe@kernel.dk, shli@kernel.org
  Cc: linux-scsi@vger.kernel.org, dm-devel@redhat.com,
	drbd-dev@lists.linbit.com, linux-block@vger.kernel.org,
	linux-raid@vger.kernel.org
In-Reply-To: <20170323143341.31549-4-hch@lst.de>

On Thu, 2017-03-23 at 10:33 -0400, Christoph Hellwig wrote:
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index af632e350ab4..b6f70a09a301 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -748,7 +748,7 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
>  	return scsi_init_io(cmd);
>  }
> =20
> -static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd)
> +static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
>  {
>  	struct scsi_device *sdp =3D cmd->device;
>  	struct request *rq =3D cmd->request;
> @@ -765,13 +765,14 @@ static int sd_setup_write_same16_cmnd(struct scsi_c=
mnd *cmd)
> =20
>  	cmd->cmd_len =3D 16;
>  	cmd->cmnd[0] =3D WRITE_SAME_16;
> -	cmd->cmnd[1] =3D 0x8; /* UNMAP */
> +	if (unmap)
> +		cmd->cmnd[1] =3D 0x8; /* UNMAP */
>  	put_unaligned_be64(sector, &cmd->cmnd[2]);
>  	put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);

Hello Christoph,

A quote from SBC: "An OPTIMAL UNMAP GRANULARITY field set to a non-zero val=
ue
indicates the optimal granularity in logical blocks for unmap requests (e.g=
.,
an UNMAP command or a WRITE SAME (16) command with the UNMAP bit set to one=
).
An unmap request with a number of logical blocks that is not a multiple of
this value may result in unmap operations on fewer LBAs than requested."

This means that just like the start and end of a discard must be aligned on=
 a
discard_granularity boundary, WRITE SAME commands with the UNMAP bit set mu=
st
also respect that granularity. I think this means that either
__blkdev_issue_zeroout() has to be modified such that it rejects unaligned
REQ_OP_WRITE_ZEROES operations or that blk_bio_write_same_split() has to be
modified such that it generates REQ_OP_WRITEs for the unaligned start and t=
ail.

Bart.=

^ permalink raw reply

* Re: [PATCH 23/23] block: remove the discard_zeroes_data flag
From: Bart Van Assche @ 2017-03-28 17:00 UTC (permalink / raw)
  To: agk@redhat.com, lars.ellenberg@linbit.com, snitzer@redhat.com,
	hch@lst.de, martin.petersen@oracle.com,
	philipp.reisner@linbit.com, axboe@kernel.dk, shli@kernel.org
  Cc: linux-scsi@vger.kernel.org, dm-devel@redhat.com,
	drbd-dev@lists.linbit.com, linux-block@vger.kernel.org,
	linux-raid@vger.kernel.org
In-Reply-To: <20170323143341.31549-24-hch@lst.de>

On Thu, 2017-03-23 at 10:33 -0400, Christoph Hellwig wrote:
> Now that we use the proper REQ_OP_WRITE_ZEROES operation everywhere we ca=
n
> kill this hack.
>=20
> [ ... ]
>
> diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/te=
sting/sysfs-block
> index 2da04ce6aeef..dea212db9df3 100644
> --- a/Documentation/ABI/testing/sysfs-block
> +++ b/Documentation/ABI/testing/sysfs-block
> @@ -213,14 +213,8 @@ What:		/sys/block/<disk>/queue/discard_zeroes_data
>  Date:		May 2011
>  Contact:	Martin K. Petersen <martin.petersen@oracle.com>
>  Description:
> -		Devices that support discard functionality may return
> -		stale or random data when a previously discarded block
> -		is read back. This can cause problems if the filesystem
> -		expects discarded blocks to be explicitly cleared. If a
> -		device reports that it deterministically returns zeroes
> -		when a discarded area is read the discard_zeroes_data
> -		parameter will be set to one. Otherwise it will be 0 and
> -		the result of reading a discarded area is undefined.
> +		Will always return 0.  Don't rely on any specific behavior
> +		for discards, and don't read this file.
> =20
>  What:		/sys/block/<disk>/queue/write_same_max_bytes
>  Date:		January 2012
>
> [ ... ]
>
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -208,7 +208,7 @@ static ssize_t queue_discard_max_store(struct request=
_queue *q,
> =20
>  static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, c=
har *page)
>  {
> -	return queue_var_show(queue_discard_zeroes_data(q), page);
> +	return 0;
>  }

Hello Christoph,

It seems to me like the documentation in Documentation/ABI/testing/sysfs-bl=
ock
and the above code are not in sync. I think the above code will cause readi=
ng
from the discard_zeroes_data attribute to return an empty string ("") inste=
ad
of "0\n".

BTW, my personal preference is to remove this attribute entirely because ke=
eping
it will cause confusion, no matter how well we document the behavior of thi=
s
attribute.

Thanks,

Bart.=

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox