Linux block layer

Linux block layer
 help / color / mirror / Atom feed

* [PATCH 8/9] nowait aio: btrfs
From: Goldwyn Rodrigues @ 2017-04-11 14:26 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: jack, hch, linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi,
	avi, axboe, linux-api, willy, tom.leiming, Goldwyn Rodrigues
In-Reply-To: <20170411142619.27205-1-rgoldwyn@suse.de>

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

Return EAGAIN if any of the following checks fail
 + i_rwsem is not lockable
 + NODATACOW or PREALLOC is not set
 + Cannot nocow at the desired location
 + Writing beyond end of file which is not allocated

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/btrfs/file.c  | 25 ++++++++++++++++++++-----
 fs/btrfs/inode.c |  3 +++
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 520cb7230b2d..a870e5dd2b4d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1823,12 +1823,29 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	ssize_t num_written = 0;
 	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
 	ssize_t err;
-	loff_t pos;
-	size_t count;
+	loff_t pos = iocb->ki_pos;
+	size_t count = iov_iter_count(from);
 	loff_t oldsize;
 	int clean_page = 0;
 
-	inode_lock(inode);
+	if ((iocb->ki_flags & IOCB_NOWAIT) &&
+			(iocb->ki_flags & IOCB_DIRECT)) {
+		/* Don't sleep on inode rwsem */
+		if (!inode_trylock(inode))
+			return -EAGAIN;
+		/*
+		 * We will allocate space in case nodatacow is not set,
+		 * so bail
+		 */
+		if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+					      BTRFS_INODE_PREALLOC)) ||
+		    check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
+			inode_unlock(inode);
+			return -EAGAIN;
+		}
+	} else
+		inode_lock(inode);
+
 	err = generic_write_checks(iocb, from);
 	if (err <= 0) {
 		inode_unlock(inode);
@@ -1862,8 +1879,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	 */
 	update_time_for_write(inode);
 
-	pos = iocb->ki_pos;
-	count = iov_iter_count(from);
 	start_pos = round_down(pos, fs_info->sectorsize);
 	oldsize = i_size_read(inode);
 	if (start_pos > oldsize) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a18510be76c1..d91b21a76d6d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8627,6 +8627,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 			dio_data.overwrite = 1;
 			inode_unlock(inode);
 			relock = true;
+		} else if (iocb->ki_flags & IOCB_NOWAIT) {
+			ret = -EAGAIN;
+			goto out;
 		}
 		ret = btrfs_delalloc_reserve_space(inode, offset, count);
 		if (ret)
-- 
2.12.0

^ permalink raw reply related

* [PATCH 9/9] nowait aio: Return -EOPNOTSUPP if filesystem does not support
From: Goldwyn Rodrigues @ 2017-04-11 14:26 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: jack, hch, linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi,
	avi, axboe, linux-api, willy, tom.leiming, Goldwyn Rodrigues
In-Reply-To: <20170411142619.27205-1-rgoldwyn@suse.de>

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

The check is in generic_file_write_iter(), which is called by
most filesystems, either through fsops.write_iter() or through
the function defined by write_iter(). If not, we perform the
check in the defined .write_iter() function which is called
for direct IO.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/9p/vfs_file.c | 3 +++
 fs/ceph/file.c   | 3 +++
 fs/cifs/file.c   | 3 +++
 fs/fuse/file.c   | 3 +++
 fs/nfs/direct.c  | 3 +++
 fs/ocfs2/file.c  | 3 +++
 mm/filemap.c     | 3 +++
 7 files changed, 21 insertions(+)

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3de3b4a89d89..403681db7723 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -411,6 +411,9 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	loff_t origin;
 	int err = 0;
 
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EOPNOTSUPP;
+
 	retval = generic_write_checks(iocb, from);
 	if (retval <= 0)
 		return retval;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 26cc95421cca..af28419b1731 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1267,6 +1267,9 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	int err, want, got;
 	loff_t pos;
 
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EOPNOTSUPP;
+
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index aa3debbba826..a828ab3e7775 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2638,6 +2638,9 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
 	 * write request.
 	 */
 
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EOPNOTSUPP;
+
 	rc = generic_write_checks(iocb, from);
 	if (rc <= 0)
 		return rc;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ec238fb5a584..72786e798319 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1425,6 +1425,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
 	ssize_t res;
 
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EOPNOTSUPP;
+
 	if (is_bad_inode(inode))
 		return -EIO;
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index aab32fc3d6a8..ab419caebd5f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -991,6 +991,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
 		file, iov_iter_count(iter), (long long) iocb->ki_pos);
 
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EOPNOTSUPP;
+
 	result = generic_write_checks(iocb, iter);
 	if (result <= 0)
 		return result;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index bfeb647459d9..e7f8ba890305 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2235,6 +2235,9 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 	if (count == 0)
 		return 0;
 
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EOPNOTSUPP;
+
 	direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
 
 	inode_lock(inode);
diff --git a/mm/filemap.c b/mm/filemap.c
index 46e01b8f6880..48b83d1d4a30 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3026,6 +3026,9 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct inode *inode = file->f_mapping->host;
 	ssize_t ret;
 
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EOPNOTSUPP;
+
 	inode_lock(inode);
 	ret = generic_write_checks(iocb, from);
 	if (ret > 0)
-- 
2.12.0

^ permalink raw reply related

* [PATCH v5] lightnvn: pblk
From: Javier González @ 2017-04-11 14:31 UTC (permalink / raw)
  To: mb; +Cc: linux-block, linux-kernel, Bart.VanAssche, Javier González

Changes since v4:
* Rebase on top of Matias' for-4.12/core
* Fix type implicit conversions reported by sparse (reported by Bart Van
  Assche)
* Make error and debug statistics long atomic variables.

Changes since v3:
* Apply Bart's feedback [1]
* Implement dynamic L2P optimizations for > 32-bit physical media
  geometry (from Matias Bjørling)
* Fix memory leak on GC (Reported by Simon A. F. Lund)
* 8064 is a perfectly round number of lines :)

[1] https://lkml.org/lkml/2017/4/8/172

Changes since v2:
* Rebase on top of Matias' for-4.12/core
* Implement L2P scan recovery to recover L2P table in case of power
failure.
* Re-design disk format to be more flexible in future versions (from
 Matias Bjørling)
* Implement per-instance uuid to allow correct recovery without
 forcing line erases (from Matias Bjørling)
* Re-design GC threading to have several GC readers and a single
  writer that places data on the write buffer. This allows to maximize
  the GC write buffer budget without having unnecessary GC writers
  competing for the write buffer lock.
* Simplify sysfs interface.
* Refactoring and several code improvements (together with Matias
  Bjørling)

Changes since v1:
* Rebase on top of Matias' for-4.12/core
* Move from per-LUN block allocation to a line model. This means that a
  whole lines across all LUNs is allocated at a time. Data is still
  stripped in a round-robin fashion at a page granurality.
* Implement new disk format scheme, where metadata is stored per line
  instead of per LUN. This allows for space optimizations.
* Improvements on GC workqueue management and victim selection.
* Implement sysfs interface to query pblk's operation and statistics.
* Implement a user - GC I/O rate-limiter
* Various bug fixes

Javier González (1):
  lightnvm: physical block device (pblk) target

 Documentation/lightnvm/pblk.txt  |   21 +
 drivers/lightnvm/Kconfig         |    9 +
 drivers/lightnvm/Makefile        |    5 +
 drivers/lightnvm/pblk-cache.c    |  114 +++
 drivers/lightnvm/pblk-core.c     | 1655 ++++++++++++++++++++++++++++++++++++++
 drivers/lightnvm/pblk-gc.c       |  555 +++++++++++++
 drivers/lightnvm/pblk-init.c     |  949 ++++++++++++++++++++++
 drivers/lightnvm/pblk-map.c      |  139 ++++
 drivers/lightnvm/pblk-rb.c       |  852 ++++++++++++++++++++
 drivers/lightnvm/pblk-read.c     |  529 ++++++++++++
 drivers/lightnvm/pblk-recovery.c |  998 +++++++++++++++++++++++
 drivers/lightnvm/pblk-rl.c       |  182 +++++
 drivers/lightnvm/pblk-sysfs.c    |  507 ++++++++++++
 drivers/lightnvm/pblk-write.c    |  411 ++++++++++
 drivers/lightnvm/pblk.h          | 1121 ++++++++++++++++++++++++++
 15 files changed, 8047 insertions(+)
 create mode 100644 Documentation/lightnvm/pblk.txt
 create mode 100644 drivers/lightnvm/pblk-cache.c
 create mode 100644 drivers/lightnvm/pblk-core.c
 create mode 100644 drivers/lightnvm/pblk-gc.c
 create mode 100644 drivers/lightnvm/pblk-init.c
 create mode 100644 drivers/lightnvm/pblk-map.c
 create mode 100644 drivers/lightnvm/pblk-rb.c
 create mode 100644 drivers/lightnvm/pblk-read.c
 create mode 100644 drivers/lightnvm/pblk-recovery.c
 create mode 100644 drivers/lightnvm/pblk-rl.c
 create mode 100644 drivers/lightnvm/pblk-sysfs.c
 create mode 100644 drivers/lightnvm/pblk-write.c
 create mode 100644 drivers/lightnvm/pblk.h

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH 1/3] lightnvm: clean unused variable
From: Matias Bjørling @ 2017-04-11 14:35 UTC (permalink / raw)
  To: Javier González; +Cc: linux-block, linux-kernel, Javier González
In-Reply-To: <1491920338-2348-2-git-send-email-javier@cnexlabs.com>

On 04/11/2017 04:18 PM, Javier González wrote:
> Clean unused variable on lightnvm core.
>
> Signed-off-by: Javier González <javier@cnexlabs.com>
> ---
>  drivers/lightnvm/core.c | 3 ---
>  1 file changed, 3 deletions(-)
>
> diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
> index eb9ab1a..258007a 100644
> --- a/drivers/lightnvm/core.c
> +++ b/drivers/lightnvm/core.c
> @@ -501,7 +501,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
>  		int *lun_roffs;
>  		struct ppa_addr gaddr;
>  		u64 pba = le64_to_cpu(entries[i]);
> -		int off;
>  		u64 diff;
>
>  		if (!pba)
> @@ -511,8 +510,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
>  		ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
>  		lun_roffs = ch_rmap->lun_offs;
>
> -		off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun;
> -
>  		diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
>  				(lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
>
>
Thanks. Applied for 4.12.

^ permalink raw reply

* Re: [PATCH 2/3] lightnvm: fix type checks on rrpc
From: Matias Bjørling @ 2017-04-11 14:35 UTC (permalink / raw)
  To: Javier González; +Cc: linux-block, linux-kernel, Javier González
In-Reply-To: <1491920338-2348-4-git-send-email-javier@cnexlabs.com>

On 04/11/2017 04:18 PM, Javier González wrote:
> sector_t is always unsigned, therefore avoid < 0 checks on it.
>
> Signed-off-by: Javier González <javier@cnexlabs.com>
> ---
>  drivers/lightnvm/rrpc.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
> index 5dba544..cf0e28a 100644
> --- a/drivers/lightnvm/rrpc.c
> +++ b/drivers/lightnvm/rrpc.c
> @@ -817,7 +817,7 @@ static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
>
>  	for (i = 0; i < npages; i++) {
>  		/* We assume that mapping occurs at 4KB granularity */
> -		BUG_ON(!(laddr + i >= 0 && laddr + i < rrpc->nr_sects));
> +		BUG_ON(!(laddr + i < rrpc->nr_sects));
>  		gp = &rrpc->trans_map[laddr + i];
>
>  		if (gp->rblk) {
> @@ -846,7 +846,7 @@ static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd,
>  	if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd))
>  		return NVM_IO_REQUEUE;
>
> -	BUG_ON(!(laddr >= 0 && laddr < rrpc->nr_sects));
> +	BUG_ON(!(laddr < rrpc->nr_sects));
>  	gp = &rrpc->trans_map[laddr];
>
>  	if (gp->rblk) {
>
Thanks. Applied for 4.12.

^ permalink raw reply

* Re: [PATCH 3/3] lightnvm: convert sprintf into strlcpy
From: Matias Bjørling @ 2017-04-11 14:35 UTC (permalink / raw)
  To: Javier González; +Cc: linux-block, linux-kernel, Javier González
In-Reply-To: <1491920338-2348-5-git-send-email-javier@cnexlabs.com>

On 04/11/2017 04:18 PM, Javier González wrote:
> Convert sprintf calls to strlcpy in order to make possible buffer
> overflow more obvious.
>
> Signed-off-by: Javier González <javier@cnexlabs.com>
> ---
>  drivers/lightnvm/core.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
> index 258007a..2c26af3 100644
> --- a/drivers/lightnvm/core.c
> +++ b/drivers/lightnvm/core.c
> @@ -273,7 +273,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
>  		goto err_disk;
>  	blk_queue_make_request(tqueue, tt->make_rq);
>
> -	sprintf(tdisk->disk_name, "%s", create->tgtname);
> +	strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name));
>  	tdisk->flags = GENHD_FL_EXT_DEVT;
>  	tdisk->major = 0;
>  	tdisk->first_minor = 0;
> @@ -1198,13 +1198,13 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg)
>  	list_for_each_entry(dev, &nvm_devices, devices) {
>  		struct nvm_ioctl_device_info *info = &devices->info[i];
>
> -		sprintf(info->devname, "%s", dev->name);
> +		strlcpy(info->devname, dev->name, sizeof(info->devname));
>
>  		/* kept for compatibility */
>  		info->bmversion[0] = 1;
>  		info->bmversion[1] = 0;
>  		info->bmversion[2] = 0;
> -		sprintf(info->bmname, "%s", "gennvm");
> +		strlcpy(info->bmname, "gennvm", sizeof(info->bmname));
>  		i++;
>
>  		if (i > 31) {
>
Thanks. Applied for 4.12.

^ permalink raw reply

* Re: [PATCH V3 00/16] Introduce the BFQ I/O scheduler
From: Bart Van Assche @ 2017-04-11 14:37 UTC (permalink / raw)
  To: tj@kernel.org, paolo.valente@linaro.org, axboe@kernel.dk
  Cc: ulf.hansson@linaro.org, linux-kernel@vger.kernel.org,
	fchecconi@gmail.com, avanzini.arianna@gmail.com,
	linux-block@vger.kernel.org, linus.walleij@linaro.org,
	broonie@kernel.org
In-Reply-To: <20170411134315.44135-1-paolo.valente@linaro.org>

T24gVHVlLCAyMDE3LTA0LTExIGF0IDE1OjQyICswMjAwLCBQYW9sbyBWYWxlbnRlIHdyb3RlOg0K
PiBuZXcgcGF0Y2ggc2VyaWVzLCBhZGRyZXNzaW5nIChib3RoKSBpc3N1ZXMgcmFpc2VkIGJ5IEJh
cnQgWzFdLg0KDQpIZWxsbyBQYW9sbywNCg0KSXMgdGhlcmUgYSBnaXQgdHJlZSBhdmFpbGFibGUg
c29tZXdoZXJlIHdpdGggdGhlc2UgcGF0Y2hlcyBhbmQgd2l0aG91dA0KdGhlIHNpbmdsZSBxdWV1
ZSBCRlEgc2NoZWR1bGVyPw0KDQpUaGFua3MsDQoNCkJhcnQu

^ permalink raw reply

* Re: [PATCH v5] lightnvm: physical block device (pblk) target
From: Matias Bjørling @ 2017-04-11 14:41 UTC (permalink / raw)
  To: Javier González
  Cc: linux-block, linux-kernel, Bart.VanAssche, Javier González
In-Reply-To: <1491921077-9377-2-git-send-email-javier@cnexlabs.com>

On 04/11/2017 04:31 PM, Javier González wrote:
> This patch introduces pblk, a host-side translation layer for
> Open-Channel SSDs to expose them like block devices. The translation
> layer allows data placement decisions, and I/O scheduling to be
> managed by the host, enabling users to optimize the SSD for their
> specific workloads.
>
> An open-channel SSD has a set of LUNs (parallel units) and a
> collection of blocks. Each block can be read in any order, but
> writes must be sequential. Writes may also fail, and if a block
> requires it, must also be reset before new writes can be
> applied.
>
> To manage the constraints, pblk maintains a logical to
> physical address (L2P) table,  write cache, garbage
> collection logic, recovery scheme, and logic to rate-limit
> user I/Os versus garbage collection I/Os.
>
> The L2P table is fully-associative and manages sectors at a
> 4KB granularity. Pblk stores the L2P table in two places, in
> the out-of-band area of the media and on the last page of a
> line. In the cause of a power failure, pblk will perform a
> scan to recover the L2P table.
>
> The user data is organized into lines. A line is data
> striped across blocks and LUNs. The lines enable the host to
> reduce the amount of metadata to maintain besides the user
> data and makes it easier to implement RAID or erasure coding
> in the future.
>
> pblk implements multi-tenant support and can be instantiated
> multiple times on the same drive. Each instance owns a
> portion of the SSD - both regarding I/O bandwidth and
> capacity - providing I/O isolation for each case.
>
> Finally, pblk also exposes a sysfs interface that allows
> user-space to peek into the internals of pblk. The interface
> is available at /dev/block/*/pblk/ where * is the block
> device name exposed.
>
> This work also contains contributions from:
>   Matias Bjørling <matias@cnexlabs.com>
>   Simon A. F. Lund <slund@cnexlabs.com>
>   Young Tack Jin <youngtack.jin@gmail.com>
>   Huaicheng Li <huaicheng@cs.uchicago.edu>
>
> Signed-off-by: Javier González <javier@cnexlabs.com>

Awesome. Applied for 4.12.

^ permalink raw reply

* Re: [PATCH v5] lightnvn: pblk
From: Bart Van Assche @ 2017-04-11 15:19 UTC (permalink / raw)
  To: mb@lightnvm.io, jg@lightnvm.io
  Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org,
	javier@cnexlabs.com
In-Reply-To: <1491921077-9377-1-git-send-email-javier@cnexlabs.com>

On Tue, 2017-04-11 at 16:31 +0200, Javier Gonz=E1lez wrote:
> Changes since v4:
> * Rebase on top of Matias' for-4.12/core
> * Fix type implicit conversions reported by sparse (reported by Bart Van
>   Assche)
> * Make error and debug statistics long atomic variables.

Hello Javier,

Thanks for the quick respin. But have you already had a look at the
diagnostics reported by smatch? Smatch reports e.g.

drivers/lightnvm/pblk-rb.c:783: pblk_rb_tear_down_check() error: we previou=
sly assumed 'rb->entries' could be null (see line 779)

on the following code:

	if (rb->entries)
		goto out;

	for (i =3D 0; i < rb->nr_entries; i++) {
		entry =3D &rb->entries[i];

		if (entry->data)
			goto out;
	}

Is that "if (rb->entries)" check correct or should that perhaps been
"if (!rb->entries)"? Smatch is available at http://repo.or.cz/w/smatch.git.

Bart.=

^ permalink raw reply

* Re: WBT for SQ devices interface
From: Holger Hoffstätte @ 2017-04-11 15:16 UTC (permalink / raw)
  To: linux-block
In-Reply-To: <20170411093827.GC2471@quack2.suse.cz>

On Tue, 11 Apr 2017 11:38:27 +0200, Jan Kara wrote:

> when testing my fix for 0-day reports with writeback throttling I came
> across somewhat unexpected behavior with user interface of writeback
> throttling. So currently if CFQ is used as an IO scheduler, we disable
> writeback throttling because they don't go well together. However when user
> has CONFIG_BLK_WBT_SQ=y and switches IO scheduler to NOOP or DEADLINE
> writeback throttling still stays disabled. This is somewhat unexpected
> especially because the switching of IO scheduler from CFQ to something else
> can have happened behind user's back by some udev rule or so. So basically
> CONFIG_DEFAULT_CFQ=y and CONFIG_BLK_WBT_SQ=y don't make sense together.

:(

> So do people thing we should enable WBT if CONFIG_BLK_WBT_SQ=y and IO
> scheduler is switched from CFQ to something else?

Yes please. If a scheduler doesn't like WBT then that is its own problem.
Deadline with WBT has worked fine for me.

-h

^ permalink raw reply

* Re: [PATCH v4 6/6] dm rq: Avoid that request processing stalls sporadically
From: Mike Snitzer @ 2017-04-11 16:09 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: Jens Axboe, linux-block, linux-scsi, dm-devel
In-Reply-To: <20170407181654.27836-7-bart.vanassche@sandisk.com>

On Fri, Apr 07 2017 at  2:16pm -0400,
Bart Van Assche <bart.vanassche@sandisk.com> wrote:

> While running the srp-test software I noticed that request
> processing stalls sporadically at the beginning of a test, namely
> when mkfs is run against a dm-mpath device. Every time when that
> happened the following command was sufficient to resume request
> processing:
> 
>     echo run >/sys/kernel/debug/block/dm-0/state
> 
> This patch avoids that such request processing stalls occur. The
> test I ran is as follows:
> 
>     while srp-test/run_tests -d -r 30 -t 02-mq; do :; done
> 
> Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
> Cc: Mike Snitzer <snitzer@redhat.com>
> Cc: dm-devel@redhat.com
> ---
>  drivers/md/dm-rq.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
> index 6886bf160fb2..d19af1d21f4c 100644
> --- a/drivers/md/dm-rq.c
> +++ b/drivers/md/dm-rq.c
> @@ -755,6 +755,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
>  		/* Undo dm_start_request() before requeuing */
>  		rq_end_stats(md, rq);
>  		rq_completed(md, rq_data_dir(rq), false);
> +		blk_mq_delay_run_hw_queue(hctx, 100/*ms*/);
>  		return BLK_MQ_RQ_QUEUE_BUSY;
>  	}
>  
> -- 
> 2.12.0
> 

I really appreciate your hard work Bart but this looks like a cheap
hack.

I'm clearly too late to stop this from going in (given Jens got it
merged for -rc6) but: this has no place in dm-mq (or any blk-mq
driver).  If it is needed it should be elevated to blk-mq core to
trigger blk_mq_delay_run_hw_queue() when BLK_MQ_RQ_QUEUE_BUSY is
returned from blk_mq_ops' .queue_rq.

If this dm-mq specific commit is justified the case certainly is spelled
out in the commit header.

^ permalink raw reply

* Re: WBT for SQ devices interface
From: Jens Axboe @ 2017-04-11 16:22 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-block
In-Reply-To: <20170411093827.GC2471@quack2.suse.cz>

On Apr 11, 2017, at 3:38 AM, Jan Kara <jack@suse.cz> wrote:
>=20
> Hi,
>=20
> when testing my fix for 0-day reports with writeback throttling I came
> across somewhat unexpected behavior with user interface of writeback
> throttling. So currently if CFQ is used as an IO scheduler, we disable
> writeback throttling because they don't go well together. However when use=
r
> has CONFIG_BLK_WBT_SQ=3Dy and switches IO scheduler to NOOP or DEADLINE
> writeback throttling still stays disabled. This is somewhat unexpected
> especially because the switching of IO scheduler from CFQ to something els=
e
> can have happened behind user's back by some udev rule or so. So basically=

> CONFIG_DEFAULT_CFQ=3Dy and CONFIG_BLK_WBT_SQ=3Dy don't make sense together=
.
>=20
> So do people thing we should enable WBT if CONFIG_BLK_WBT_SQ=3Dy and IO
> scheduler is switched from CFQ to something else?

Yes, I think so. We should have consistent behavior in that area, regardless=
 of the starting point.

--=20
Jens Axboe

^ permalink raw reply

* Re: [PATCH v4 6/6] dm rq: Avoid that request processing stalls sporadically
From: Bart Van Assche @ 2017-04-11 16:26 UTC (permalink / raw)
  To: snitzer@redhat.com
  Cc: linux-scsi@vger.kernel.org, dm-devel@redhat.com,
	linux-block@vger.kernel.org, axboe@kernel.dk
In-Reply-To: <20170411160958.GA19222@redhat.com>

On Tue, 2017-04-11 at 12:09 -0400, Mike Snitzer wrote:
> This has no place in dm-mq (or any blk-mq
> driver).  If it is needed it should be elevated to blk-mq core to
> trigger blk_mq_delay_run_hw_queue() when BLK_MQ_RQ_QUEUE_BUSY is
> returned from blk_mq_ops' .queue_rq.

Hello Mike,

If the blk-mq core would have to figure out whether or not a queue is no
longer busy without any cooperation from the blk-mq driver all the blk-mq
core could do is to attempt to rerun that queue from time to time. But at
which intervals should the blk-mq core check whether or not a queue is stil=
l
busy? Would it be possible to choose intervals at which to check the queue
state that work well for all block drivers? Consider e.g. at the dm-mpath
driver. multipath_busy() returns true as long as path initialization is in
progress. Path initialization can take a long time. The (indirect) call to
blk_mq_run_queue() from pg_init_done() resumes request processing immediate=
ly
after path initialization has finished. Sorry but I don't think it is possi=
ble
to invent an algorithm for the blk-mq core that guarantees not only that a
queue is rerun as soon as it is no longer busy but also that avoids that
plenty of CPU cycles are wasted by the blk-mq core for checking whether a
queue is no longer busy.

Bart.=

^ permalink raw reply

* Re: bfq-mq performance comparison to cfq
From: Andreas Herrmann @ 2017-04-11 16:31 UTC (permalink / raw)
  To: Paolo Valente; +Cc: Jens Axboe, linux-block, linux-kernel
In-Reply-To: <82BCEB46-8D05-42DA-AE06-3426895A7842@linaro.org>

On Mon, Apr 10, 2017 at 11:55:43AM +0200, Paolo Valente wrote:
> 
> > Il giorno 10 apr 2017, alle ore 11:05, Andreas Herrmann <aherrmann@suse.com> ha scritto:
> > 
> > Hi Paolo,
> > 
> > I've looked at your WIP branch as of 4.11.0-bfq-mq-rc4-00155-gbce0818
> > and did some fio tests to compare the behavior to CFQ.
> > 
> > My understanding is that bfq-mq is supposed to be merged sooner or
> > later and then it will be the only reasonable I/O scheduler with
> > blk-mq for rotational devices. Hence I think it is interesting to see
> > what to expect performance-wise in comparison to CFQ which is usually
> > used for such devices with the legacy block layer.
> > 
> > I've just done simple tests iterating over number of jobs (1-8 as the
> > test system had 8 CPUs) for all (random/sequential) read/write
> > patterns. Fixed set of fio parameters used were '-size=5G
> > --group_reporting --ioengine=libaio --direct=1 --iodepth=1
> > --runtime=10'.
> > 
> > I've done 10 runs for each such configuration. The device used was an
> > older SAMSUNG HD103SJ 1TB disk, SATA attached. Results that stick out
> > the most are those for sequential reads and sequential writes:
> > 
> > * sequential reads
> >  [0] - cfq, intel_pstate driver, powersave governor
> >  [1] - bfq_mq, intel_pstate driver, powersave governor
> > 
> > jo             [0]               [1]
> > bs       mean     stddev    mean       stddev
> >  1 & 17060.300 &  77.090 & 17657.500 &  69.602
> >  2 & 15318.200 &  28.817 & 10678.000 & 279.070
> >  3 & 15403.200 &  42.762 &  9874.600 &  93.436
> >  4 & 14521.200 & 624.111 &  9918.700 & 226.425
> >  5 & 13893.900 & 144.354 &  9485.000 & 109.291
> >  6 & 13065.300 & 180.608 &  9419.800 &  75.043
> >  7 & 12169.600 &  95.422 &  9863.800 & 227.662
> >  8 & 12422.200 & 215.535 & 15335.300 & 245.764

For the sake of completeness here the corresponding results when
setting low_latency=0 for sequential reads

 [1] - bfq_mq, intel_pstate driver, powersave governor, low_latency=1 (default)
 [2] - bfq_mq, intel_pstate driver, powersave governor, low_latency=0

jo             [2]               [1]
bs       mean     stddev    mean       stddev
 1 & 17959.500 &  62.376 & 17657.500 &  69.602
 2 & 16137.200 & 696.527 & 10678.000 & 279.070
 3 & 16223.600 &  41.291 &  9874.600 &  93.436
 4 & 16012.200 &  88.924 &  9918.700 & 226.425
 5 & 15937.900 &  51.172 &  9485.000 & 109.291
 6 & 15849.300 &  54.021 &  9419.800 &  75.043
 7 & 15794.300 &  98.857 &  9863.800 & 227.662
 8 & 15494.800 & 895.513 & 15335.300 & 245.764

> > * sequential writes
> >  [0] - cfq, intel_pstate driver, powersave governor
> >  [1] - bfq_mq, intel_pstate driver, powersave governor
> > 
> > jo            [0]               [1]
> > bs      mean     stddev    mean       stddev
> >  1 & 14171.300 & 80.796 & 14392.500 & 182.587
> >  2 & 13520.000 & 88.967 &  9565.400 & 119.400
> >  3 & 13396.100 & 44.936 &  9284.000 &  25.122
> >  4 & 13139.800 & 62.325 &  8846.600 &  45.926
> >  5 & 12942.400 & 45.729 &  8568.700 &  35.852
> >  6 & 12650.600 & 41.283 &  8275.500 & 199.273
> >  7 & 12475.900 & 43.565 &  8252.200 &  33.145
> >  8 & 12307.200 & 43.594 & 13617.500 & 127.773

... and for sequential writes

 [1] - bfq_mq, intel_pstate driver, powersave governor, low_latency=1 (default)
 [2] - bfq_mq, intel_pstate driver, powersave governor, low_latency=0

jo             [2]               [1]
bs       mean     stddev    mean       stddev

 1 & 14444.800 & 248.806 & 14392.500 & 182.587
 2 & 13929.300 &  89.137 &  9565.400 & 119.400
 3 & 13875.400 &  83.084 &  9284.000 &  25.122
 4 & 13845.000 & 106.445 &  8846.600 &  45.926
 5 & 13784.800 &  66.304 &  8568.700 &  35.852
 6 & 13774.900 &  51.845 &  8275.500 & 199.273
 7 & 13741.900 &  92.647 &  8252.200 &  33.145
 8 & 13732.400 &  88.575 & 13617.500 & 127.773

> > With performance instead of powersave governor results were
> > (expectedly) higher but the pattern was the same -- bfq-mq shows a
> > "dent" for tests with 2-7 fio jobs. At the moment I have no
> > explanation for this behavior.
> > 
> 
> I have :)
> 
> BFQ, by default, is configured to privilege latency over throughput.
> In this respect, as various people and I happened to discuss a few
> times, even on these mailing lists, the only way to provide strong
> low-latency guarantees, at the moment, is through device idling.  The
> throughput loss you see is very likely to be the consequence of that
> idling.
> 
> Why does the throughput go back up at eight jobs?  Because, if many
> processes are born in a very short time interval, then BFQ understands
> that some multi-job task is being started.  And these parallel tasks
> usually prefer overall high throughput to single-process low latency.
> Then, BFQ does not idle the device for these processes.

Thanks for the explanation!

> That said, if you do always want maximum throughput, even at the
> expense of latency, then just switch off low-latency heuristics, i.e.,
> set low_latency to 0.

That helped a lot. (See above.)

> Depending on the device, setting slice_ilde to 0 may help a lot too
> (as well as with CFQ).  If the throughput is still low also after
> forcing BFQ to an only-throughput mode, then you hit some bug, and
> I'll have a little more work to do ...


Thanks,

Andreas

^ permalink raw reply

* Re: RFC: remove REQ_OP_WRITE_SAME
From: Mike Christie @ 2017-04-11 16:49 UTC (permalink / raw)
  To: Christoph Hellwig, axboe, martin.petersen, philipp.reisner,
	lars.ellenberg, target-devel
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel
In-Reply-To: <20170410160807.23674-1-hch@lst.de>

On 04/10/2017 11:07 AM, Christoph Hellwig wrote:
> Now that we are using REQ_OP_WRITE_ZEROES for all zeroing needs in the
> kernel there is very little use left for REQ_OP_WRITE_SAME.  We only
> have two callers left, and both just export optional protocol features
> to remote systems: DRBD and the target code.
> 
> Do we have any major users of those?  If not removing it will clean up
> a few warts in the block layer.

I added the iblock write same support because for RBD/ceph we were going
to pass it from rbd down to the OSD. We are using tcmu right now, so we
no longer need iblock support.

^ permalink raw reply

* Re: [PATCH V3 00/16] Introduce the BFQ I/O scheduler
From: Paolo Valente @ 2017-04-11 17:37 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: tj@kernel.org, axboe@kernel.dk, ulf.hansson@linaro.org,
	linux-kernel@vger.kernel.org, fchecconi@gmail.com,
	Arianna Avanzini, linux-block@vger.kernel.org,
	linus.walleij@linaro.org, broonie@kernel.org
In-Reply-To: <1491921434.2540.1.camel@sandisk.com>


> Il giorno 11 apr 2017, alle ore 16:37, Bart Van Assche =
<bart.vanassche@sandisk.com> ha scritto:
>=20
> On Tue, 2017-04-11 at 15:42 +0200, Paolo Valente wrote:
>> new patch series, addressing (both) issues raised by Bart [1].
>=20
> Hello Paolo,
>=20
> Is there a git tree available somewhere with these patches and without
> the single queue BFQ scheduler?
>=20

Just pushed:
https://github.com/Algodev-github/bfq-mq/tree/add-bfq-mq-logical

Thanks,
Paolo

> Thanks,
>=20
> Bart.

^ permalink raw reply

* Re: [PATCH v4 6/6] dm rq: Avoid that request processing stalls sporadically
From: Mike Snitzer @ 2017-04-11 17:47 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-scsi@vger.kernel.org, dm-devel@redhat.com,
	linux-block@vger.kernel.org, axboe@kernel.dk
In-Reply-To: <1491928005.2654.6.camel@sandisk.com>

On Tue, Apr 11 2017 at 12:26pm -0400,
Bart Van Assche <Bart.VanAssche@sandisk.com> wrote:

> On Tue, 2017-04-11 at 12:09 -0400, Mike Snitzer wrote:
> > This has no place in dm-mq (or any blk-mq
> > driver).  If it is needed it should be elevated to blk-mq core to
> > trigger blk_mq_delay_run_hw_queue() when BLK_MQ_RQ_QUEUE_BUSY is
> > returned from blk_mq_ops' .queue_rq.
> 
> Hello Mike,
> 
> If the blk-mq core would have to figure out whether or not a queue is no
> longer busy without any cooperation from the blk-mq driver all the blk-mq
> core could do is to attempt to rerun that queue from time to time. But at
> which intervals should the blk-mq core check whether or not a queue is still
> busy? Would it be possible to choose intervals at which to check the queue
> state that work well for all block drivers? Consider e.g. at the dm-mpath
> driver. multipath_busy() returns true as long as path initialization is in
> progress. Path initialization can take a long time. The (indirect) call to
> blk_mq_run_queue() from pg_init_done() resumes request processing immediately
> after path initialization has finished. Sorry but I don't think it is possible
> to invent an algorithm for the blk-mq core that guarantees not only that a
> queue is rerun as soon as it is no longer busy but also that avoids that
> plenty of CPU cycles are wasted by the blk-mq core for checking whether a
> queue is no longer busy.

Sorry but that isn't a very strong argument for what you've done.

I mean I do appreciate your point that the 2 BLK_MQ_RQ_QUEUE_BUSY
returns in dm_mq_queue_rq() are not equal but that could easily be
conveyed using a new return value.

Anyway, point is, no blk-mq driver should need to have concern about
whether their request will get resubmitted (and the associated hw queue
re-ran) if they return BLK_MQ_RQ_QUEUE_BUSY.

Your change is a means to an end but it just solves the problem in a
very hackish way.  Other drivers will very likely be caught about by
this blk-mq quirk in the future.

^ permalink raw reply

* Re: [PATCH v4 6/6] dm rq: Avoid that request processing stalls sporadically
From: Bart Van Assche @ 2017-04-11 17:51 UTC (permalink / raw)
  To: snitzer@redhat.com
  Cc: linux-scsi@vger.kernel.org, dm-devel@redhat.com,
	linux-block@vger.kernel.org, axboe@kernel.dk
In-Reply-To: <20170411174739.GA19620@redhat.com>

On Tue, 2017-04-11 at 13:47 -0400, Mike Snitzer wrote:
> Other drivers will very likely be caught about by
> this blk-mq quirk in the future.

Hello Mike,

Are you aware that the requirement that blk-mq drivers rerun the queue afte=
r
having returned BLK_MQ_RQ_QUEUE_BUSY is a requirement that is shared with
traditional block drivers? From dm_old_request_fn():

	if (... ||=A0(ti->type->busy && ti->type->busy(ti))) {
		blk_delay_queue(q, 10);
		return;
	}

Bart.=

^ permalink raw reply

* Re: [PATCH 1/3] blk-mq: unify hctx delayed_run_work and run_work
From: Bart Van Assche @ 2017-04-11 18:00 UTC (permalink / raw)
  To: linux-block@vger.kernel.org, axboe@fb.com; +Cc: hch@lst.de, osandov@fb.com
In-Reply-To: <1491839696-24783-2-git-send-email-axboe@fb.com>

On Mon, 2017-04-10 at 09:54 -0600, Jens Axboe wrote:
>  void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
>  {
> -	cancel_work(&hctx->run_work);
> +	cancel_delayed_work(&hctx->run_work);
>  	cancel_delayed_work(&hctx->delay_work);
>  	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
>  }

Hello Jens,

I would like to change the above cancel_*work() calls into cancel_*work_syn=
c()
calls because this code is used when e.g. switching between I/O schedulers =
and
no .queue_rq() calls must be ongoing while switching between schedulers. Do=
 you
want to integrate that change into this patch or do you want me to post a
separate patch? In the latter case, should I start from your for-next branc=
h
to develop that patch or from your for-next branch + this patch series?

Thanks,

Bart.=

^ permalink raw reply

* Re: [PATCH v4 6/6] dm rq: Avoid that request processing stalls sporadically
From: Mike Snitzer @ 2017-04-11 18:03 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-scsi@vger.kernel.org, dm-devel@redhat.com,
	linux-block@vger.kernel.org, axboe@kernel.dk
In-Reply-To: <1491933092.2654.10.camel@sandisk.com>

On Tue, Apr 11 2017 at  1:51pm -0400,
Bart Van Assche <Bart.VanAssche@sandisk.com> wrote:

> On Tue, 2017-04-11 at 13:47 -0400, Mike Snitzer wrote:
> > Other drivers will very likely be caught about by
> > this blk-mq quirk in the future.
> 
> Hello Mike,
> 
> Are you aware that the requirement that blk-mq drivers rerun the queue after
> having returned BLK_MQ_RQ_QUEUE_BUSY is a requirement that is shared with
> traditional block drivers? From dm_old_request_fn():
> 
> 	if (... ||ï¿½(ti->type->busy && ti->type->busy(ti))) {
> 		blk_delay_queue(q, 10);
> 		return;
> 	}

No, and pointing to DM code that does something with the old .request_fn
case to justify why blk-mq requires the same is pretty specious.

Rather than working so hard to use DM code against me, your argument
should be: "blk-mq drivers X, Y and Z rerun the hw queue; this is a well
established pattern"

I see drivers/nvme/host/fc.c:nvme_fc_start_fcp_op() does.  But that is
only one other driver out of ~20 BLK_MQ_RQ_QUEUE_BUSY returns
tree-wide.

Could be there are some others, but hardly a well-established pattern.

^ permalink raw reply

* Re: [PATCH v4 6/6] dm rq: Avoid that request processing stalls sporadically
From: Bart Van Assche @ 2017-04-11 18:18 UTC (permalink / raw)
  To: snitzer@redhat.com
  Cc: linux-scsi@vger.kernel.org, dm-devel@redhat.com,
	linux-block@vger.kernel.org, axboe@kernel.dk
In-Reply-To: <20170411180358.GA19660@redhat.com>

On Tue, 2017-04-11 at 14:03 -0400, Mike Snitzer wrote:
> Rather than working so hard to use DM code against me, your argument
> should be: "blk-mq drivers X, Y and Z rerun the hw queue; this is a well
> established pattern"
>=20
> I see drivers/nvme/host/fc.c:nvme_fc_start_fcp_op() does.  But that is
> only one other driver out of ~20 BLK_MQ_RQ_QUEUE_BUSY returns
> tree-wide.
>=20
> Could be there are some others, but hardly a well-established pattern.

Hello Mike,

Several blk-mq drivers that can return BLK_MQ_RQ_QUEUE_BUSY from their
.queue_rq() implementation stop the request queue=A0(blk_mq_stop_hw_queue()=
)
before returning "busy" and restart the queue after the busy condition has
been cleared (blk_mq_start_stopped_hw_queues()). Examples are virtio_blk an=
d
xen-blkfront. However, this approach is not appropriate for the dm-mq core
nor for the scsi core since both drivers already use the "stopped" state fo=
r
another purpose than tracking whether or not a hardware queue is busy. Henc=
e
the blk_mq_delay_run_hw_queue() and blk_mq_run_hw_queue() calls in these la=
st
two drivers to rerun a hardware queue after the busy state has been cleared=
.

Bart.=

^ permalink raw reply

* Re: [PATCH V3 00/16] Introduce the BFQ I/O scheduler
From: Bart Van Assche @ 2017-04-11 18:31 UTC (permalink / raw)
  To: paolo.valente@linaro.org
  Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org,
	fchecconi@gmail.com, linus.walleij@linaro.org, axboe@kernel.dk,
	avanzini.arianna@gmail.com, broonie@kernel.org, tj@kernel.org,
	ulf.hansson@linaro.org
In-Reply-To: <17A56264-D096-4F6E-8E66-50030A10B331@linaro.org>

On Tue, 2017-04-11 at 19:37 +0200, Paolo Valente wrote:
> Just pushed:
> https://github.com/Algodev-github/bfq-mq/tree/add-bfq-mq-logical

Thanks!

But are you aware that the code on that branch doesn't build?

$ make all
[ ... ]
ERROR: "bfq_mark_bfqq_busy" [block/bfq-wf2q.ko] undefined!
ERROR: "bfqg_stats_update_dequeue" [block/bfq-wf2q.ko] undefined!
[ ... ]

$ PAGER=3D git grep bfq_mark_bfqq_busy
block/bfq-wf2q.c: =A0=A0=A0=A0=A0=A0bfq_mark_bfqq_busy(bfqq);

Bart.=

^ permalink raw reply

* [PATCH 0/6] blk-mq debugfs patches for kernel v4.12
From: Bart Van Assche @ 2017-04-11 20:58 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Bart Van Assche

Hello Jens,

Please consider the six patches in this series for kernel v4.12. The
first patch in this series is a bug fix for code that has already
been queued for kernel v4.12. The second patch implements a change
requested by Omar. Patches 3-6 are blk-mq debugfs enhancements.

Thanks,

Bart.

Bart Van Assche (6):
  blk-mq: Do not invoke queue operations on a dead queue
  blk-mq: Move the "state" debugfs attribute one level down
  blk-mq: Make blk_flags_show() callers append a newline character
  blk-mq: Show operation, cmd_flags and rq_flags names
  blk-mq: Add blk_mq_ops.show_rq()
  scsi: Implement blk_mq_ops.show_rq()

 block/blk-mq-debugfs.c  | 99 +++++++++++++++++++++++++++++++++++++++++++------
 drivers/scsi/scsi_lib.c | 27 ++++++++++++++
 include/linux/blk-mq.h  |  6 +++
 3 files changed, 120 insertions(+), 12 deletions(-)

-- 
2.12.0

^ permalink raw reply

* [PATCH 1/6] blk-mq: Do not invoke queue operations on a dead queue
From: Bart Van Assche @ 2017-04-11 20:58 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Bart Van Assche, Omar Sandoval, Hannes Reinecke
In-Reply-To: <20170411205842.28137-1-bart.vanassche@sandisk.com>

The blk-mq debugfs attributes are removed after blk_cleanup_queue()
has finished. Since running a queue after a queue has entered the
"dead" state is not allowed, disallow this. This patch avoids that
an attempt to run a dead queue triggers a kernel crash.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Hannes Reinecke <hare@suse.com>
---
 block/blk-mq-debugfs.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index df9b688b877c..a1ce823578c7 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -111,6 +111,14 @@ static ssize_t blk_queue_flags_store(struct file *file, const char __user *ubuf,
 	struct request_queue *q = file_inode(file)->i_private;
 	char op[16] = { }, *s;
 
+	/*
+	 * The debugfs attributes are removed after blk_cleanup_queue() has
+	 * called blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set
+	 * to avoid triggering a use-after-free.
+	 */
+	if (blk_queue_dead(q))
+		return -ENOENT;
+
 	len = min(len, sizeof(op) - 1);
 	if (copy_from_user(op, ubuf, len))
 		return -EFAULT;
-- 
2.12.0

^ permalink raw reply related

* [PATCH 2/6] blk-mq: Move the "state" debugfs attribute one level down
From: Bart Van Assche @ 2017-04-11 20:58 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Bart Van Assche, Omar Sandoval, Hannes Reinecke
In-Reply-To: <20170411205842.28137-1-bart.vanassche@sandisk.com>

Move the "state" attribute from the top level to the "mq" directory
as requested by Omar.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Hannes Reinecke <hare@suse.com>
---
 block/blk-mq-debugfs.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index a1ce823578c7..564470d4af52 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -149,11 +149,6 @@ static const struct file_operations blk_queue_flags_fops = {
 	.write		= blk_queue_flags_store,
 };
 
-static const struct blk_mq_debugfs_attr blk_queue_attrs[] = {
-	{"state", 0600, &blk_queue_flags_fops},
-	{},
-};
-
 static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
 {
 	if (stat->nr_samples) {
@@ -762,6 +757,7 @@ static const struct file_operations ctx_completed_fops = {
 
 static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
 	{"poll_stat", 0400, &queue_poll_stat_fops},
+	{"state", 0600, &blk_queue_flags_fops},
 	{},
 };
 
@@ -877,9 +873,6 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q)
 	if (!q->debugfs_dir)
 		return -ENOENT;
 
-	if (!debugfs_create_files(q->debugfs_dir, q, blk_queue_attrs))
-		goto err;
-
 	q->mq_debugfs_dir = debugfs_create_dir("mq", q->debugfs_dir);
 	if (!q->mq_debugfs_dir)
 		goto err;
-- 
2.12.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox