Linux block layer

Linux block layer
 help / color / mirror / Atom feed

* [PATCH 15/25] nvme: implement REQ_OP_WRITE_ZEROES
From: Christoph Hellwig @ 2017-03-31 16:33 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

But now for the real NVMe Write Zeroes yet, just to get rid of the
discard abuse for zeroing.  Also rename the quirk flag to be a bit
more self-explanatory.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 10 +++++-----
 drivers/nvme/host/nvme.h |  6 +++---
 drivers/nvme/host/pci.c  |  6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4a6d7f408769..94b41d847b01 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -335,6 +335,8 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 	case REQ_OP_FLUSH:
 		nvme_setup_flush(ns, cmd);
 		break;
+	case REQ_OP_WRITE_ZEROES:
+		/* currently only aliased to deallocate for a few ctrls: */
 	case REQ_OP_DISCARD:
 		ret = nvme_setup_discard(ns, req, cmd);
 		break;
@@ -900,16 +902,14 @@ static void nvme_config_discard(struct nvme_ns *ns)
 	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
 			NVME_DSM_MAX_RANGES);
 
-	if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES)
-		ns->queue->limits.discard_zeroes_data = 1;
-	else
-		ns->queue->limits.discard_zeroes_data = 0;
-
 	ns->queue->limits.discard_alignment = logical_block_size;
 	ns->queue->limits.discard_granularity = logical_block_size;
 	blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
 	blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+
+	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+		blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
 }
 
 static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2aa20e3e5675..07ebc4a1c8fc 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -68,10 +68,10 @@ enum nvme_quirks {
 	NVME_QUIRK_IDENTIFY_CNS			= (1 << 1),
 
 	/*
-	 * The controller deterministically returns O's on reads to discarded
-	 * logical blocks.
+	 * The controller deterministically returns O's on reads to
+	 * logical blocks that deallocate was called on.
 	 */
-	NVME_QUIRK_DISCARD_ZEROES		= (1 << 2),
+	NVME_QUIRK_DEALLOCATE_ZEROES		= (1 << 2),
 
 	/*
 	 * The controller needs a delay before starts checking the device
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 26a5fd05fe88..0a28787267f0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2135,13 +2135,13 @@ static const struct pci_error_handlers nvme_err_handler = {
 static const struct pci_device_id nvme_id_table[] = {
 	{ PCI_VDEVICE(INTEL, 0x0953),
 		.driver_data = NVME_QUIRK_STRIPE_SIZE |
-				NVME_QUIRK_DISCARD_ZEROES, },
+				NVME_QUIRK_DEALLOCATE_ZEROES, },
 	{ PCI_VDEVICE(INTEL, 0x0a53),
 		.driver_data = NVME_QUIRK_STRIPE_SIZE |
-				NVME_QUIRK_DISCARD_ZEROES, },
+				NVME_QUIRK_DEALLOCATE_ZEROES, },
 	{ PCI_VDEVICE(INTEL, 0x0a54),
 		.driver_data = NVME_QUIRK_STRIPE_SIZE |
-				NVME_QUIRK_DISCARD_ZEROES, },
+				NVME_QUIRK_DEALLOCATE_ZEROES, },
 	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
 		.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
 	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
-- 
2.11.0

^ permalink raw reply related

* [PATCH 14/25] sd: implement unmapping Write Zeroes
From: Christoph Hellwig @ 2017-03-31 16:33 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

Try to use a write same with unmap bit variant if the device supports it
and the caller allows for it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/scsi/sd.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index d8d9c0bdd93c..001593ed0444 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -803,6 +803,15 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
 	u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
 	u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
 
+	if (!(rq->cmd_flags & REQ_NOUNMAP)) {
+		switch (sdkp->provisioning_mode) {
+		case SD_LBP_WS16:
+			return sd_setup_write_same16_cmnd(cmd, true);
+		case SD_LBP_WS10:
+			return sd_setup_write_same10_cmnd(cmd, true);
+		}
+	}
+
 	if (sdp->no_write_same)
 		return BLKPREP_INVALID;
 	if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
-- 
2.11.0

^ permalink raw reply related

* [PATCH 13/25] block_dev: use blkdev_issue_zerout for hole punches
From: Christoph Hellwig @ 2017-03-31 16:33 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

This gets us support for non-discard efficient write of zeroes (e.g. NVMe)
and prepare for removing the discard_zeroes_data flag.

Also remove a pointless discard support check, which is done in
blkdev_issue_discard already.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/block_dev.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2f704c3a816f..e405d8e58e31 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -2069,7 +2069,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 			     loff_t len)
 {
 	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-	struct request_queue *q = bdev_get_queue(bdev);
 	struct address_space *mapping;
 	loff_t end = start + len - 1;
 	loff_t isize;
@@ -2108,15 +2107,10 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 					    GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
 		break;
 	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
-		/* Only punch if the device can do zeroing discard. */
-		if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data)
-			return -EOPNOTSUPP;
-		error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
-					     GFP_KERNEL, 0);
+		error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
+					     GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
 		break;
 	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
-		if (!blk_queue_discard(q))
-			return -EOPNOTSUPP;
 		error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
 					     GFP_KERNEL, 0);
 		break;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 12/25] block: add a new BLKDEV_ZERO_NOFALLBACK flag
From: Christoph Hellwig @ 2017-03-31 16:33 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

This avoids fallbacks to explicit zeroing in (__)blkdev_issue_zeroout if
the caller doesn't want them.

Also clean up the convoluted check for the return condition that this
new flag is added to.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-lib.c        | 5 ++++-
 include/linux/blkdev.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2f6d2cb2e1a2..2f882e22890b 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -281,6 +281,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
  *
  *  If a device is using logical block provisioning, the underlying space will
  *  not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
+ *
+ *  If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return
+ *  -EOPNOTSUPP if no explicit hardware offload for zeroing is provided.
  */
 int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
@@ -298,7 +301,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 
 	ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
 			biop, flags);
-	if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+	if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
 		goto out;
 
 	ret = 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e7513ce3dbde..a5055d760661 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1351,6 +1351,7 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		struct bio **biop);
 
 #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
+#define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */
 
 extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
-- 
2.11.0

^ permalink raw reply related

* [PATCH 11/25] block: add a REQ_UNMAP flag for REQ_OP_WRITE_ZEROES
From: Christoph Hellwig @ 2017-03-31 16:32 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

If this flag is set logical provisioning capable device should
release space for the zeroed blocks if possible, if it is not set
devices should keep the blocks anchored.

Also remove an out of sync kerneldoc comment for a static function
that would have become even more out of data with this change.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-lib.c           | 19 +++++--------------
 include/linux/blk_types.h |  6 ++++++
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index f9f24ec69c27..2f6d2cb2e1a2 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -226,20 +226,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_write_same);
 
-/**
- * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES
- * @bdev:	blockdev to issue
- * @sector:	start sector
- * @nr_sects:	number of sectors to write
- * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @biop:	pointer to anchor bio
- *
- * Description:
- *  Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages.
- */
 static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
-		struct bio **biop)
+		struct bio **biop, unsigned flags)
 {
 	struct bio *bio = *biop;
 	unsigned int max_write_zeroes_sectors;
@@ -258,7 +247,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 		bio = next_bio(bio, 0, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio->bi_bdev = bdev;
-		bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0);
+		bio->bi_opf = REQ_OP_WRITE_ZEROES;
+		if (flags & BLKDEV_ZERO_NOUNMAP)
+			bio->bi_opf |= REQ_NOUNMAP;
 
 		if (nr_sects > max_write_zeroes_sectors) {
 			bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
@@ -306,7 +297,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		return -EINVAL;
 
 	ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
-			biop);
+			biop, flags);
 	if (ret == 0 || (ret && ret != -EOPNOTSUPP))
 		goto out;
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4eae30bfbfca..8eaa7dca7057 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -195,6 +195,10 @@ enum req_flag_bits {
 	__REQ_PREFLUSH,		/* request for cache flush */
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_BACKGROUND,	/* background IO */
+
+	/* command specific flags for REQ_OP_WRITE_ZEROES: */
+	__REQ_NOUNMAP,		/* do not free blocks when zeroing */
+
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -212,6 +216,8 @@ enum req_flag_bits {
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND		(1ULL << __REQ_BACKGROUND)
 
+#define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
+
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH 10/25] block: add a flags argument to (__)blkdev_issue_zeroout
From: Christoph Hellwig @ 2017-03-31 16:32 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

Turn the existin discard flag into a new BLKDEV_ZERO_UNMAP flag with
similar semantics, but without referring to diѕcard.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-lib.c                    | 31 ++++++++++++++-----------------
 block/ioctl.c                      |  2 +-
 drivers/block/drbd/drbd_receiver.c |  9 ++++++---
 drivers/nvme/target/io-cmd.c       |  2 +-
 fs/block_dev.c                     |  2 +-
 fs/dax.c                           |  2 +-
 fs/xfs/xfs_bmap_util.c             |  2 +-
 include/linux/blkdev.h             | 16 ++++++++++------
 8 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2a8d638544a7..f9f24ec69c27 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -282,14 +282,18 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
  * @nr_sects:	number of sectors to write
  * @gfp_mask:	memory allocation flags (for bio_alloc)
  * @biop:	pointer to anchor bio
- * @discard:	discard flag
+ * @flags:	controls detailed behavior
  *
  * Description:
- *  Generate and issue number of bios with zerofiled pages.
+ *  Zero-fill a block range, either using hardware offload or by explicitly
+ *  writing zeroes to the device.
+ *
+ *  If a device is using logical block provisioning, the underlying space will
+ *  not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
  */
 int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
-		bool discard)
+		unsigned flags)
 {
 	int ret;
 	int bi_size = 0;
@@ -337,28 +341,21 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
  * @sector:	start sector
  * @nr_sects:	number of sectors to write
  * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @discard:	whether to discard the block range
+ * @flags:	controls detailed behavior
  *
  * Description:
- *  Zero-fill a block range.  If the discard flag is set and the block
- *  device guarantees that subsequent READ operations to the block range
- *  in question will return zeroes, the blocks will be discarded. Should
- *  the discard request fail, if the discard flag is not set, or if
- *  discard_zeroes_data is not supported, this function will resort to
- *  zeroing the blocks manually, thus provisioning (allocating,
- *  anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME
- *  command(s), blkdev_issue_zeroout() will use it to optimize the process of
- *  clearing the block range. Otherwise the zeroing will be performed
- *  using regular WRITE calls.
+ *  Zero-fill a block range, either using hardware offload or by explicitly
+ *  writing zeroes to the device.  See __blkdev_issue_zeroout() for the
+ *  valid values for %flags.
  */
 int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			 sector_t nr_sects, gfp_t gfp_mask, bool discard)
+		sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
 {
 	int ret;
 	struct bio *bio = NULL;
 	struct blk_plug plug;
 
-	if (discard) {
+	if (!(flags & BLKDEV_ZERO_NOUNMAP)) {
 		if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
 				BLKDEV_DISCARD_ZERO))
 			return 0;
@@ -366,7 +363,7 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 
 	blk_start_plug(&plug);
 	ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
-			&bio, discard);
+			&bio, flags);
 	if (ret == 0 && bio) {
 		ret = submit_bio_wait(bio);
 		bio_put(bio);
diff --git a/block/ioctl.c b/block/ioctl.c
index 7b88820b93d9..8ea00a41be01 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -255,7 +255,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
 	truncate_inode_pages_range(mapping, start, end);
 
 	return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
-				    false);
+			BLKDEV_ZERO_NOUNMAP);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index aa6bf9692eff..dc9a6dcd431c 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1499,19 +1499,22 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u
 		tmp = start + granularity - sector_div(tmp, granularity);
 
 		nr = tmp - start;
-		err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
+		err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO,
+				BLKDEV_ZERO_NOUNMAP);
 		nr_sectors -= nr;
 		start = tmp;
 	}
 	while (nr_sectors >= granularity) {
 		nr = min_t(sector_t, nr_sectors, max_discard_sectors);
-		err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
+		err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO,
+				BLKDEV_ZERO_NOUNMAP);
 		nr_sectors -= nr;
 		start += nr;
 	}
  zero_out:
 	if (nr_sectors) {
-		err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0);
+		err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO,
+				BLKDEV_ZERO_NOUNMAP);
 	}
 	return err != 0;
 }
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 4195115c7e54..7206f9cda575 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -184,7 +184,7 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)
 		(req->ns->blksize_shift - 9)) + 1;
 
 	if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
-				GFP_KERNEL, &bio, true))
+				GFP_KERNEL, &bio, 0))
 		status = NVME_SC_INTERNAL | NVME_SC_DNR;
 
 	if (bio) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f2d59f143ef4..2f704c3a816f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -2105,7 +2105,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 	case FALLOC_FL_ZERO_RANGE:
 	case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
 		error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
-					    GFP_KERNEL, false);
+					    GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
 		break;
 	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
 		/* Only punch if the device can do zeroing discard. */
diff --git a/fs/dax.c b/fs/dax.c
index de622d4282a6..2bfbcd726047 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -982,7 +982,7 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 		sector_t start_sector = dax.sector + (offset >> 9);
 
 		return blkdev_issue_zeroout(bdev, start_sector,
-				length >> 9, GFP_NOFS, true);
+				length >> 9, GFP_NOFS, 0);
 	} else {
 		if (dax_map_atomic(bdev, &dax) < 0)
 			return PTR_ERR(dax.addr);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 8b75dcea5966..142bbbe06114 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -81,7 +81,7 @@ xfs_zero_extent(
 	return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
 		block << (mp->m_super->s_blocksize_bits - 9),
 		count_fsb << (mp->m_super->s_blocksize_bits - 9),
-		GFP_NOFS, true);
+		GFP_NOFS, 0);
 }
 
 int
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a2dc6b390d48..e7513ce3dbde 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1337,23 +1337,27 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 	return bqt->tag_index[tag];
 }
 
+extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
+extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 
 #define BLKDEV_DISCARD_SECURE	(1 << 0)	/* issue a secure erase */
 #define BLKDEV_DISCARD_ZERO	(1 << 1)	/* must reliably zero data */
 
-extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, int flags,
 		struct bio **biop);
-extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
+
+#define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
+
 extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
-		bool discard);
+		unsigned flags);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, bool discard);
+		sector_t nr_sects, gfp_t gfp_mask, unsigned flags);
+
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
 		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
@@ -1367,7 +1371,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
 	return blkdev_issue_zeroout(sb->s_bdev,
 				    block << (sb->s_blocksize_bits - 9),
 				    nr_blocks << (sb->s_blocksize_bits - 9),
-				    gfp_mask, true);
+				    gfp_mask, 0);
 }
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
-- 
2.11.0

^ permalink raw reply related

* [PATCH 09/25] block: stop using blkdev_issue_write_same for zeroing
From: Christoph Hellwig @ 2017-03-31 16:32 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

We'll always use the WRITE ZEROES code for zeroing now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-lib.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index e5b853f2b8a2..2a8d638544a7 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -364,10 +364,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 			return 0;
 	}
 
-	if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
-			ZERO_PAGE(0)))
-		return 0;
-
 	blk_start_plug(&plug);
 	ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
 			&bio, discard);
-- 
2.11.0

^ permalink raw reply related

* [PATCH 08/25] dm kcopyd: switch to use REQ_OP_WRITE_ZEROES
From: Christoph Hellwig @ 2017-03-31 16:32 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

It seems like the code currently passes whatever it was using for writes
to WRITE SAME.  Just switch it to WRITE ZEROES, although that doesn't
need any payload.

Untested, and confused by the code, maybe someone who understands it
better than me can help..

Not-yet-signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/dm-kcopyd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 9e9d04cb7d51..f85846741d50 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -733,11 +733,11 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 		job->pages = &zero_page_list;
 
 		/*
-		 * Use WRITE SAME to optimize zeroing if all dests support it.
+		 * Use WRITE ZEROES to optimize zeroing if all dests support it.
 		 */
-		job->rw = REQ_OP_WRITE_SAME;
+		job->rw = REQ_OP_WRITE_ZEROES;
 		for (i = 0; i < job->num_dests; i++)
-			if (!bdev_write_same(job->dests[i].bdev)) {
+			if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) {
 				job->rw = WRITE;
 				break;
 			}
-- 
2.11.0

^ permalink raw reply related

* [PATCH 07/25] dm: support REQ_OP_WRITE_ZEROES
From: Christoph Hellwig @ 2017-03-31 16:32 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

Copy & paste from the REQ_OP_WRITE_SAME code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/dm-core.h          |  1 +
 drivers/md/dm-io.c            |  8 ++++++--
 drivers/md/dm-linear.c        |  1 +
 drivers/md/dm-mpath.c         |  1 +
 drivers/md/dm-rq.c            | 11 ++++++++---
 drivers/md/dm-stripe.c        |  2 ++
 drivers/md/dm-table.c         | 30 ++++++++++++++++++++++++++++++
 drivers/md/dm.c               | 31 ++++++++++++++++++++++++++++---
 include/linux/device-mapper.h |  6 ++++++
 9 files changed, 83 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 136fda3ff9e5..fea5bd52ada8 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -132,6 +132,7 @@ void dm_init_md_queue(struct mapped_device *md);
 void dm_init_normal_md_queue(struct mapped_device *md);
 int md_in_flight(struct mapped_device *md);
 void disable_write_same(struct mapped_device *md);
+void disable_write_zeroes(struct mapped_device *md);
 
 static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
 {
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index b808cbe22678..3702e502466d 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -312,9 +312,12 @@ static void do_region(int op, int op_flags, unsigned region,
 	 */
 	if (op == REQ_OP_DISCARD)
 		special_cmd_max_sectors = q->limits.max_discard_sectors;
+	else if (op == REQ_OP_WRITE_ZEROES)
+		special_cmd_max_sectors = q->limits.max_write_zeroes_sectors;
 	else if (op == REQ_OP_WRITE_SAME)
 		special_cmd_max_sectors = q->limits.max_write_same_sectors;
-	if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_SAME) &&
+	if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES ||
+	     op == REQ_OP_WRITE_SAME)  &&
 	    special_cmd_max_sectors == 0) {
 		dec_count(io, region, -EOPNOTSUPP);
 		return;
@@ -330,6 +333,7 @@ static void do_region(int op, int op_flags, unsigned region,
 		 */
 		switch (op) {
 		case REQ_OP_DISCARD:
+		case REQ_OP_WRITE_ZEROES:
 			num_bvecs = 0;
 			break;
 		case REQ_OP_WRITE_SAME:
@@ -347,7 +351,7 @@ static void do_region(int op, int op_flags, unsigned region,
 		bio_set_op_attrs(bio, op, op_flags);
 		store_io_and_region_in_bio(bio, io, region);
 
-		if (op == REQ_OP_DISCARD) {
+		if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) {
 			num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
 			bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
 			remaining -= num_sectors;
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 4788b0b989a9..e17fd44ceef5 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -59,6 +59,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
+	ti->num_write_zeroes_bios = 1;
 	ti->private = lc;
 	return 0;
 
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 7f223dbed49f..ab55955ed704 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1103,6 +1103,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
+	ti->num_write_zeroes_bios = 1;
 	if (m->queue_mode == DM_TYPE_BIO_BASED)
 		ti->per_io_data_size = multipath_per_bio_data_size();
 	else
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 28955b94d2b2..6006d5d4b06e 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -298,9 +298,14 @@ static void dm_done(struct request *clone, int error, bool mapped)
 			r = rq_end_io(tio->ti, clone, error, &tio->info);
 	}
 
-	if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
-		     !clone->q->limits.max_write_same_sectors))
-		disable_write_same(tio->md);
+	if (unlikely(r == -EREMOTEIO)) {
+		if (req_op(clone) == REQ_OP_WRITE_SAME &&
+		    !clone->q->limits.max_write_same_sectors)
+			disable_write_same(tio->md);
+		if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
+		    !clone->q->limits.max_write_zeroes_sectors)
+			disable_write_zeroes(tio->md);
+	}
 
 	if (r <= 0)
 		/* The target wants to complete the I/O */
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 28193a57bf47..5ef49c121d99 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -169,6 +169,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->num_flush_bios = stripes;
 	ti->num_discard_bios = stripes;
 	ti->num_write_same_bios = stripes;
+	ti->num_write_zeroes_bios = stripes;
 
 	sc->chunk_size = chunk_size;
 	if (chunk_size & (chunk_size - 1))
@@ -293,6 +294,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_REMAPPED;
 	}
 	if (unlikely(bio_op(bio) == REQ_OP_DISCARD) ||
+	    unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) ||
 	    unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) {
 		target_bio_nr = dm_bio_get_target_bio_nr(bio);
 		BUG_ON(target_bio_nr >= sc->stripes);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3ad16d9c9d5a..5cd665c91ead 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1533,6 +1533,34 @@ static bool dm_table_supports_write_same(struct dm_table *t)
 	return true;
 }
 
+static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
+					   sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && !q->limits.max_write_zeroes_sectors;
+}
+
+static bool dm_table_supports_write_zeroes(struct dm_table *t)
+{
+	struct dm_target *ti;
+	unsigned i = 0;
+
+	while (i < dm_table_get_num_targets(t)) {
+		ti = dm_table_get_target(t, i++);
+
+		if (!ti->num_write_zeroes_bios)
+			return false;
+
+		if (!ti->type->iterate_devices ||
+		    ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL))
+			return false;
+	}
+
+	return true;
+}
+
+
 static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
 				  sector_t start, sector_t len, void *data)
 {
@@ -1603,6 +1631,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 
 	if (!dm_table_supports_write_same(t))
 		q->limits.max_write_same_sectors = 0;
+	if (!dm_table_supports_write_zeroes(t))
+		q->limits.max_write_zeroes_sectors = 0;
 
 	if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
 		queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dfb75979e455..e8226359c8f7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -825,6 +825,14 @@ void disable_write_same(struct mapped_device *md)
 	limits->max_write_same_sectors = 0;
 }
 
+void disable_write_zeroes(struct mapped_device *md)
+{
+	struct queue_limits *limits = dm_get_queue_limits(md);
+
+	/* device doesn't really support WRITE ZEROES, disable it */
+	limits->max_write_zeroes_sectors = 0;
+}
+
 static void clone_endio(struct bio *bio)
 {
 	int error = bio->bi_error;
@@ -851,9 +859,14 @@ static void clone_endio(struct bio *bio)
 		}
 	}
 
-	if (unlikely(r == -EREMOTEIO && (bio_op(bio) == REQ_OP_WRITE_SAME) &&
-		     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
-		disable_write_same(md);
+	if (unlikely(r == -EREMOTEIO)) {
+		if (bio_op(bio) == REQ_OP_WRITE_SAME &&
+		    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+			disable_write_same(md);
+		if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+		    !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+			disable_write_zeroes(md);
+	}
 
 	free_tio(tio);
 	dec_pending(io, error);
@@ -1202,6 +1215,11 @@ static unsigned get_num_write_same_bios(struct dm_target *ti)
 	return ti->num_write_same_bios;
 }
 
+static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
+{
+	return ti->num_write_zeroes_bios;
+}
+
 typedef bool (*is_split_required_fn)(struct dm_target *ti);
 
 static bool is_split_required_for_discard(struct dm_target *ti)
@@ -1256,6 +1274,11 @@ static int __send_write_same(struct clone_info *ci)
 	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
 }
 
+static int __send_write_zeroes(struct clone_info *ci)
+{
+	return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
+}
+
 /*
  * Select the correct strategy for processing a non-flush bio.
  */
@@ -1270,6 +1293,8 @@ static int __split_and_process_non_flush(struct clone_info *ci)
 		return __send_discard(ci);
 	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
 		return __send_write_same(ci);
+	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
+		return __send_write_zeroes(ci);
 
 	ti = dm_table_find_target(ci->map, ci->sector);
 	if (!dm_target_is_valid(ti))
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index a7e6903866fd..3829bee2302a 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -255,6 +255,12 @@ struct dm_target {
 	unsigned num_write_same_bios;
 
 	/*
+	 * The number of WRITE ZEROES bios that will be submitted to the target.
+	 * The bio number can be accessed with dm_bio_get_target_bio_nr.
+	 */
+	unsigned num_write_zeroes_bios;
+
+	/*
 	 * The minimum number of extra bytes allocated in each io for the
 	 * target to use.
 	 */
-- 
2.11.0

^ permalink raw reply related

* [PATCH 06/25] dm io: discards don't take a payload
From: Christoph Hellwig @ 2017-03-31 16:32 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

Fix up do_region to not allocate a bio_vec for discards.  We've
got rid of the discard payload allocated by the caller years ago.

Obviously this wasn't actually harmful given how long it's been
there, but it's still good to avoid the pointless allocation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/dm-io.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 03940bf36f6c..b808cbe22678 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -328,11 +328,17 @@ static void do_region(int op, int op_flags, unsigned region,
 		/*
 		 * Allocate a suitably sized-bio.
 		 */
-		if ((op == REQ_OP_DISCARD) || (op == REQ_OP_WRITE_SAME))
+		switch (op) {
+		case REQ_OP_DISCARD:
+			num_bvecs = 0;
+			break;
+		case REQ_OP_WRITE_SAME:
 			num_bvecs = 1;
-		else
+			break;
+		default:
 			num_bvecs = min_t(int, BIO_MAX_PAGES,
 					  dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
+		}
 
 		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
 		bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
-- 
2.11.0

^ permalink raw reply related

* [PATCH 05/25] md: support REQ_OP_WRITE_ZEROES
From: Christoph Hellwig @ 2017-03-31 16:32 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

Copy & paste from the REQ_OP_WRITE_SAME code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/linear.c    | 1 +
 drivers/md/md.h        | 7 +++++++
 drivers/md/multipath.c | 1 +
 drivers/md/raid0.c     | 2 ++
 drivers/md/raid1.c     | 4 +++-
 drivers/md/raid10.c    | 1 +
 drivers/md/raid5.c     | 1 +
 7 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 3e38e0207a3e..377a8a3672e3 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -293,6 +293,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 						      split, disk_devt(mddev->gendisk),
 						      bio_sector);
 			mddev_check_writesame(mddev, split);
+			mddev_check_write_zeroes(mddev, split);
 			generic_make_request(split);
 		}
 	} while (split != bio);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index dde8ecb760c8..1e76d64ce180 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -709,4 +709,11 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
 	    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
 		mddev->queue->limits.max_write_same_sectors = 0;
 }
+
+static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio)
+{
+	if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+	    !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+		mddev->queue->limits.max_write_zeroes_sectors = 0;
+}
 #endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 79a12b59250b..e95d521d93e9 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -139,6 +139,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 	mp_bh->bio.bi_end_io = multipath_end_request;
 	mp_bh->bio.bi_private = mp_bh;
 	mddev_check_writesame(mddev, &mp_bh->bio);
+	mddev_check_write_zeroes(mddev, &mp_bh->bio);
 	generic_make_request(&mp_bh->bio);
 	return;
 }
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 93347ca7c7a6..ce7a6a56cf73 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -383,6 +383,7 @@ static int raid0_run(struct mddev *mddev)
 
 		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
+		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
 
 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
@@ -504,6 +505,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 						      split, disk_devt(mddev->gendisk),
 						      bio_sector);
 			mddev_check_writesame(mddev, split);
+			mddev_check_write_zeroes(mddev, split);
 			generic_make_request(split);
 		}
 	} while (split != bio);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a34f58772022..b59cc100320a 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3177,8 +3177,10 @@ static int raid1_run(struct mddev *mddev)
 	if (IS_ERR(conf))
 		return PTR_ERR(conf);
 
-	if (mddev->queue)
+	if (mddev->queue) {
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
+		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
+	}
 
 	rdev_for_each(rdev, mddev) {
 		if (!mddev->gendisk)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e89a8d78a9ed..28ec3a93acee 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3749,6 +3749,7 @@ static int raid10_run(struct mddev *mddev)
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
+		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 		blk_queue_io_min(mddev->queue, chunk_size);
 		if (conf->geo.raid_disks % conf->geo.near_copies)
 			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ed5cd705b985..8cf1f86dcd05 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7272,6 +7272,7 @@ static int raid5_run(struct mddev *mddev)
 		mddev->queue->limits.discard_zeroes_data = 0;
 
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
+		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 
 		rdev_for_each(rdev, mddev) {
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
-- 
2.11.0

^ permalink raw reply related

* [PATCH 04/25] sd: implement REQ_OP_WRITE_ZEROES
From: Christoph Hellwig @ 2017-03-31 16:32 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/scsi/sd.c     | 31 ++++++++++++++++++++++++++-----
 drivers/scsi/sd_zbc.c |  1 +
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index b853f91fb3da..d8d9c0bdd93c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -735,7 +735,7 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
 	return scsi_init_io(cmd);
 }
 
-static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd)
+static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
 {
 	struct scsi_device *sdp = cmd->device;
 	struct request *rq = cmd->request;
@@ -752,13 +752,14 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd)
 
 	cmd->cmd_len = 16;
 	cmd->cmnd[0] = WRITE_SAME_16;
-	cmd->cmnd[1] = 0x8; /* UNMAP */
+	if (unmap)
+		cmd->cmnd[1] = 0x8; /* UNMAP */
 	put_unaligned_be64(sector, &cmd->cmnd[2]);
 	put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
 
 	cmd->allowed = SD_MAX_RETRIES;
 	cmd->transfersize = data_len;
-	rq->timeout = SD_TIMEOUT;
+	rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
 	scsi_req(rq)->resid_len = data_len;
 
 	return scsi_init_io(cmd);
@@ -788,12 +789,27 @@ static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
 
 	cmd->allowed = SD_MAX_RETRIES;
 	cmd->transfersize = data_len;
-	rq->timeout = SD_TIMEOUT;
+	rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
 	scsi_req(rq)->resid_len = data_len;
 
 	return scsi_init_io(cmd);
 }
 
+static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
+{
+	struct request *rq = cmd->request;
+	struct scsi_device *sdp = cmd->device;
+	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+	u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+	u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+
+	if (sdp->no_write_same)
+		return BLKPREP_INVALID;
+	if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
+		return sd_setup_write_same16_cmnd(cmd, false);
+	return sd_setup_write_same10_cmnd(cmd, false);
+}
+
 static void sd_config_write_same(struct scsi_disk *sdkp)
 {
 	struct request_queue *q = sdkp->disk->queue;
@@ -823,6 +839,8 @@ static void sd_config_write_same(struct scsi_disk *sdkp)
 out:
 	blk_queue_max_write_same_sectors(q, sdkp->max_ws_blocks *
 					 (logical_block_size >> 9));
+	blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks *
+					 (logical_block_size >> 9));
 }
 
 /**
@@ -1163,7 +1181,7 @@ static int sd_init_command(struct scsi_cmnd *cmd)
 		case SD_LBP_UNMAP:
 			return sd_setup_unmap_cmnd(cmd);
 		case SD_LBP_WS16:
-			return sd_setup_write_same16_cmnd(cmd);
+			return sd_setup_write_same16_cmnd(cmd, true);
 		case SD_LBP_WS10:
 			return sd_setup_write_same10_cmnd(cmd, true);
 		case SD_LBP_ZERO:
@@ -1171,6 +1189,8 @@ static int sd_init_command(struct scsi_cmnd *cmd)
 		default:
 			return BLKPREP_INVALID;
 		}
+	case REQ_OP_WRITE_ZEROES:
+		return sd_setup_write_zeroes_cmnd(cmd);
 	case REQ_OP_WRITE_SAME:
 		return sd_setup_write_same_cmnd(cmd);
 	case REQ_OP_FLUSH:
@@ -1810,6 +1830,7 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 
 	switch (req_op(req)) {
 	case REQ_OP_DISCARD:
+	case REQ_OP_WRITE_ZEROES:
 	case REQ_OP_WRITE_SAME:
 	case REQ_OP_ZONE_RESET:
 		if (!result) {
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 92620c8ea8ad..1994f7799fce 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -329,6 +329,7 @@ void sd_zbc_complete(struct scsi_cmnd *cmd,
 
 	switch (req_op(rq)) {
 	case REQ_OP_WRITE:
+	case REQ_OP_WRITE_ZEROES:
 	case REQ_OP_WRITE_SAME:
 	case REQ_OP_ZONE_RESET:
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH 03/25] block: implement splitting of REQ_OP_WRITE_ZEROES bios
From: Christoph Hellwig @ 2017-03-31 16:32 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

Copy and past the REQ_OP_WRITE_SAME code to prepare to implementations
that limit the write zeroes size.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-merge.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2afa262425d1..3990ae406341 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -54,6 +54,20 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
 	return bio_split(bio, split_sectors, GFP_NOIO, bs);
 }
 
+static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
+		struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+{
+	*nsegs = 1;
+
+	if (!q->limits.max_write_zeroes_sectors)
+		return NULL;
+
+	if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
+		return NULL;
+
+	return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
+}
+
 static struct bio *blk_bio_write_same_split(struct request_queue *q,
 					    struct bio *bio,
 					    struct bio_set *bs,
@@ -200,8 +214,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
 		split = blk_bio_discard_split(q, *bio, bs, &nsegs);
 		break;
 	case REQ_OP_WRITE_ZEROES:
-		split = NULL;
-		nsegs = (*bio)->bi_phys_segments;
+		split = blk_bio_write_zeroes_split(q, *bio, bs, &nsegs);
 		break;
 	case REQ_OP_WRITE_SAME:
 		split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
-- 
2.11.0

^ permalink raw reply related

* [PATCH 02/25] block: renumber REQ_OP_WRITE_ZEROES
From: Christoph Hellwig @ 2017-03-31 16:32 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

Make life easy for implementations that needs to send a data buffer
to the device (e.g. SCSI) by numbering it as a data out command.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/blk_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 67bcf8a5326e..4eae30bfbfca 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -168,7 +168,7 @@ enum req_opf {
 	/* write the same sector many times */
 	REQ_OP_WRITE_SAME	= 7,
 	/* write the zero filled sector many times */
-	REQ_OP_WRITE_ZEROES	= 8,
+	REQ_OP_WRITE_ZEROES	= 9,
 
 	/* SCSI passthrough using struct scsi_request */
 	REQ_OP_SCSI_IN		= 32,
-- 
2.11.0

^ permalink raw reply related

* [PATCH 01/25] ѕd: split sd_setup_discard_cmnd
From: Christoph Hellwig @ 2017-03-31 16:32 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170331163313.31821-1-hch@lst.de>

Split sd_setup_discard_cmnd into one function per provisioning type.  While
this creates some very slight duplication of boilerplate code it keeps the
code modular for additions of new provisioning types, and for reusing the
write same functions for the upcoming scsi implementation of the Write Zeroes
operation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/scsi/sd.c | 153 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 84 insertions(+), 69 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index fcfeddc79331..b853f91fb3da 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -701,93 +701,97 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 }
 
-/**
- * sd_setup_discard_cmnd - unmap blocks on thinly provisioned device
- * @sdp: scsi device to operate on
- * @rq: Request to prepare
- *
- * Will issue either UNMAP or WRITE SAME(16) depending on preference
- * indicated by target device.
- **/
-static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
+static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
 {
-	struct request *rq = cmd->request;
 	struct scsi_device *sdp = cmd->device;
-	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
-	sector_t sector = blk_rq_pos(rq);
-	unsigned int nr_sectors = blk_rq_sectors(rq);
-	unsigned int len;
-	int ret;
+	struct request *rq = cmd->request;
+	u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+	u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+	unsigned int data_len = 24;
 	char *buf;
-	struct page *page;
 
-	sector >>= ilog2(sdp->sector_size) - 9;
-	nr_sectors >>= ilog2(sdp->sector_size) - 9;
-
-	page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
-	if (!page)
+	rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+	if (!rq->special_vec.bv_page)
 		return BLKPREP_DEFER;
+	rq->special_vec.bv_offset = 0;
+	rq->special_vec.bv_len = data_len;
+	rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
 
-	switch (sdkp->provisioning_mode) {
-	case SD_LBP_UNMAP:
-		buf = page_address(page);
-
-		cmd->cmd_len = 10;
-		cmd->cmnd[0] = UNMAP;
-		cmd->cmnd[8] = 24;
-
-		put_unaligned_be16(6 + 16, &buf[0]);
-		put_unaligned_be16(16, &buf[2]);
-		put_unaligned_be64(sector, &buf[8]);
-		put_unaligned_be32(nr_sectors, &buf[16]);
+	cmd->cmd_len = 10;
+	cmd->cmnd[0] = UNMAP;
+	cmd->cmnd[8] = 24;
 
-		len = 24;
-		break;
+	buf = page_address(rq->special_vec.bv_page);
+	put_unaligned_be16(6 + 16, &buf[0]);
+	put_unaligned_be16(16, &buf[2]);
+	put_unaligned_be64(sector, &buf[8]);
+	put_unaligned_be32(nr_sectors, &buf[16]);
 
-	case SD_LBP_WS16:
-		cmd->cmd_len = 16;
-		cmd->cmnd[0] = WRITE_SAME_16;
-		cmd->cmnd[1] = 0x8; /* UNMAP */
-		put_unaligned_be64(sector, &cmd->cmnd[2]);
-		put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
+	cmd->allowed = SD_MAX_RETRIES;
+	cmd->transfersize = data_len;
+	rq->timeout = SD_TIMEOUT;
+	scsi_req(rq)->resid_len = data_len;
 
-		len = sdkp->device->sector_size;
-		break;
+	return scsi_init_io(cmd);
+}
 
-	case SD_LBP_WS10:
-	case SD_LBP_ZERO:
-		cmd->cmd_len = 10;
-		cmd->cmnd[0] = WRITE_SAME;
-		if (sdkp->provisioning_mode == SD_LBP_WS10)
-			cmd->cmnd[1] = 0x8; /* UNMAP */
-		put_unaligned_be32(sector, &cmd->cmnd[2]);
-		put_unaligned_be16(nr_sectors, &cmd->cmnd[7]);
+static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd)
+{
+	struct scsi_device *sdp = cmd->device;
+	struct request *rq = cmd->request;
+	u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+	u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+	u32 data_len = sdp->sector_size;
 
-		len = sdkp->device->sector_size;
-		break;
+	rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+	if (!rq->special_vec.bv_page)
+		return BLKPREP_DEFER;
+	rq->special_vec.bv_offset = 0;
+	rq->special_vec.bv_len = data_len;
+	rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
 
-	default:
-		ret = BLKPREP_INVALID;
-		goto out;
-	}
+	cmd->cmd_len = 16;
+	cmd->cmnd[0] = WRITE_SAME_16;
+	cmd->cmnd[1] = 0x8; /* UNMAP */
+	put_unaligned_be64(sector, &cmd->cmnd[2]);
+	put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
 
+	cmd->allowed = SD_MAX_RETRIES;
+	cmd->transfersize = data_len;
 	rq->timeout = SD_TIMEOUT;
+	scsi_req(rq)->resid_len = data_len;
 
-	cmd->transfersize = len;
-	cmd->allowed = SD_MAX_RETRIES;
+	return scsi_init_io(cmd);
+}
 
-	rq->special_vec.bv_page = page;
-	rq->special_vec.bv_offset = 0;
-	rq->special_vec.bv_len = len;
+static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
+{
+	struct scsi_device *sdp = cmd->device;
+	struct request *rq = cmd->request;
+	u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+	u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+	u32 data_len = sdp->sector_size;
 
+	rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+	if (!rq->special_vec.bv_page)
+		return BLKPREP_DEFER;
+	rq->special_vec.bv_offset = 0;
+	rq->special_vec.bv_len = data_len;
 	rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
-	scsi_req(rq)->resid_len = len;
 
-	ret = scsi_init_io(cmd);
-out:
-	if (ret != BLKPREP_OK)
-		__free_page(page);
-	return ret;
+	cmd->cmd_len = 10;
+	cmd->cmnd[0] = WRITE_SAME;
+	if (unmap)
+		cmd->cmnd[1] = 0x8; /* UNMAP */
+	put_unaligned_be32(sector, &cmd->cmnd[2]);
+	put_unaligned_be16(nr_sectors, &cmd->cmnd[7]);
+
+	cmd->allowed = SD_MAX_RETRIES;
+	cmd->transfersize = data_len;
+	rq->timeout = SD_TIMEOUT;
+	scsi_req(rq)->resid_len = data_len;
+
+	return scsi_init_io(cmd);
 }
 
 static void sd_config_write_same(struct scsi_disk *sdkp)
@@ -1155,7 +1159,18 @@ static int sd_init_command(struct scsi_cmnd *cmd)
 
 	switch (req_op(rq)) {
 	case REQ_OP_DISCARD:
-		return sd_setup_discard_cmnd(cmd);
+		switch (scsi_disk(rq->rq_disk)->provisioning_mode) {
+		case SD_LBP_UNMAP:
+			return sd_setup_unmap_cmnd(cmd);
+		case SD_LBP_WS16:
+			return sd_setup_write_same16_cmnd(cmd);
+		case SD_LBP_WS10:
+			return sd_setup_write_same10_cmnd(cmd, true);
+		case SD_LBP_ZERO:
+			return sd_setup_write_same10_cmnd(cmd, false);
+		default:
+			return BLKPREP_INVALID;
+		}
 	case REQ_OP_WRITE_SAME:
 		return sd_setup_write_same_cmnd(cmd);
 	case REQ_OP_FLUSH:
-- 
2.11.0

^ permalink raw reply related

* always use REQ_OP_WRITE_ZEROES for zeroing offload
From: Christoph Hellwig @ 2017-03-31 16:32 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid

This series makes REQ_OP_WRITE_ZEROES the only zeroing offload
supported by the block layer, and switches existing implementations
of REQ_OP_DISCARD that correctly set discard_zeroes_data to it,
removes incorrect discard_zeroes_data, and also switches WRITE SAME
based zeroing in SCSI to this new method.

The series is against the block for-next tree.

A git tree is also avaiable at:

    git://git.infradead.org/users/hch/block.git discard-rework

Gitweb:

    http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/discard-rework

^ permalink raw reply

* Re: [PATCH V2 04/16] block, bfq: modify the peak-rate estimator
From: Bart Van Assche @ 2017-03-31 15:31 UTC (permalink / raw)
  To: tj@kernel.org, paolo.valente@linaro.org, axboe@kernel.dk
  Cc: ulf.hansson@linaro.org, linux-kernel@vger.kernel.org,
	fchecconi@gmail.com, avanzini.arianna@gmail.com,
	linux-block@vger.kernel.org, linus.walleij@linaro.org,
	broonie@kernel.org
In-Reply-To: <20170331124743.3530-5-paolo.valente@linaro.org>

On Fri, 2017-03-31 at 14:47 +0200, Paolo Valente wrote:
> -static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue=
 *bfqq,
> -=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=
=A0=A0=A0=A0=A0=A0=A0 bool compensate)
> +static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bf=
qq,
> +=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=
=A0=A0=A0=A0=A0=A0=A0 bool compensate, enum bfqq_expiration reason,
> +=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=
=A0=A0=A0=A0=A0=A0=A0 unsigned long *delta_ms)
> =A0{
> -=A0=A0=A0=A0=A0=A0=A0u64 bw, usecs, expected, timeout;
> -=A0=A0=A0=A0=A0=A0=A0ktime_t delta;
> -=A0=A0=A0=A0=A0=A0=A0int update =3D 0;
> +=A0=A0=A0=A0=A0=A0=A0ktime_t delta_ktime;
> +=A0=A0=A0=A0=A0=A0=A0u32 delta_usecs;
> +=A0=A0=A0=A0=A0=A0=A0bool slow =3D BFQQ_SEEKY(bfqq); /* if delta too sho=
rt, use seekyness */
> =A0
> -=A0=A0=A0=A0=A0=A0=A0if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfq=
q))
> +=A0=A0=A0=A0=A0=A0=A0if (!bfq_bfqq_sync(bfqq))
> =A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0return false;
> =A0
> =A0=A0=A0=A0=A0=A0=A0=A0if (compensate)
> -=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0delta =3D bfqd->last_idling=
_start;
> +=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0delta_ktime =3D bfqd->last_=
idling_start;
> =A0=A0=A0=A0=A0=A0=A0=A0else
> -=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0delta =3D ktime_get();
> -=A0=A0=A0=A0=A0=A0=A0delta =3D ktime_sub(delta, bfqd->last_budget_start)=
;
> -=A0=A0=A0=A0=A0=A0=A0usecs =3D ktime_to_us(delta);
> -
> -=A0=A0=A0=A0=A0=A0=A0/* Don't trust short/unrealistic values. */
> -=A0=A0=A0=A0=A0=A0=A0if (usecs < 100 || usecs >=3D LONG_MAX)
> -=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0return false;
> -
> -=A0=A0=A0=A0=A0=A0=A0/*
> -=A0=A0=A0=A0=A0=A0=A0 * Calculate the bandwidth for the last slice.=A0 W=
e use a 64 bit
> -=A0=A0=A0=A0=A0=A0=A0 * value to store the peak rate, in sectors per use=
c in fixed
> -=A0=A0=A0=A0=A0=A0=A0 * point math.=A0 We do so to have enough precision=
 in the estimate
> -=A0=A0=A0=A0=A0=A0=A0 * and to avoid overflows.
> -=A0=A0=A0=A0=A0=A0=A0 */
> -=A0=A0=A0=A0=A0=A0=A0bw =3D (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
> -=A0=A0=A0=A0=A0=A0=A0do_div(bw, (unsigned long)usecs);
> +=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0delta_ktime =3D ktime_get()=
;
> +=A0=A0=A0=A0=A0=A0=A0delta_ktime =3D ktime_sub(delta_ktime, bfqd->last_b=
udget_start);
> +=A0=A0=A0=A0=A0=A0=A0delta_usecs =3D ktime_to_us(delta_ktime);
> +

This patch changes the type of the variable in which the result of ktime_to=
_us()
is stored from u64 into u32 and next compares that result with LONG_MAX. Si=
nce
ktime_to_us() returns a signed 64-bit number, are you sure you want to stor=
e that
result in a 32-bit variable? If ktime_to_us() would e.g. return 0xffffffff0=
0000100
or 0x100000100 then the assignment will truncate these numbers to 0x100.

Bart.=

^ permalink raw reply

* Re: [PATCH V2 11/16] block, bfq: reduce idling only in symmetric scenarios
From: Bart Van Assche @ 2017-03-31 15:20 UTC (permalink / raw)
  To: tj@kernel.org, paolo.valente@linaro.org, axboe@kernel.dk
  Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org,
	samuele.zecchini92@gmail.com, fchecconi@gmail.com,
	linus.walleij@linaro.org, avanzini.arianna@gmail.com,
	broonie@kernel.org, riccardo.pizzetti@gmail.com,
	ulf.hansson@linaro.org
In-Reply-To: <20170331124743.3530-12-paolo.valente@linaro.org>

On Fri, 2017-03-31 at 14:47 +0200, Paolo Valente wrote:
> +=A0=A0=A0=A0=A0=A0=A0entity->weight_counter =3D kzalloc(sizeof(struct bf=
q_weight_counter),
> +=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=
=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0 GFP_ATOMIC);
> +=A0=A0=A0=A0=A0=A0=A0entity->weight_counter->weight =3D entity->weight;

GFP_ATOMIC allocations are more likely to fail than GFP_KERNEL allocations.
What will happen if kzalloc() returns NULL?

Bart.=

^ permalink raw reply

* Re: [PATCH RFC 00/14] Add the BFQ I/O Scheduler to blk-mq
From: Paolo Valente @ 2017-03-31 13:27 UTC (permalink / raw)
  To: Markus Trippelsdorf
  Cc: Jens Axboe, Tejun Heo, Fabio Checconi, Arianna Avanzini,
	linux-block, Linux-Kernal, Ulf Hansson, Linus Walleij, broonie
In-Reply-To: <20170306074321.GA290@x4>


> Il giorno 06 mar 2017, alle ore 08:43, Markus Trippelsdorf =
<markus@trippelsdorf.de> ha scritto:
>=20
> On 2017.03.04 at 17:01 +0100, Paolo Valente wrote:
>> Hi,
>> at last, here is my first patch series meant for merging. It adds BFQ
>> to blk-mq. Don't worry, in this message I won't bore you again with
>> the wonderful properties of BFQ :)
>=20
> I gave BFQ a quick try. Unfortunately it hangs when I try to delete
> btrfs snapshots:
>=20
> root       124  0.0  0.0      0     0 ?        D    07:19   0:03 =
[btrfs-cleaner] =20
> root       125  0.0  0.0      0     0 ?        D    07:19   0:00 =
[btrfs-transacti]
>=20
> [ 4372.880116] sysrq: SysRq : Show Blocked State
> [ 4372.880125]   task                        PC stack   pid father
> [ 4372.880148] btrfs-cleaner   D    0   124      2 0x00000000
> [ 4372.880156] Call Trace:
> [ 4372.880166]  ? __schedule+0x160/0x7c0
> [ 4372.880174]  ? io_schedule+0x64/0xe0
> [ 4372.880179]  ? wait_on_page_bit+0x7a/0x100
> [ 4372.880183]  ? devm_memunmap+0x40/0x40
> [ 4372.880189]  ? read_extent_buffer_pages+0x25c/0x2c0
> [ 4372.880195]  ? run_one_async_done+0xc0/0xc0
> [ 4372.880200]  ? btree_read_extent_buffer_pages+0x60/0x2e0
> [ 4372.880206]  ? read_tree_block+0x2c/0x60
> [ 4372.880211]  ? read_block_for_search.isra.38+0xec/0x3a0
> [ 4372.880217]  ? btrfs_search_slot+0x214/0xbc0
> [ 4372.880221]  ? lookup_inline_extent_backref+0xfb/0x8c0
> [ 4372.880225]  ? __btrfs_free_extent.isra.74+0xe9/0xdc0
> [ 4372.880231]  ? btrfs_merge_delayed_refs+0x57/0x6e0
> [ 4372.880235]  ? __btrfs_run_delayed_refs+0x60d/0x1340
> [ 4372.880239]  ? btrfs_run_delayed_refs+0x64/0x280
> [ 4372.880243]  ? btrfs_should_end_transaction+0x3b/0xa0
> [ 4372.880247]  ? btrfs_drop_snapshot+0x3b2/0x800
> [ 4372.880251]  ? __schedule+0x168/0x7c0
> [ 4372.880254]  ? btrfs_clean_one_deleted_snapshot+0xa4/0xe0
> [ 4372.880259]  ? cleaner_kthread+0x13a/0x180
> [ 4372.880264]  ? btree_invalidatepage+0xc0/0xc0
> [ 4372.880268]  ? kthread+0x144/0x180
> [ 4372.880272]  ? kthread_flush_work_fn+0x20/0x20
> [ 4372.880277]  ? ret_from_fork+0x23/0x30
> [ 4372.880280] btrfs-transacti D    0   125      2 0x00000000
> [ 4372.880285] Call Trace:
> [ 4372.880290]  ? __schedule+0x160/0x7c0
> [ 4372.880295]  ? io_schedule+0x64/0xe0
> [ 4372.880300]  ? wait_on_page_bit_common.constprop.57+0x160/0x180
> [ 4372.880303]  ? devm_memunmap+0x40/0x40
> [ 4372.880307]  ? __filemap_fdatawait_range+0xd3/0x140
> [ 4372.880311]  ? clear_state_bit.constprop.82+0xf7/0x180
> [ 4372.880315]  ? __clear_extent_bit.constprop.79+0x138/0x3c0
> [ 4372.880319]  ? filemap_fdatawait_range+0x9/0x60
> [ 4372.880323]  ? __btrfs_wait_marked_extents.isra.18+0xc1/0x100
> [ 4372.880327]  ? =
btrfs_write_and_wait_marked_extents.constprop.23+0x49/0x80
> [ 4372.880331]  ? btrfs_commit_transaction+0x8e1/0xb00
> [ 4372.880334]  ? join_transaction.constprop.24+0x10/0xa0
> [ 4372.880340]  ? wake_bit_function+0x60/0x60
> [ 4372.880345]  ? transaction_kthread+0x185/0x1a0
> [ 4372.880350]  ? btrfs_cleanup_transaction+0x500/0x500
> [ 4372.880354]  ? kthread+0x144/0x180
> [ 4372.880358]  ? kthread_flush_work_fn+0x20/0x20
> [ 4372.880362]  ? ret_from_fork+0x23/0x30
> [ 4372.880367] ntpd            D    0   175      1 0x00000004
> [ 4372.880372] Call Trace:
> [ 4372.880375]  ? __schedule+0x160/0x7c0
> [ 4372.880379]  ? schedule_preempt_disabled+0x2d/0x80
> [ 4372.880383]  ? __mutex_lock.isra.5+0x17b/0x4c0
> [ 4372.880386]  ? wait_current_trans+0x15/0xc0
> [ 4372.880391]  ? btrfs_free_path+0xe/0x20
> [ 4372.880395]  ? btrfs_pin_log_trans+0x14/0x40
> [ 4372.880400]  ? btrfs_rename2+0x28e/0x19c0
> [ 4372.880404]  ? path_init+0x187/0x3e0
> [ 4372.880407]  ? unlazy_walk+0x4b/0x100
> [ 4372.880410]  ? terminate_walk+0x8d/0x100
> [ 4372.880414]  ? filename_parentat+0x1e9/0x2c0
> [ 4372.880420]  ? __kmalloc_track_caller+0xc4/0x100
> [ 4372.880424]  ? vfs_rename+0x33f/0x7e0
> [ 4372.880428]  ? SYSC_renameat2+0x53c/0x680
> [ 4372.880433]  ? entry_SYSCALL_64_fastpath+0x13/0x94
> [ 4372.880437] fcron           D    0   178      1 0x00000000
> [ 4372.880441] Call Trace:
> [ 4372.880445]  ? __schedule+0x160/0x7c0
> [ 4372.880448]  ? schedule_preempt_disabled+0x2d/0x80
> [ 4372.880452]  ? __mutex_lock.isra.5+0x17b/0x4c0
> [ 4372.880458]  ? pagevec_lookup_tag+0x18/0x20
> [ 4372.880462]  ? btrfs_log_dentry_safe+0x4cd/0xac0
> [ 4372.880466]  ? btrfs_start_transaction+0x249/0x460
> [ 4372.880470]  ? btrfs_sync_file+0x288/0x3c0
> [ 4372.880475]  ? btrfs_file_write_iter+0x3a9/0x4e0
> [ 4372.880479]  ? vfs_write+0x26c/0x2c0
> [ 4372.880483]  ? SyS_write+0x3d/0xa0
> [ 4372.880486]  ? SyS_fchown+0x7b/0xa0
> [ 4372.880491]  ? entry_SYSCALL_64_fastpath+0x13/0x94
> [ 4372.880508] kworker/u8:8    D    0   759      2 0x00000000
> [ 4372.880518] Workqueue: btrfs-submit btrfs_submit_helper
> [ 4372.880520] Call Trace:
> [ 4372.880524]  ? __schedule+0x160/0x7c0
> [ 4372.880529]  ? io_schedule+0x64/0xe0
> [ 4372.880534]  ? blk_mq_get_tag+0x212/0x320
> [ 4372.880538]  ? wake_bit_function+0x60/0x60
> [ 4372.880544]  ? __blk_mq_alloc_request+0x11/0x1c0
> [ 4372.880548]  ? blk_mq_sched_get_request+0x17e/0x220
> [ 4372.880553]  ? blk_sq_make_request+0xd3/0x4c0
> [ 4372.880557]  ? blk_mq_sched_dispatch_requests+0x104/0x160
> [ 4372.880561]  ? generic_make_request+0xc3/0x2e0
> [ 4372.880564]  ? submit_bio+0x58/0x100
> [ 4372.880569]  ? run_scheduled_bios+0x1a6/0x500
> [ 4372.880574]  ? btrfs_worker_helper+0x129/0x1c0
> [ 4372.880580]  ? process_one_work+0x1bc/0x400
> [ 4372.880585]  ? worker_thread+0x42/0x540
> [ 4372.880588]  ? __schedule+0x168/0x7c0
> [ 4372.880592]  ? process_one_work+0x400/0x400
> [ 4372.880596]  ? kthread+0x144/0x180
> [ 4372.880600]  ? kthread_flush_work_fn+0x20/0x20
> [ 4372.880605]  ? ret_from_fork+0x23/0x30
>=20
> I could get it going again by running:
> echo "mq-deadline" > /sys/block/sdb/queue/scheduler
>=20

Hi Markus,
thanks for testing BFQ.  And sorry for replying only now.  Before
replying I have tried to reproduce the failure, or to understand
somehow the link between the failure and BFQ (unfortunately, no BFQ
function is preset in your stack trace). Unfortunately at no avail.

I hope that your failure was caused by one of the bugs that I have
fixed from the previous submission.  So, if you could try the new
patch series [1] when you have time to, I would really appreciate
that.

Thanks,
Paolo

[1] https://lkml.org/lkml/2017/3/31/393

> --
> Markus

^ permalink raw reply

* [PATCH V2 16/16] block, bfq: split bfq-iosched.c into multiple source files
From: Paolo Valente @ 2017-03-31 12:47 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo
  Cc: Fabio Checconi, Arianna Avanzini, linux-block, linux-kernel,
	ulf.hansson, linus.walleij, broonie, Paolo Valente
In-Reply-To: <20170331124743.3530-1-paolo.valente@linaro.org>

The BFQ I/O scheduler features an optimal fair-queuing
(proportional-share) scheduling algorithm, enriched with several
mechanisms to boost throughput and reduce latency for interactive and
real-time applications. This makes BFQ a large and complex piece of
code. This commit addresses this issue by splitting BFQ into three
main, independent components, and by moving each component into a
separate source file:
1. Main algorithm: handles the interaction with the kernel, and
decides which requests to dispatch; it uses the following two further
components to achieve its goals.
2. Scheduling engine (Hierarchical B-WF2Q+ scheduling algorithm):
computes the schedule, using weights and budgets provided by the above
component.
3. cgroups support: handles group operations (creation, destruction,
move, ...).

Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
---
 block/Makefile      |    2 +-
 block/bfq-cgroup.c  | 1139 +++++++++++++++
 block/bfq-iosched.c | 3925 +++------------------------------------------------
 block/bfq-iosched.h |  942 +++++++++++++
 block/bfq-wf2q.c    | 1616 +++++++++++++++++++++
 5 files changed, 3868 insertions(+), 3756 deletions(-)
 create mode 100644 block/bfq-cgroup.c
 create mode 100644 block/bfq-iosched.h
 create mode 100644 block/bfq-wf2q.c

diff --git a/block/Makefile b/block/Makefile
index 91869f2..546066e 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -20,7 +20,7 @@ obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
-obj-$(CONFIG_IOSCHED_BFQ)	+= bfq-iosched.o
+obj-$(CONFIG_IOSCHED_BFQ)	+= bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
new file mode 100644
index 0000000..c8a32fb
--- /dev/null
+++ b/block/bfq-cgroup.c
@@ -0,0 +1,1139 @@
+/*
+ * cgroups support for the BFQ I/O scheduler.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation; either version 2 of the
+ *  License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/ktime.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include <linux/sbitmap.h>
+#include <linux/delay.h>
+
+#include "bfq-iosched.h"
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/* bfqg stats flags */
+enum bfqg_stats_flags {
+	BFQG_stats_waiting = 0,
+	BFQG_stats_idling,
+	BFQG_stats_empty,
+};
+
+#define BFQG_FLAG_FNS(name)						\
+static void bfqg_stats_mark_##name(struct bfqg_stats *stats)	\
+{									\
+	stats->flags |= (1 << BFQG_stats_##name);			\
+}									\
+static void bfqg_stats_clear_##name(struct bfqg_stats *stats)	\
+{									\
+	stats->flags &= ~(1 << BFQG_stats_##name);			\
+}									\
+static int bfqg_stats_##name(struct bfqg_stats *stats)		\
+{									\
+	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
+}									\
+
+BFQG_FLAG_FNS(waiting)
+BFQG_FLAG_FNS(idling)
+BFQG_FLAG_FNS(empty)
+#undef BFQG_FLAG_FNS
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
+{
+	unsigned long long now;
+
+	if (!bfqg_stats_waiting(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_group_wait_time))
+		blkg_stat_add(&stats->group_wait_time,
+			      now - stats->start_group_wait_time);
+	bfqg_stats_clear_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+						 struct bfq_group *curr_bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (bfqg_stats_waiting(stats))
+		return;
+	if (bfqg == curr_bfqg)
+		return;
+	stats->start_group_wait_time = sched_clock();
+	bfqg_stats_mark_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
+{
+	unsigned long long now;
+
+	if (!bfqg_stats_empty(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_empty_time))
+		blkg_stat_add(&stats->empty_time,
+			      now - stats->start_empty_time);
+	bfqg_stats_clear_empty(stats);
+}
+
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
+{
+	blkg_stat_add(&bfqg->stats.dequeue, 1);
+}
+
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (blkg_rwstat_total(&stats->queued))
+		return;
+
+	/*
+	 * group is already marked empty. This can happen if bfqq got new
+	 * request in parent group and moved to this group while being added
+	 * to service tree. Just ignore the event and move on.
+	 */
+	if (bfqg_stats_empty(stats))
+		return;
+
+	stats->start_empty_time = sched_clock();
+	bfqg_stats_mark_empty(stats);
+}
+
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (bfqg_stats_idling(stats)) {
+		unsigned long long now = sched_clock();
+
+		if (time_after64(now, stats->start_idle_time))
+			blkg_stat_add(&stats->idle_time,
+				      now - stats->start_idle_time);
+		bfqg_stats_clear_idling(stats);
+	}
+}
+
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	stats->start_idle_time = sched_clock();
+	bfqg_stats_mark_idling(stats);
+}
+
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	blkg_stat_add(&stats->avg_queue_size_sum,
+		      blkg_rwstat_total(&stats->queued));
+	blkg_stat_add(&stats->avg_queue_size_samples, 1);
+	bfqg_stats_update_group_wait_time(stats);
+}
+
+/*
+ * blk-cgroup policy-related handlers
+ * The following functions help in converting between blk-cgroup
+ * internal structures and BFQ-specific structures.
+ */
+
+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
+{
+	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
+}
+
+struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
+{
+	return pd_to_blkg(&bfqg->pd);
+}
+
+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
+{
+	return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
+}
+
+/*
+ * bfq_group handlers
+ * The following functions help in navigating the bfq_group hierarchy
+ * by allowing to find the parent of a bfq_group or the bfq_group
+ * associated to a bfq_queue.
+ */
+
+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
+{
+	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
+
+	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
+}
+
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *group_entity = bfqq->entity.parent;
+
+	return group_entity ? container_of(group_entity, struct bfq_group,
+					   entity) :
+			      bfqq->bfqd->root_group;
+}
+
+/*
+ * The following two functions handle get and put of a bfq_group by
+ * wrapping the related blk-cgroup hooks.
+ */
+
+static void bfqg_get(struct bfq_group *bfqg)
+{
+	return blkg_get(bfqg_to_blkg(bfqg));
+}
+
+void bfqg_put(struct bfq_group *bfqg)
+{
+	return blkg_put(bfqg_to_blkg(bfqg));
+}
+
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+			      unsigned int op)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
+	bfqg_stats_end_empty_time(&bfqg->stats);
+	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, op, -1);
+}
+
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
+{
+	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
+}
+
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+				  uint64_t io_start_time, unsigned int op)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+	unsigned long long now = sched_clock();
+
+	if (time_after64(now, io_start_time))
+		blkg_rwstat_add(&stats->service_time, op,
+				now - io_start_time);
+	if (time_after64(io_start_time, start_time))
+		blkg_rwstat_add(&stats->wait_time, op,
+				io_start_time - start_time);
+}
+
+/* @stats = 0 */
+static void bfqg_stats_reset(struct bfqg_stats *stats)
+{
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_reset(&stats->merged);
+	blkg_rwstat_reset(&stats->service_time);
+	blkg_rwstat_reset(&stats->wait_time);
+	blkg_stat_reset(&stats->time);
+	blkg_stat_reset(&stats->avg_queue_size_sum);
+	blkg_stat_reset(&stats->avg_queue_size_samples);
+	blkg_stat_reset(&stats->dequeue);
+	blkg_stat_reset(&stats->group_wait_time);
+	blkg_stat_reset(&stats->idle_time);
+	blkg_stat_reset(&stats->empty_time);
+}
+
+/* @to += @from */
+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
+{
+	if (!to || !from)
+		return;
+
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_add_aux(&to->merged, &from->merged);
+	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+	blkg_stat_add_aux(&from->time, &from->time);
+	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	blkg_stat_add_aux(&to->avg_queue_size_samples,
+			  &from->avg_queue_size_samples);
+	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+}
+
+/*
+ * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
+ * recursive stats can still account for the amount used by this bfqg after
+ * it's gone.
+ */
+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
+{
+	struct bfq_group *parent;
+
+	if (!bfqg) /* root_group */
+		return;
+
+	parent = bfqg_parent(bfqg);
+
+	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
+
+	if (unlikely(!parent))
+		return;
+
+	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
+	bfqg_stats_reset(&bfqg->stats);
+}
+
+void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	entity->weight = entity->new_weight;
+	entity->orig_weight = entity->new_weight;
+	if (bfqq) {
+		bfqq->ioprio = bfqq->new_ioprio;
+		bfqq->ioprio_class = bfqq->new_ioprio_class;
+		bfqg_get(bfqg);
+	}
+	entity->parent = bfqg->my_entity; /* NULL for root group */
+	entity->sched_data = &bfqg->sched_data;
+}
+
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+	blkg_rwstat_exit(&stats->merged);
+	blkg_rwstat_exit(&stats->service_time);
+	blkg_rwstat_exit(&stats->wait_time);
+	blkg_rwstat_exit(&stats->queued);
+	blkg_stat_exit(&stats->time);
+	blkg_stat_exit(&stats->avg_queue_size_sum);
+	blkg_stat_exit(&stats->avg_queue_size_samples);
+	blkg_stat_exit(&stats->dequeue);
+	blkg_stat_exit(&stats->group_wait_time);
+	blkg_stat_exit(&stats->idle_time);
+	blkg_stat_exit(&stats->empty_time);
+}
+
+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
+{
+	if (blkg_rwstat_init(&stats->merged, gfp) ||
+	    blkg_rwstat_init(&stats->service_time, gfp) ||
+	    blkg_rwstat_init(&stats->wait_time, gfp) ||
+	    blkg_rwstat_init(&stats->queued, gfp) ||
+	    blkg_stat_init(&stats->time, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+	    blkg_stat_init(&stats->dequeue, gfp) ||
+	    blkg_stat_init(&stats->group_wait_time, gfp) ||
+	    blkg_stat_init(&stats->idle_time, gfp) ||
+	    blkg_stat_init(&stats->empty_time, gfp)) {
+		bfqg_stats_exit(stats);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
+{
+	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
+}
+
+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
+{
+	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
+}
+
+struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
+{
+	struct bfq_group_data *bgd;
+
+	bgd = kzalloc(sizeof(*bgd), gfp);
+	if (!bgd)
+		return NULL;
+	return &bgd->pd;
+}
+
+void bfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+	struct bfq_group_data *d = cpd_to_bfqgd(cpd);
+
+	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
+		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
+}
+
+void bfq_cpd_free(struct blkcg_policy_data *cpd)
+{
+	kfree(cpd_to_bfqgd(cpd));
+}
+
+struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
+{
+	struct bfq_group *bfqg;
+
+	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
+	if (!bfqg)
+		return NULL;
+
+	if (bfqg_stats_init(&bfqg->stats, gfp)) {
+		kfree(bfqg);
+		return NULL;
+	}
+
+	return &bfqg->pd;
+}
+
+void bfq_pd_init(struct blkg_policy_data *pd)
+{
+	struct blkcg_gq *blkg = pd_to_blkg(pd);
+	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
+	struct bfq_entity *entity = &bfqg->entity;
+	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
+
+	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
+	entity->my_sched_data = &bfqg->sched_data;
+	bfqg->my_entity = entity; /*
+				   * the root_group's will be set to NULL
+				   * in bfq_init_queue()
+				   */
+	bfqg->bfqd = bfqd;
+	bfqg->active_entities = 0;
+	bfqg->rq_pos_tree = RB_ROOT;
+}
+
+void bfq_pd_free(struct blkg_policy_data *pd)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+	bfqg_stats_exit(&bfqg->stats);
+	return kfree(bfqg);
+}
+
+void bfq_pd_reset_stats(struct blkg_policy_data *pd)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+	bfqg_stats_reset(&bfqg->stats);
+}
+
+static void bfq_group_set_parent(struct bfq_group *bfqg,
+					struct bfq_group *parent)
+{
+	struct bfq_entity *entity;
+
+	entity = &bfqg->entity;
+	entity->parent = parent->my_entity;
+	entity->sched_data = &parent->sched_data;
+}
+
+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
+					 struct blkcg *blkcg)
+{
+	struct blkcg_gq *blkg;
+
+	blkg = blkg_lookup(blkcg, bfqd->queue);
+	if (likely(blkg))
+		return blkg_to_bfqg(blkg);
+	return NULL;
+}
+
+struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+				     struct blkcg *blkcg)
+{
+	struct bfq_group *bfqg, *parent;
+	struct bfq_entity *entity;
+
+	bfqg = bfq_lookup_bfqg(bfqd, blkcg);
+
+	if (unlikely(!bfqg))
+		return NULL;
+
+	/*
+	 * Update chain of bfq_groups as we might be handling a leaf group
+	 * which, along with some of its relatives, has not been hooked yet
+	 * to the private hierarchy of BFQ.
+	 */
+	entity = &bfqg->entity;
+	for_each_entity(entity) {
+		bfqg = container_of(entity, struct bfq_group, entity);
+		if (bfqg != bfqd->root_group) {
+			parent = bfqg_parent(bfqg);
+			if (!parent)
+				parent = bfqd->root_group;
+			bfq_group_set_parent(bfqg, parent);
+		}
+	}
+
+	return bfqg;
+}
+
+/**
+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
+ * @bfqd: queue descriptor.
+ * @bfqq: the queue to move.
+ * @bfqg: the group to move to.
+ *
+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
+ * it on the new one.  Avoid putting the entity on the old group idle tree.
+ *
+ * Must be called under the queue lock; the cgroup owning @bfqg must
+ * not disappear (by now this just means that we are called under
+ * rcu_read_lock()).
+ */
+void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		   struct bfq_group *bfqg)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	/* If bfqq is empty, then bfq_bfqq_expire also invokes
+	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
+	 * from data structures related to current group. Otherwise we
+	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
+	 * we do below.
+	 */
+	if (bfqq == bfqd->in_service_queue)
+		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+				false, BFQQE_PREEMPTED);
+
+	if (bfq_bfqq_busy(bfqq))
+		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+	else if (entity->on_st)
+		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+	bfqg_put(bfqq_group(bfqq));
+
+	/*
+	 * Here we use a reference to bfqg.  We don't need a refcounter
+	 * as the cgroup reference will not be dropped, so that its
+	 * destroy() callback will not be invoked.
+	 */
+	entity->parent = bfqg->my_entity;
+	entity->sched_data = &bfqg->sched_data;
+	bfqg_get(bfqg);
+
+	if (bfq_bfqq_busy(bfqq)) {
+		bfq_pos_tree_add_move(bfqd, bfqq);
+		bfq_activate_bfqq(bfqd, bfqq);
+	}
+
+	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+		bfq_schedule_dispatch(bfqd);
+}
+
+/**
+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
+ * @bfqd: the queue descriptor.
+ * @bic: the bic to move.
+ * @blkcg: the blk-cgroup to move to.
+ *
+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
+ * has to make sure that the reference to cgroup is valid across the call.
+ *
+ * NOTE: an alternative approach might have been to store the current
+ * cgroup in bfqq and getting a reference to it, reducing the lookup
+ * time here, at the price of slightly more complex code.
+ */
+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
+						struct bfq_io_cq *bic,
+						struct blkcg *blkcg)
+{
+	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
+	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
+	struct bfq_group *bfqg;
+	struct bfq_entity *entity;
+
+	bfqg = bfq_find_set_group(bfqd, blkcg);
+
+	if (unlikely(!bfqg))
+		bfqg = bfqd->root_group;
+
+	if (async_bfqq) {
+		entity = &async_bfqq->entity;
+
+		if (entity->sched_data != &bfqg->sched_data) {
+			bic_set_bfqq(bic, NULL, 0);
+			bfq_log_bfqq(bfqd, async_bfqq,
+				     "bic_change_group: %p %d",
+				     async_bfqq, async_bfqq->ref);
+			bfq_put_queue(async_bfqq);
+		}
+	}
+
+	if (sync_bfqq) {
+		entity = &sync_bfqq->entity;
+		if (entity->sched_data != &bfqg->sched_data)
+			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+	}
+
+	return bfqg;
+}
+
+void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+	struct bfq_group *bfqg = NULL;
+	uint64_t serial_nr;
+
+	rcu_read_lock();
+	serial_nr = bio_blkcg(bio)->css.serial_nr;
+
+	/*
+	 * Check whether blkcg has changed.  The condition may trigger
+	 * spuriously on a newly created cic but there's no harm.
+	 */
+	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
+		goto out;
+
+	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+	bic->blkcg_serial_nr = serial_nr;
+out:
+	rcu_read_unlock();
+}
+
+/**
+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
+ * @st: the service tree being flushed.
+ */
+static void bfq_flush_idle_tree(struct bfq_service_tree *st)
+{
+	struct bfq_entity *entity = st->first_idle;
+
+	for (; entity ; entity = st->first_idle)
+		__bfq_deactivate_entity(entity, false);
+}
+
+/**
+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
+ * @bfqd: the device data structure with the root group.
+ * @entity: the entity to move.
+ */
+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
+				     struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
+}
+
+/**
+ * bfq_reparent_active_entities - move to the root group all active
+ *                                entities.
+ * @bfqd: the device data structure with the root group.
+ * @bfqg: the group to move from.
+ * @st: the service tree with the entities.
+ *
+ * Needs queue_lock to be taken and reference to be valid over the call.
+ */
+static void bfq_reparent_active_entities(struct bfq_data *bfqd,
+					 struct bfq_group *bfqg,
+					 struct bfq_service_tree *st)
+{
+	struct rb_root *active = &st->active;
+	struct bfq_entity *entity = NULL;
+
+	if (!RB_EMPTY_ROOT(&st->active))
+		entity = bfq_entity_of(rb_first(active));
+
+	for (; entity ; entity = bfq_entity_of(rb_first(active)))
+		bfq_reparent_leaf_entity(bfqd, entity);
+
+	if (bfqg->sched_data.in_service_entity)
+		bfq_reparent_leaf_entity(bfqd,
+			bfqg->sched_data.in_service_entity);
+}
+
+/**
+ * bfq_pd_offline - deactivate the entity associated with @pd,
+ *		    and reparent its children entities.
+ * @pd: descriptor of the policy going offline.
+ *
+ * blkio already grabs the queue_lock for us, so no need to use
+ * RCU-based magic
+ */
+void bfq_pd_offline(struct blkg_policy_data *pd)
+{
+	struct bfq_service_tree *st;
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+	struct bfq_data *bfqd = bfqg->bfqd;
+	struct bfq_entity *entity = bfqg->my_entity;
+	unsigned long flags;
+	int i;
+
+	if (!entity) /* root group */
+		return;
+
+	spin_lock_irqsave(&bfqd->lock, flags);
+	/*
+	 * Empty all service_trees belonging to this group before
+	 * deactivating the group itself.
+	 */
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
+		st = bfqg->sched_data.service_tree + i;
+
+		/*
+		 * The idle tree may still contain bfq_queues belonging
+		 * to exited task because they never migrated to a different
+		 * cgroup from the one being destroyed now.  No one else
+		 * can access them so it's safe to act without any lock.
+		 */
+		bfq_flush_idle_tree(st);
+
+		/*
+		 * It may happen that some queues are still active
+		 * (busy) upon group destruction (if the corresponding
+		 * processes have been forced to terminate). We move
+		 * all the leaf entities corresponding to these queues
+		 * to the root_group.
+		 * Also, it may happen that the group has an entity
+		 * in service, which is disconnected from the active
+		 * tree: it must be moved, too.
+		 * There is no need to put the sync queues, as the
+		 * scheduler has taken no reference.
+		 */
+		bfq_reparent_active_entities(bfqd, bfqg, st);
+	}
+
+	__bfq_deactivate_entity(entity, false);
+	bfq_put_async_queues(bfqd, bfqg);
+
+	spin_unlock_irqrestore(&bfqd->lock, flags);
+	/*
+	 * @blkg is going offline and will be ignored by
+	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
+	 * that they don't get lost.  If IOs complete after this point, the
+	 * stats for them will be lost.  Oh well...
+	 */
+	bfqg_stats_xfer_dead(bfqg);
+}
+
+void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+	struct blkcg_gq *blkg;
+
+	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
+		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+
+		bfq_end_wr_async_queues(bfqd, bfqg);
+	}
+	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+static int bfq_io_show_weight(struct seq_file *sf, void *v)
+{
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	unsigned int val = 0;
+
+	if (bfqgd)
+		val = bfqgd->weight;
+
+	seq_printf(sf, "%u\n", val);
+
+	return 0;
+}
+
+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
+				    struct cftype *cftype,
+				    u64 val)
+{
+	struct blkcg *blkcg = css_to_blkcg(css);
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	struct blkcg_gq *blkg;
+	int ret = -ERANGE;
+
+	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
+		return ret;
+
+	ret = 0;
+	spin_lock_irq(&blkcg->lock);
+	bfqgd->weight = (unsigned short)val;
+	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+
+		if (!bfqg)
+			continue;
+		/*
+		 * Setting the prio_changed flag of the entity
+		 * to 1 with new_weight == weight would re-set
+		 * the value of the weight to its ioprio mapping.
+		 * Set the flag only if necessary.
+		 */
+		if ((unsigned short)val != bfqg->entity.new_weight) {
+			bfqg->entity.new_weight = (unsigned short)val;
+			/*
+			 * Make sure that the above new value has been
+			 * stored in bfqg->entity.new_weight before
+			 * setting the prio_changed flag. In fact,
+			 * this flag may be read asynchronously (in
+			 * critical sections protected by a different
+			 * lock than that held here), and finding this
+			 * flag set may cause the execution of the code
+			 * for updating parameters whose value may
+			 * depend also on bfqg->entity.new_weight (in
+			 * __bfq_entity_update_weight_prio).
+			 * This barrier makes sure that the new value
+			 * of bfqg->entity.new_weight is correctly
+			 * seen in that code.
+			 */
+			smp_wmb();
+			bfqg->entity.prio_changed = 1;
+		}
+	}
+	spin_unlock_irq(&blkcg->lock);
+
+	return ret;
+}
+
+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes,
+				 loff_t off)
+{
+	u64 weight;
+	/* First unsigned long found in the file is used */
+	int ret = kstrtoull(strim(buf), 0, &weight);
+
+	if (ret)
+		return ret;
+
+	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
+}
+
+static int bfqg_print_stat(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
+	return 0;
+}
+
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
+	return 0;
+}
+
+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+					  &blkcg_policy_bfq, off);
+	return __blkg_prfill_u64(sf, pd, sum);
+}
+
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
+					struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+							   &blkcg_policy_bfq,
+							   off);
+	return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
+			  seq_cft(sf)->private, false);
+	return 0;
+}
+
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
+			  seq_cft(sf)->private, true);
+	return 0;
+}
+
+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+			       int off)
+{
+	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+
+	return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
+	return 0;
+}
+
+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
+					 struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+					offsetof(struct blkcg_gq, stat_bytes));
+	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+	return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
+			  false);
+	return 0;
+}
+
+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+	u64 v = 0;
+
+	if (samples) {
+		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+		v = div64_u64(v, samples);
+	}
+	__blkg_prfill_u64(sf, pd, v);
+	return 0;
+}
+
+/* print avg_queue_size */
+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
+			  0, false);
+	return 0;
+}
+
+struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+	int ret;
+
+	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
+	if (ret)
+		return NULL;
+
+	return blkg_to_bfqg(bfqd->queue->root_blkg);
+}
+
+struct blkcg_policy blkcg_policy_bfq = {
+	.dfl_cftypes		= bfq_blkg_files,
+	.legacy_cftypes		= bfq_blkcg_legacy_files,
+
+	.cpd_alloc_fn		= bfq_cpd_alloc,
+	.cpd_init_fn		= bfq_cpd_init,
+	.cpd_bind_fn	        = bfq_cpd_init,
+	.cpd_free_fn		= bfq_cpd_free,
+
+	.pd_alloc_fn		= bfq_pd_alloc,
+	.pd_init_fn		= bfq_pd_init,
+	.pd_offline_fn		= bfq_pd_offline,
+	.pd_free_fn		= bfq_pd_free,
+	.pd_reset_stats_fn	= bfq_pd_reset_stats,
+};
+
+struct cftype bfq_blkcg_legacy_files[] = {
+	{
+		.name = "bfq.weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = bfq_io_show_weight,
+		.write_u64 = bfq_io_set_weight_legacy,
+	},
+
+	/* statistics, covers only the tasks in the bfqg */
+	{
+		.name = "bfq.time",
+		.private = offsetof(struct bfq_group, stats.time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.sectors",
+		.seq_show = bfqg_print_stat_sectors,
+	},
+	{
+		.name = "bfq.io_service_bytes",
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_bytes,
+	},
+	{
+		.name = "bfq.io_serviced",
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_ios,
+	},
+	{
+		.name = "bfq.io_service_time",
+		.private = offsetof(struct bfq_group, stats.service_time),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_wait_time",
+		.private = offsetof(struct bfq_group, stats.wait_time),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_merged",
+		.private = offsetof(struct bfq_group, stats.merged),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_queued",
+		.private = offsetof(struct bfq_group, stats.queued),
+		.seq_show = bfqg_print_rwstat,
+	},
+
+	/* the same statictics which cover the bfqg and its descendants */
+	{
+		.name = "bfq.time_recursive",
+		.private = offsetof(struct bfq_group, stats.time),
+		.seq_show = bfqg_print_stat_recursive,
+	},
+	{
+		.name = "bfq.sectors_recursive",
+		.seq_show = bfqg_print_stat_sectors_recursive,
+	},
+	{
+		.name = "bfq.io_service_bytes_recursive",
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_bytes_recursive,
+	},
+	{
+		.name = "bfq.io_serviced_recursive",
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_ios_recursive,
+	},
+	{
+		.name = "bfq.io_service_time_recursive",
+		.private = offsetof(struct bfq_group, stats.service_time),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_wait_time_recursive",
+		.private = offsetof(struct bfq_group, stats.wait_time),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_merged_recursive",
+		.private = offsetof(struct bfq_group, stats.merged),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_queued_recursive",
+		.private = offsetof(struct bfq_group, stats.queued),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.avg_queue_size",
+		.seq_show = bfqg_print_avg_queue_size,
+	},
+	{
+		.name = "bfq.group_wait_time",
+		.private = offsetof(struct bfq_group, stats.group_wait_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.idle_time",
+		.private = offsetof(struct bfq_group, stats.idle_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.empty_time",
+		.private = offsetof(struct bfq_group, stats.empty_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.dequeue",
+		.private = offsetof(struct bfq_group, stats.dequeue),
+		.seq_show = bfqg_print_stat,
+	},
+	{ }	/* terminate */
+};
+
+struct cftype bfq_blkg_files[] = {
+	{
+		.name = "bfq.weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = bfq_io_show_weight,
+		.write = bfq_io_set_weight,
+	},
+	{} /* terminate */
+};
+
+#else	/* CONFIG_BFQ_GROUP_IOSCHED */
+
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+			      unsigned int op) { }
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+				  uint64_t io_start_time, unsigned int op) { }
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
+
+void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		   struct bfq_group *bfqg) {}
+
+void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	entity->weight = entity->new_weight;
+	entity->orig_weight = entity->new_weight;
+	if (bfqq) {
+		bfqq->ioprio = bfqq->new_ioprio;
+		bfqq->ioprio_class = bfqq->new_ioprio_class;
+	}
+	entity->sched_data = &bfqg->sched_data;
+}
+
+void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
+
+void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg)
+{
+	return bfqd->root_group;
+}
+
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+	return bfqq->bfqd->root_group;
+}
+
+struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+	struct bfq_group *bfqg;
+	int i;
+
+	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
+	if (!bfqg)
+		return NULL;
+
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+
+	return bfqg;
+}
+#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 831b406..88f07d6 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -102,3765 +102,201 @@
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 #include "blk-mq-sched.h"
-#include <linux/blktrace_api.h>
-#include <linux/hrtimer.h>
-#include <linux/blk-cgroup.h>
+#include "bfq-iosched.h"
 
-#define BFQ_IOPRIO_CLASSES	3
-#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)
-
-#define BFQ_MIN_WEIGHT			1
-#define BFQ_MAX_WEIGHT			1000
-#define BFQ_WEIGHT_CONVERSION_COEFF	10
-
-#define BFQ_DEFAULT_QUEUE_IOPRIO	4
-
-#define BFQ_WEIGHT_LEGACY_DFL	100
-#define BFQ_DEFAULT_GRP_IOPRIO	0
-#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE
-
-/*
- * Soft real-time applications are extremely more latency sensitive
- * than interactive ones. Over-raise the weight of the former to
- * privilege them against the latter.
- */
-#define BFQ_SOFTRT_WEIGHT_FACTOR	100
-
-struct bfq_entity;
-
-/**
- * struct bfq_service_tree - per ioprio_class service tree.
- *
- * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
- * ioprio_class has its own independent scheduler, and so its own
- * bfq_service_tree.  All the fields are protected by the queue lock
- * of the containing bfqd.
- */
-struct bfq_service_tree {
-	/* tree for active entities (i.e., those backlogged) */
-	struct rb_root active;
-	/* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
-	struct rb_root idle;
-
-	/* idle entity with minimum F_i */
-	struct bfq_entity *first_idle;
-	/* idle entity with maximum F_i */
-	struct bfq_entity *last_idle;
-
-	/* scheduler virtual time */
-	u64 vtime;
-	/* scheduler weight sum; active and idle entities contribute to it */
-	unsigned long wsum;
-};
-
-/**
- * struct bfq_sched_data - multi-class scheduler.
- *
- * bfq_sched_data is the basic scheduler queue.  It supports three
- * ioprio_classes, and can be used either as a toplevel queue or as an
- * intermediate queue on a hierarchical setup.  @next_in_service
- * points to the active entity of the sched_data service trees that
- * will be scheduled next. It is used to reduce the number of steps
- * needed for each hierarchical-schedule update.
- *
- * The supported ioprio_classes are the same as in CFQ, in descending
- * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
- * Requests from higher priority queues are served before all the
- * requests from lower priority queues; among requests of the same
- * queue requests are served according to B-WF2Q+.
- * All the fields are protected by the queue lock of the containing bfqd.
- */
-struct bfq_sched_data {
-	/* entity in service */
-	struct bfq_entity *in_service_entity;
-	/* head-of-line entity (see comments above) */
-	struct bfq_entity *next_in_service;
-	/* array of service trees, one per ioprio_class */
-	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
-	/* last time CLASS_IDLE was served */
-	unsigned long bfq_class_idle_last_service;
-
-};
-
-/**
- * struct bfq_weight_counter - counter of the number of all active entities
- *                             with a given weight.
- */
-struct bfq_weight_counter {
-	unsigned int weight; /* weight of the entities this counter refers to */
-	unsigned int num_active; /* nr of active entities with this weight */
-	/*
-	 * Weights tree member (see bfq_data's @queue_weights_tree and
-	 * @group_weights_tree)
-	 */
-	struct rb_node weights_node;
-};
-
-/**
- * struct bfq_entity - schedulable entity.
- *
- * A bfq_entity is used to represent either a bfq_queue (leaf node in the
- * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
- * entity belongs to the sched_data of the parent group in the cgroup
- * hierarchy.  Non-leaf entities have also their own sched_data, stored
- * in @my_sched_data.
- *
- * Each entity stores independently its priority values; this would
- * allow different weights on different devices, but this
- * functionality is not exported to userspace by now.  Priorities and
- * weights are updated lazily, first storing the new values into the
- * new_* fields, then setting the @prio_changed flag.  As soon as
- * there is a transition in the entity state that allows the priority
- * update to take place the effective and the requested priority
- * values are synchronized.
- *
- * Unless cgroups are used, the weight value is calculated from the
- * ioprio to export the same interface as CFQ.  When dealing with
- * ``well-behaved'' queues (i.e., queues that do not spend too much
- * time to consume their budget and have true sequential behavior, and
- * when there are no external factors breaking anticipation) the
- * relative weights at each level of the cgroups hierarchy should be
- * guaranteed.  All the fields are protected by the queue lock of the
- * containing bfqd.
- */
-struct bfq_entity {
-	/* service_tree member */
-	struct rb_node rb_node;
-	/* pointer to the weight counter associated with this entity */
-	struct bfq_weight_counter *weight_counter;
-
-	/*
-	 * Flag, true if the entity is on a tree (either the active or
-	 * the idle one of its service_tree) or is in service.
-	 */
-	bool on_st;
-
-	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
-	u64 start, finish;
-
-	/* tree the entity is enqueued into; %NULL if not on a tree */
-	struct rb_root *tree;
-
-	/*
-	 * minimum start time of the (active) subtree rooted at this
-	 * entity; used for O(log N) lookups into active trees
-	 */
-	u64 min_start;
-
-	/* amount of service received during the last service slot */
-	int service;
-
-	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
-	int budget;
-
-	/* weight of the queue */
-	int weight;
-	/* next weight if a change is in progress */
-	int new_weight;
-
-	/* original weight, used to implement weight boosting */
-	int orig_weight;
-
-	/* parent entity, for hierarchical scheduling */
-	struct bfq_entity *parent;
-
-	/*
-	 * For non-leaf nodes in the hierarchy, the associated
-	 * scheduler queue, %NULL on leaf nodes.
-	 */
-	struct bfq_sched_data *my_sched_data;
-	/* the scheduler queue this entity belongs to */
-	struct bfq_sched_data *sched_data;
-
-	/* flag, set to request a weight, ioprio or ioprio_class change  */
-	int prio_changed;
-};
-
-struct bfq_group;
-
-/**
- * struct bfq_ttime - per process thinktime stats.
- */
-struct bfq_ttime {
-	/* completion time of the last request */
-	u64 last_end_request;
-
-	/* total process thinktime */
-	u64 ttime_total;
-	/* number of thinktime samples */
-	unsigned long ttime_samples;
-	/* average process thinktime */
-	u64 ttime_mean;
-};
-
-/**
- * struct bfq_queue - leaf schedulable entity.
- *
- * A bfq_queue is a leaf request queue; it can be associated with an
- * io_context or more, if it  is  async or shared  between  cooperating
- * processes. @cgroup holds a reference to the cgroup, to be sure that it
- * does not disappear while a bfqq still references it (mostly to avoid
- * races between request issuing and task migration followed by cgroup
- * destruction).
- * All the fields are protected by the queue lock of the containing bfqd.
- */
-struct bfq_queue {
-	/* reference counter */
-	int ref;
-	/* parent bfq_data */
-	struct bfq_data *bfqd;
-
-	/* current ioprio and ioprio class */
-	unsigned short ioprio, ioprio_class;
-	/* next ioprio and ioprio class if a change is in progress */
-	unsigned short new_ioprio, new_ioprio_class;
-
-	/*
-	 * Shared bfq_queue if queue is cooperating with one or more
-	 * other queues.
-	 */
-	struct bfq_queue *new_bfqq;
-	/* request-position tree member (see bfq_group's @rq_pos_tree) */
-	struct rb_node pos_node;
-	/* request-position tree root (see bfq_group's @rq_pos_tree) */
-	struct rb_root *pos_root;
-
-	/* sorted list of pending requests */
-	struct rb_root sort_list;
-	/* if fifo isn't expired, next request to serve */
-	struct request *next_rq;
-	/* number of sync and async requests queued */
-	int queued[2];
-	/* number of requests currently allocated */
-	int allocated;
-	/* number of pending metadata requests */
-	int meta_pending;
-	/* fifo list of requests in sort_list */
-	struct list_head fifo;
-
-	/* entity representing this queue in the scheduler */
-	struct bfq_entity entity;
-
-	/* maximum budget allowed from the feedback mechanism */
-	int max_budget;
-	/* budget expiration (in jiffies) */
-	unsigned long budget_timeout;
-
-	/* number of requests on the dispatch list or inside driver */
-	int dispatched;
-
-	/* status flags */
-	unsigned long flags;
-
-	/* node for active/idle bfqq list inside parent bfqd */
-	struct list_head bfqq_list;
-
-	/* associated @bfq_ttime struct */
-	struct bfq_ttime ttime;
-
-	/* bit vector: a 1 for each seeky requests in history */
-	u32 seek_history;
-
-	/* node for the device's burst list */
-	struct hlist_node burst_list_node;
-
-	/* position of the last request enqueued */
-	sector_t last_request_pos;
-
-	/* Number of consecutive pairs of request completion and
-	 * arrival, such that the queue becomes idle after the
-	 * completion, but the next request arrives within an idle
-	 * time slice; used only if the queue's IO_bound flag has been
-	 * cleared.
-	 */
-	unsigned int requests_within_timer;
-
-	/* pid of the process owning the queue, used for logging purposes */
-	pid_t pid;
-
-	/*
-	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
-	 * if the queue is shared.
-	 */
-	struct bfq_io_cq *bic;
-
-	/* current maximum weight-raising time for this queue */
-	unsigned long wr_cur_max_time;
-	/*
-	 * Minimum time instant such that, only if a new request is
-	 * enqueued after this time instant in an idle @bfq_queue with
-	 * no outstanding requests, then the task associated with the
-	 * queue it is deemed as soft real-time (see the comments on
-	 * the function bfq_bfqq_softrt_next_start())
-	 */
-	unsigned long soft_rt_next_start;
-	/*
-	 * Start time of the current weight-raising period if
-	 * the @bfq-queue is being weight-raised, otherwise
-	 * finish time of the last weight-raising period.
-	 */
-	unsigned long last_wr_start_finish;
-	/* factor by which the weight of this queue is multiplied */
-	unsigned int wr_coeff;
-	/*
-	 * Time of the last transition of the @bfq_queue from idle to
-	 * backlogged.
-	 */
-	unsigned long last_idle_bklogged;
-	/*
-	 * Cumulative service received from the @bfq_queue since the
-	 * last transition from idle to backlogged.
-	 */
-	unsigned long service_from_backlogged;
-
-	/*
-	 * Value of wr start time when switching to soft rt
-	 */
-	unsigned long wr_start_at_switch_to_srt;
-
-	unsigned long split_time; /* time of last split */
-};
-
-/**
- * struct bfq_io_cq - per (request_queue, io_context) structure.
- */
-struct bfq_io_cq {
-	/* associated io_cq structure */
-	struct io_cq icq; /* must be the first member */
-	/* array of two process queues, the sync and the async */
-	struct bfq_queue *bfqq[2];
-	/* per (request_queue, blkcg) ioprio */
-	int ioprio;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	uint64_t blkcg_serial_nr; /* the current blkcg serial */
-#endif
-	/*
-	 * Snapshot of the idle window before merging; taken to
-	 * remember this value while the queue is merged, so as to be
-	 * able to restore it in case of split.
-	 */
-	bool saved_idle_window;
-	/*
-	 * Same purpose as the previous two fields for the I/O bound
-	 * classification of a queue.
-	 */
-	bool saved_IO_bound;
-
-	/*
-	 * Same purpose as the previous fields for the value of the
-	 * field keeping the queue's belonging to a large burst
-	 */
-	bool saved_in_large_burst;
-	/*
-	 * True if the queue belonged to a burst list before its merge
-	 * with another cooperating queue.
-	 */
-	bool was_in_burst_list;
-
-	/*
-	 * Similar to previous fields: save wr information.
-	 */
-	unsigned long saved_wr_coeff;
-	unsigned long saved_last_wr_start_finish;
-	unsigned long saved_wr_start_at_switch_to_srt;
-	unsigned int saved_wr_cur_max_time;
-	struct bfq_ttime saved_ttime;
-};
-
-enum bfq_device_speed {
-	BFQ_BFQD_FAST,
-	BFQ_BFQD_SLOW,
-};
-
-/**
- * struct bfq_data - per-device data structure.
- *
- * All the fields are protected by @lock.
- */
-struct bfq_data {
-	/* device request queue */
-	struct request_queue *queue;
-	/* dispatch queue */
-	struct list_head dispatch;
-
-	/* root bfq_group for the device */
-	struct bfq_group *root_group;
-
-	/*
-	 * rbtree of weight counters of @bfq_queues, sorted by
-	 * weight. Used to keep track of whether all @bfq_queues have
-	 * the same weight. The tree contains one counter for each
-	 * distinct weight associated to some active and not
-	 * weight-raised @bfq_queue (see the comments to the functions
-	 * bfq_weights_tree_[add|remove] for further details).
-	 */
-	struct rb_root queue_weights_tree;
-	/*
-	 * rbtree of non-queue @bfq_entity weight counters, sorted by
-	 * weight. Used to keep track of whether all @bfq_groups have
-	 * the same weight. The tree contains one counter for each
-	 * distinct weight associated to some active @bfq_group (see
-	 * the comments to the functions bfq_weights_tree_[add|remove]
-	 * for further details).
-	 */
-	struct rb_root group_weights_tree;
-
-	/*
-	 * Number of bfq_queues containing requests (including the
-	 * queue in service, even if it is idling).
-	 */
-	int busy_queues;
-	/* number of weight-raised busy @bfq_queues */
-	int wr_busy_queues;
-	/* number of queued requests */
-	int queued;
-	/* number of requests dispatched and waiting for completion */
-	int rq_in_driver;
-
-	/*
-	 * Maximum number of requests in driver in the last
-	 * @hw_tag_samples completed requests.
-	 */
-	int max_rq_in_driver;
-	/* number of samples used to calculate hw_tag */
-	int hw_tag_samples;
-	/* flag set to one if the driver is showing a queueing behavior */
-	int hw_tag;
-
-	/* number of budgets assigned */
-	int budgets_assigned;
-
-	/*
-	 * Timer set when idling (waiting) for the next request from
-	 * the queue in service.
-	 */
-	struct hrtimer idle_slice_timer;
-
-	/* bfq_queue in service */
-	struct bfq_queue *in_service_queue;
-
-	/* on-disk position of the last served request */
-	sector_t last_position;
-
-	/* time of last request completion (ns) */
-	u64 last_completion;
-
-	/* time of first rq dispatch in current observation interval (ns) */
-	u64 first_dispatch;
-	/* time of last rq dispatch in current observation interval (ns) */
-	u64 last_dispatch;
-
-	/* beginning of the last budget */
-	ktime_t last_budget_start;
-	/* beginning of the last idle slice */
-	ktime_t last_idling_start;
-
-	/* number of samples in current observation interval */
-	int peak_rate_samples;
-	/* num of samples of seq dispatches in current observation interval */
-	u32 sequential_samples;
-	/* total num of sectors transferred in current observation interval */
-	u64 tot_sectors_dispatched;
-	/* max rq size seen during current observation interval (sectors) */
-	u32 last_rq_max_size;
-	/* time elapsed from first dispatch in current observ. interval (us) */
-	u64 delta_from_first;
-	/*
-	 * Current estimate of the device peak rate, measured in
-	 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
-	 * BFQ_RATE_SHIFT is performed to increase precision in
-	 * fixed-point calculations.
-	 */
-	u32 peak_rate;
-
-	/* maximum budget allotted to a bfq_queue before rescheduling */
-	int bfq_max_budget;
-
-	/* list of all the bfq_queues active on the device */
-	struct list_head active_list;
-	/* list of all the bfq_queues idle on the device */
-	struct list_head idle_list;
-
-	/*
-	 * Timeout for async/sync requests; when it fires, requests
-	 * are served in fifo order.
-	 */
-	u64 bfq_fifo_expire[2];
-	/* weight of backward seeks wrt forward ones */
-	unsigned int bfq_back_penalty;
-	/* maximum allowed backward seek */
-	unsigned int bfq_back_max;
-	/* maximum idling time */
-	u32 bfq_slice_idle;
-
-	/* user-configured max budget value (0 for auto-tuning) */
-	int bfq_user_max_budget;
-	/*
-	 * Timeout for bfq_queues to consume their budget; used to
-	 * prevent seeky queues from imposing long latencies to
-	 * sequential or quasi-sequential ones (this also implies that
-	 * seeky queues cannot receive guarantees in the service
-	 * domain; after a timeout they are charged for the time they
-	 * have been in service, to preserve fairness among them, but
-	 * without service-domain guarantees).
-	 */
-	unsigned int bfq_timeout;
-
-	/*
-	 * Number of consecutive requests that must be issued within
-	 * the idle time slice to set again idling to a queue which
-	 * was marked as non-I/O-bound (see the definition of the
-	 * IO_bound flag for further details).
-	 */
-	unsigned int bfq_requests_within_timer;
-
-	/*
-	 * Force device idling whenever needed to provide accurate
-	 * service guarantees, without caring about throughput
-	 * issues. CAVEAT: this may even increase latencies, in case
-	 * of useless idling for processes that did stop doing I/O.
-	 */
-	bool strict_guarantees;
-
-	/*
-	 * Last time at which a queue entered the current burst of
-	 * queues being activated shortly after each other; for more
-	 * details about this and the following parameters related to
-	 * a burst of activations, see the comments on the function
-	 * bfq_handle_burst.
-	 */
-	unsigned long last_ins_in_burst;
-	/*
-	 * Reference time interval used to decide whether a queue has
-	 * been activated shortly after @last_ins_in_burst.
-	 */
-	unsigned long bfq_burst_interval;
-	/* number of queues in the current burst of queue activations */
-	int burst_size;
-
-	/* common parent entity for the queues in the burst */
-	struct bfq_entity *burst_parent_entity;
-	/* Maximum burst size above which the current queue-activation
-	 * burst is deemed as 'large'.
-	 */
-	unsigned long bfq_large_burst_thresh;
-	/* true if a large queue-activation burst is in progress */
-	bool large_burst;
-	/*
-	 * Head of the burst list (as for the above fields, more
-	 * details in the comments on the function bfq_handle_burst).
-	 */
-	struct hlist_head burst_list;
-
-	/* if set to true, low-latency heuristics are enabled */
-	bool low_latency;
-	/*
-	 * Maximum factor by which the weight of a weight-raised queue
-	 * is multiplied.
-	 */
-	unsigned int bfq_wr_coeff;
-	/* maximum duration of a weight-raising period (jiffies) */
-	unsigned int bfq_wr_max_time;
-
-	/* Maximum weight-raising duration for soft real-time processes */
-	unsigned int bfq_wr_rt_max_time;
-	/*
-	 * Minimum idle period after which weight-raising may be
-	 * reactivated for a queue (in jiffies).
-	 */
-	unsigned int bfq_wr_min_idle_time;
-	/*
-	 * Minimum period between request arrivals after which
-	 * weight-raising may be reactivated for an already busy async
-	 * queue (in jiffies).
-	 */
-	unsigned long bfq_wr_min_inter_arr_async;
-
-	/* Max service-rate for a soft real-time queue, in sectors/sec */
-	unsigned int bfq_wr_max_softrt_rate;
-	/*
-	 * Cached value of the product R*T, used for computing the
-	 * maximum duration of weight raising automatically.
-	 */
-	u64 RT_prod;
-	/* device-speed class for the low-latency heuristic */
-	enum bfq_device_speed device_speed;
-
-	/* fallback dummy bfqq for extreme OOM conditions */
-	struct bfq_queue oom_bfqq;
-
-	spinlock_t lock;
-
-	/*
-	 * bic associated with the task issuing current bio for
-	 * merging. This and the next field are used as a support to
-	 * be able to perform the bic lookup, needed by bio-merge
-	 * functions, before the scheduler lock is taken, and thus
-	 * avoid taking the request-queue lock while the scheduler
-	 * lock is being held.
-	 */
-	struct bfq_io_cq *bio_bic;
-	/* bfqq associated with the task issuing current bio for merging */
-	struct bfq_queue *bio_bfqq;
-};
-
-enum bfqq_state_flags {
-	BFQQF_just_created = 0,	/* queue just allocated */
-	BFQQF_busy,		/* has requests or is in service */
-	BFQQF_wait_request,	/* waiting for a request */
-	BFQQF_non_blocking_wait_rq, /*
-				     * waiting for a request
-				     * without idling the device
-				     */
-	BFQQF_fifo_expire,	/* FIFO checked in this slice */
-	BFQQF_idle_window,	/* slice idling enabled */
-	BFQQF_sync,		/* synchronous queue */
-	BFQQF_IO_bound,		/*
-				 * bfqq has timed-out at least once
-				 * having consumed at most 2/10 of
-				 * its budget
-				 */
-	BFQQF_in_large_burst,	/*
-				 * bfqq activated in a large burst,
-				 * see comments to bfq_handle_burst.
-				 */
-	BFQQF_softrt_update,	/*
-				 * may need softrt-next-start
-				 * update
-				 */
-	BFQQF_coop,		/* bfqq is shared */
-	BFQQF_split_coop	/* shared bfqq will be split */
-};
-
-#define BFQ_BFQQ_FNS(name)						\
-static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
-{									\
-	__set_bit(BFQQF_##name, &(bfqq)->flags);			\
-}									\
-static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
-{									\
-	__clear_bit(BFQQF_##name, &(bfqq)->flags);		\
-}									\
-static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
-{									\
-	return test_bit(BFQQF_##name, &(bfqq)->flags);		\
-}
-
-BFQ_BFQQ_FNS(just_created);
-BFQ_BFQQ_FNS(busy);
-BFQ_BFQQ_FNS(wait_request);
-BFQ_BFQQ_FNS(non_blocking_wait_rq);
-BFQ_BFQQ_FNS(fifo_expire);
-BFQ_BFQQ_FNS(idle_window);
-BFQ_BFQQ_FNS(sync);
-BFQ_BFQQ_FNS(IO_bound);
-BFQ_BFQQ_FNS(in_large_burst);
-BFQ_BFQQ_FNS(coop);
-BFQ_BFQQ_FNS(split_coop);
-BFQ_BFQQ_FNS(softrt_update);
-#undef BFQ_BFQQ_FNS
-
-/* Logging facilities. */
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
-static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
-
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
-	char __pbuf[128];						\
-									\
-	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
-	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
-			bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
-			  __pbuf, ##args);				\
-} while (0)
-
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
-	char __pbuf[128];						\
-									\
-	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
-	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
-} while (0)
-
-#else /* CONFIG_BFQ_GROUP_IOSCHED */
-
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
-	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
-			bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
-				##args)
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)
-
-#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-
-#define bfq_log(bfqd, fmt, args...) \
-	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
-
-/* Expiration reasons. */
-enum bfqq_expiration {
-	BFQQE_TOO_IDLE = 0,		/*
-					 * queue has been idling for
-					 * too long
-					 */
-	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
-	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
-	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
-	BFQQE_PREEMPTED		/* preemption in progress */
-};
-
-struct bfqg_stats {
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	/* number of ios merged */
-	struct blkg_rwstat		merged;
-	/* total time spent on device in ns, may not be accurate w/ queueing */
-	struct blkg_rwstat		service_time;
-	/* total time spent waiting in scheduler queue in ns */
-	struct blkg_rwstat		wait_time;
-	/* number of IOs queued up */
-	struct blkg_rwstat		queued;
-	/* total disk time and nr sectors dispatched by this group */
-	struct blkg_stat		time;
-	/* sum of number of ios queued across all samples */
-	struct blkg_stat		avg_queue_size_sum;
-	/* count of samples taken for average */
-	struct blkg_stat		avg_queue_size_samples;
-	/* how many times this group has been removed from service tree */
-	struct blkg_stat		dequeue;
-	/* total time spent waiting for it to be assigned a timeslice. */
-	struct blkg_stat		group_wait_time;
-	/* time spent idling for this blkcg_gq */
-	struct blkg_stat		idle_time;
-	/* total time with empty current active q with other requests queued */
-	struct blkg_stat		empty_time;
-	/* fields after this shouldn't be cleared on stat reset */
-	uint64_t			start_group_wait_time;
-	uint64_t			start_idle_time;
-	uint64_t			start_empty_time;
-	uint16_t			flags;
-#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
-};
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-
-/*
- * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
- *
- * @ps: @blkcg_policy_storage that this structure inherits
- * @weight: weight of the bfq_group
- */
-struct bfq_group_data {
-	/* must be the first member */
-	struct blkcg_policy_data pd;
-
-	unsigned int weight;
-};
-
-/**
- * struct bfq_group - per (device, cgroup) data structure.
- * @entity: schedulable entity to insert into the parent group sched_data.
- * @sched_data: own sched_data, to contain child entities (they may be
- *              both bfq_queues and bfq_groups).
- * @bfqd: the bfq_data for the device this group acts upon.
- * @async_bfqq: array of async queues for all the tasks belonging to
- *              the group, one queue per ioprio value per ioprio_class,
- *              except for the idle class that has only one queue.
- * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
- * @my_entity: pointer to @entity, %NULL for the toplevel group; used
- *             to avoid too many special cases during group creation/
- *             migration.
- * @stats: stats for this bfqg.
- * @active_entities: number of active entities belonging to the group;
- *                   unused for the root group. Used to know whether there
- *                   are groups with more than one active @bfq_entity
- *                   (see the comments to the function
- *                   bfq_bfqq_may_idle()).
- * @rq_pos_tree: rbtree sorted by next_request position, used when
- *               determining if two or more queues have interleaving
- *               requests (see bfq_find_close_cooperator()).
- *
- * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
- * there is a set of bfq_groups, each one collecting the lower-level
- * entities belonging to the group that are acting on the same device.
- *
- * Locking works as follows:
- *    o @bfqd is protected by the queue lock, RCU is used to access it
- *      from the readers.
- *    o All the other fields are protected by the @bfqd queue lock.
- */
-struct bfq_group {
-	/* must be the first member */
-	struct blkg_policy_data pd;
-
-	struct bfq_entity entity;
-	struct bfq_sched_data sched_data;
-
-	void *bfqd;
-
-	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
-	struct bfq_queue *async_idle_bfqq;
-
-	struct bfq_entity *my_entity;
-
-	int active_entities;
-
-	struct rb_root rq_pos_tree;
-
-	struct bfqg_stats stats;
-};
-
-#else
-struct bfq_group {
-	struct bfq_sched_data sched_data;
-
-	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
-	struct bfq_queue *async_idle_bfqq;
-
-	struct rb_root rq_pos_tree;
-};
-#endif
-
-static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
-
-static unsigned int bfq_class_idx(struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	return bfqq ? bfqq->ioprio_class - 1 :
-		BFQ_DEFAULT_GRP_CLASS - 1;
-}
-
-static struct bfq_service_tree *
-bfq_entity_service_tree(struct bfq_entity *entity)
-{
-	struct bfq_sched_data *sched_data = entity->sched_data;
-	unsigned int idx = bfq_class_idx(entity);
-
-	return sched_data->service_tree + idx;
-}
-
-static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
-{
-	return bic->bfqq[is_sync];
-}
-
-static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
-			 bool is_sync)
-{
-	bic->bfqq[is_sync] = bfqq;
-}
-
-static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
-{
-	return bic->icq.q->elevator->elevator_data;
-}
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-
-static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
-	struct bfq_entity *group_entity = bfqq->entity.parent;
-
-	if (!group_entity)
-		group_entity = &bfqq->bfqd->root_group->entity;
-
-	return container_of(group_entity, struct bfq_group, entity);
-}
-
-#else
-
-static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
-	return bfqq->bfqd->root_group;
-}
-
-#endif
-
-static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
-static void bfq_put_queue(struct bfq_queue *bfqq);
-static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
-				       struct bio *bio, bool is_sync,
-				       struct bfq_io_cq *bic);
-static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
-				    struct bfq_group *bfqg);
-static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
-static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
-
-/* Expiration time of sync (0) and async (1) requests, in ns. */
-static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
-
-/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
-static const int bfq_back_max = 16 * 1024;
-
-/* Penalty of a backwards seek, in number of sectors. */
-static const int bfq_back_penalty = 2;
-
-/* Idling period duration, in ns. */
-static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
-
-/* Minimum number of assigned budgets for which stats are safe to compute. */
-static const int bfq_stats_min_budgets = 194;
-
-/* Default maximum budget values, in sectors and number of requests. */
-static const int bfq_default_max_budget = 16 * 1024;
-
-/*
- * Async to sync throughput distribution is controlled as follows:
- * when an async request is served, the entity is charged the number
- * of sectors of the request, multiplied by the factor below
- */
-static const int bfq_async_charge_factor = 10;
-
-/* Default timeout values, in jiffies, approximating CFQ defaults. */
-static const int bfq_timeout = HZ / 8;
-
-static struct kmem_cache *bfq_pool;
-
-/* Below this threshold (in ns), we consider thinktime immediate. */
-#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)
-
-/* hw_tag detection: parallel requests threshold and min samples needed. */
-#define BFQ_HW_QUEUE_THRESHOLD	4
-#define BFQ_HW_QUEUE_SAMPLES	32
-
-#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
-#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
-#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
-#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)
-
-/* Min number of samples required to perform peak-rate update */
-#define BFQ_RATE_MIN_SAMPLES	32
-/* Min observation time interval required to perform a peak-rate update (ns) */
-#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
-/* Target observation time interval for a peak-rate update (ns) */
-#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC
-
-/* Shift used for peak rate fixed precision calculations. */
-#define BFQ_RATE_SHIFT		16
-
-/*
- * By default, BFQ computes the duration of the weight raising for
- * interactive applications automatically, using the following formula:
- * duration = (R / r) * T, where r is the peak rate of the device, and
- * R and T are two reference parameters.
- * In particular, R is the peak rate of the reference device (see below),
- * and T is a reference time: given the systems that are likely to be
- * installed on the reference device according to its speed class, T is
- * about the maximum time needed, under BFQ and while reading two files in
- * parallel, to load typical large applications on these systems.
- * In practice, the slower/faster the device at hand is, the more/less it
- * takes to load applications with respect to the reference device.
- * Accordingly, the longer/shorter BFQ grants weight raising to interactive
- * applications.
- *
- * BFQ uses four different reference pairs (R, T), depending on:
- * . whether the device is rotational or non-rotational;
- * . whether the device is slow, such as old or portable HDDs, as well as
- *   SD cards, or fast, such as newer HDDs and SSDs.
- *
- * The device's speed class is dynamically (re)detected in
- * bfq_update_peak_rate() every time the estimated peak rate is updated.
- *
- * In the following definitions, R_slow[0]/R_fast[0] and
- * T_slow[0]/T_fast[0] are the reference values for a slow/fast
- * rotational device, whereas R_slow[1]/R_fast[1] and
- * T_slow[1]/T_fast[1] are the reference values for a slow/fast
- * non-rotational device. Finally, device_speed_thresh are the
- * thresholds used to switch between speed classes. The reference
- * rates are not the actual peak rates of the devices used as a
- * reference, but slightly lower values. The reason for using these
- * slightly lower values is that the peak-rate estimator tends to
- * yield slightly lower values than the actual peak rate (it can yield
- * the actual peak rate only if there is only one process doing I/O,
- * and the process does sequential I/O).
- *
- * Both the reference peak rates and the thresholds are measured in
- * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
- */
-static int R_slow[2] = {1000, 10700};
-static int R_fast[2] = {14000, 33000};
-/*
- * To improve readability, a conversion function is used to initialize the
- * following arrays, which entails that they can be initialized only in a
- * function.
- */
-static int T_slow[2];
-static int T_fast[2];
-static int device_speed_thresh[2];
-
-#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
-				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
-
-#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
-#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
-
-/**
- * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
- * @icq: the iocontext queue.
- */
-static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
-{
-	/* bic->icq is the first member, %NULL will convert to %NULL */
-	return container_of(icq, struct bfq_io_cq, icq);
-}
-
-/**
- * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
- * @bfqd: the lookup key.
- * @ioc: the io_context of the process doing I/O.
- * @q: the request queue.
- */
-static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
-					struct io_context *ioc,
-					struct request_queue *q)
-{
-	if (ioc) {
-		unsigned long flags;
-		struct bfq_io_cq *icq;
-
-		spin_lock_irqsave(q->queue_lock, flags);
-		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
-		spin_unlock_irqrestore(q->queue_lock, flags);
-
-		return icq;
-	}
-
-	return NULL;
-}
-
-/*
- * Scheduler run of queue, if there are requests pending and no one in the
- * driver that will restart queueing.
- */
-static void bfq_schedule_dispatch(struct bfq_data *bfqd)
-{
-	if (bfqd->queued != 0) {
-		bfq_log(bfqd, "schedule dispatch");
-		blk_mq_run_hw_queues(bfqd->queue, true);
-	}
-}
-
-/**
- * bfq_gt - compare two timestamps.
- * @a: first ts.
- * @b: second ts.
- *
- * Return @a > @b, dealing with wrapping correctly.
- */
-static int bfq_gt(u64 a, u64 b)
-{
-	return (s64)(a - b) > 0;
-}
-
-static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
-{
-	struct rb_node *node = tree->rb_node;
-
-	return rb_entry(node, struct bfq_entity, rb_node);
-}
-
-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
-
-static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
-
-/**
- * bfq_update_next_in_service - update sd->next_in_service
- * @sd: sched_data for which to perform the update.
- * @new_entity: if not NULL, pointer to the entity whose activation,
- *		requeueing or repositionig triggered the invocation of
- *		this function.
- *
- * This function is called to update sd->next_in_service, which, in
- * its turn, may change as a consequence of the insertion or
- * extraction of an entity into/from one of the active trees of
- * sd. These insertions/extractions occur as a consequence of
- * activations/deactivations of entities, with some activations being
- * 'true' activations, and other activations being requeueings (i.e.,
- * implementing the second, requeueing phase of the mechanism used to
- * reposition an entity in its active tree; see comments on
- * __bfq_activate_entity and __bfq_requeue_entity for details). In
- * both the last two activation sub-cases, new_entity points to the
- * just activated or requeued entity.
- *
- * Returns true if sd->next_in_service changes in such a way that
- * entity->parent may become the next_in_service for its parent
- * entity.
- */
-static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
-				       struct bfq_entity *new_entity)
-{
-	struct bfq_entity *next_in_service = sd->next_in_service;
-	bool parent_sched_may_change = false;
-
-	/*
-	 * If this update is triggered by the activation, requeueing
-	 * or repositiong of an entity that does not coincide with
-	 * sd->next_in_service, then a full lookup in the active tree
-	 * can be avoided. In fact, it is enough to check whether the
-	 * just-modified entity has a higher priority than
-	 * sd->next_in_service, or, even if it has the same priority
-	 * as sd->next_in_service, is eligible and has a lower virtual
-	 * finish time than sd->next_in_service. If this compound
-	 * condition holds, then the new entity becomes the new
-	 * next_in_service. Otherwise no change is needed.
-	 */
-	if (new_entity && new_entity != sd->next_in_service) {
-		/*
-		 * Flag used to decide whether to replace
-		 * sd->next_in_service with new_entity. Tentatively
-		 * set to true, and left as true if
-		 * sd->next_in_service is NULL.
-		 */
-		bool replace_next = true;
-
-		/*
-		 * If there is already a next_in_service candidate
-		 * entity, then compare class priorities or timestamps
-		 * to decide whether to replace sd->service_tree with
-		 * new_entity.
-		 */
-		if (next_in_service) {
-			unsigned int new_entity_class_idx =
-				bfq_class_idx(new_entity);
-			struct bfq_service_tree *st =
-				sd->service_tree + new_entity_class_idx;
-
-			/*
-			 * For efficiency, evaluate the most likely
-			 * sub-condition first.
-			 */
-			replace_next =
-				(new_entity_class_idx ==
-				 bfq_class_idx(next_in_service)
-				 &&
-				 !bfq_gt(new_entity->start, st->vtime)
-				 &&
-				 bfq_gt(next_in_service->finish,
-					new_entity->finish))
-				||
-				new_entity_class_idx <
-				bfq_class_idx(next_in_service);
-		}
-
-		if (replace_next)
-			next_in_service = new_entity;
-	} else /* invoked because of a deactivation: lookup needed */
-		next_in_service = bfq_lookup_next_entity(sd);
-
-	if (next_in_service) {
-		parent_sched_may_change = !sd->next_in_service ||
-			bfq_update_parent_budget(next_in_service);
-	}
-
-	sd->next_in_service = next_in_service;
-
-	if (!next_in_service)
-		return parent_sched_may_change;
-
-	return parent_sched_may_change;
-}
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-/* both next loops stop at one of the child entities of the root group */
-#define for_each_entity(entity)	\
-	for (; entity ; entity = entity->parent)
-
-/*
- * For each iteration, compute parent in advance, so as to be safe if
- * entity is deallocated during the iteration. Such a deallocation may
- * happen as a consequence of a bfq_put_queue that frees the bfq_queue
- * containing entity.
- */
-#define for_each_entity_safe(entity, parent) \
-	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
-
-/*
- * Returns true if this budget changes may let next_in_service->parent
- * become the next_in_service entity for its parent entity.
- */
-static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
-{
-	struct bfq_entity *bfqg_entity;
-	struct bfq_group *bfqg;
-	struct bfq_sched_data *group_sd;
-	bool ret = false;
-
-	group_sd = next_in_service->sched_data;
-
-	bfqg = container_of(group_sd, struct bfq_group, sched_data);
-	/*
-	 * bfq_group's my_entity field is not NULL only if the group
-	 * is not the root group. We must not touch the root entity
-	 * as it must never become an in-service entity.
-	 */
-	bfqg_entity = bfqg->my_entity;
-	if (bfqg_entity) {
-		if (bfqg_entity->budget > next_in_service->budget)
-			ret = true;
-		bfqg_entity->budget = next_in_service->budget;
-	}
-
-	return ret;
-}
-
-/*
- * This function tells whether entity stops being a candidate for next
- * service, according to the following logic.
- *
- * This function is invoked for an entity that is about to be set in
- * service. If such an entity is a queue, then the entity is no longer
- * a candidate for next service (i.e, a candidate entity to serve
- * after the in-service entity is expired). The function then returns
- * true.
- *
- * In contrast, the entity could stil be a candidate for next service
- * if it is not a queue, and has more than one child. In fact, even if
- * one of its children is about to be set in service, other children
- * may still be the next to serve. As a consequence, a non-queue
- * entity is not a candidate for next-service only if it has only one
- * child. And only if this condition holds, then the function returns
- * true for a non-queue entity.
- */
-static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
-{
-	struct bfq_group *bfqg;
-
-	if (bfq_entity_to_bfqq(entity))
-		return true;
-
-	bfqg = container_of(entity, struct bfq_group, entity);
-
-	if (bfqg->active_entities == 1)
-		return true;
-
-	return false;
-}
-
-#else /* CONFIG_BFQ_GROUP_IOSCHED */
-/*
- * Next two macros are fake loops when cgroups support is not
- * enabled. I fact, in such a case, there is only one level to go up
- * (to reach the root group).
- */
-#define for_each_entity(entity)	\
-	for (; entity ; entity = NULL)
-
-#define for_each_entity_safe(entity, parent) \
-	for (parent = NULL; entity ; entity = parent)
-
-static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
-{
-	return false;
-}
-
-static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
-{
-	return true;
-}
-
-#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-
-/*
- * Shift for timestamp calculations.  This actually limits the maximum
- * service allowed in one timestamp delta (small shift values increase it),
- * the maximum total weight that can be used for the queues in the system
- * (big shift values increase it), and the period of virtual time
- * wraparounds.
- */
-#define WFQ_SERVICE_SHIFT	22
-
-static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = NULL;
-
-	if (!entity->my_sched_data)
-		bfqq = container_of(entity, struct bfq_queue, entity);
-
-	return bfqq;
-}
-
-
-/**
- * bfq_delta - map service into the virtual time domain.
- * @service: amount of service.
- * @weight: scale factor (weight of an entity or weight sum).
- */
-static u64 bfq_delta(unsigned long service, unsigned long weight)
-{
-	u64 d = (u64)service << WFQ_SERVICE_SHIFT;
-
-	do_div(d, weight);
-	return d;
-}
-
-/**
- * bfq_calc_finish - assign the finish time to an entity.
- * @entity: the entity to act upon.
- * @service: the service to be charged to the entity.
- */
-static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	entity->finish = entity->start +
-		bfq_delta(service, entity->weight);
-
-	if (bfqq) {
-		bfq_log_bfqq(bfqq->bfqd, bfqq,
-			"calc_finish: serv %lu, w %d",
-			service, entity->weight);
-		bfq_log_bfqq(bfqq->bfqd, bfqq,
-			"calc_finish: start %llu, finish %llu, delta %llu",
-			entity->start, entity->finish,
-			bfq_delta(service, entity->weight));
-	}
-}
-
-/**
- * bfq_entity_of - get an entity from a node.
- * @node: the node field of the entity.
- *
- * Convert a node pointer to the relative entity.  This is used only
- * to simplify the logic of some functions and not as the generic
- * conversion mechanism because, e.g., in the tree walking functions,
- * the check for a %NULL value would be redundant.
- */
-static struct bfq_entity *bfq_entity_of(struct rb_node *node)
-{
-	struct bfq_entity *entity = NULL;
-
-	if (node)
-		entity = rb_entry(node, struct bfq_entity, rb_node);
-
-	return entity;
-}
-
-/**
- * bfq_extract - remove an entity from a tree.
- * @root: the tree root.
- * @entity: the entity to remove.
- */
-static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
-{
-	entity->tree = NULL;
-	rb_erase(&entity->rb_node, root);
-}
-
-/**
- * bfq_idle_extract - extract an entity from the idle tree.
- * @st: the service tree of the owning @entity.
- * @entity: the entity being removed.
- */
-static void bfq_idle_extract(struct bfq_service_tree *st,
-			     struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	struct rb_node *next;
-
-	if (entity == st->first_idle) {
-		next = rb_next(&entity->rb_node);
-		st->first_idle = bfq_entity_of(next);
-	}
-
-	if (entity == st->last_idle) {
-		next = rb_prev(&entity->rb_node);
-		st->last_idle = bfq_entity_of(next);
-	}
-
-	bfq_extract(&st->idle, entity);
-
-	if (bfqq)
-		list_del(&bfqq->bfqq_list);
-}
-
-/**
- * bfq_insert - generic tree insertion.
- * @root: tree root.
- * @entity: entity to insert.
- *
- * This is used for the idle and the active tree, since they are both
- * ordered by finish time.
- */
-static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
-{
-	struct bfq_entity *entry;
-	struct rb_node **node = &root->rb_node;
-	struct rb_node *parent = NULL;
-
-	while (*node) {
-		parent = *node;
-		entry = rb_entry(parent, struct bfq_entity, rb_node);
-
-		if (bfq_gt(entry->finish, entity->finish))
-			node = &parent->rb_left;
-		else
-			node = &parent->rb_right;
-	}
-
-	rb_link_node(&entity->rb_node, parent, node);
-	rb_insert_color(&entity->rb_node, root);
-
-	entity->tree = root;
-}
-
-/**
- * bfq_update_min - update the min_start field of a entity.
- * @entity: the entity to update.
- * @node: one of its children.
- *
- * This function is called when @entity may store an invalid value for
- * min_start due to updates to the active tree.  The function  assumes
- * that the subtree rooted at @node (which may be its left or its right
- * child) has a valid min_start value.
- */
-static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
-{
-	struct bfq_entity *child;
-
-	if (node) {
-		child = rb_entry(node, struct bfq_entity, rb_node);
-		if (bfq_gt(entity->min_start, child->min_start))
-			entity->min_start = child->min_start;
-	}
-}
-
-/**
- * bfq_update_active_node - recalculate min_start.
- * @node: the node to update.
- *
- * @node may have changed position or one of its children may have moved,
- * this function updates its min_start value.  The left and right subtrees
- * are assumed to hold a correct min_start value.
- */
-static void bfq_update_active_node(struct rb_node *node)
-{
-	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
-
-	entity->min_start = entity->start;
-	bfq_update_min(entity, node->rb_right);
-	bfq_update_min(entity, node->rb_left);
-}
-
-/**
- * bfq_update_active_tree - update min_start for the whole active tree.
- * @node: the starting node.
- *
- * @node must be the deepest modified node after an update.  This function
- * updates its min_start using the values held by its children, assuming
- * that they did not change, and then updates all the nodes that may have
- * changed in the path to the root.  The only nodes that may have changed
- * are the ones in the path or their siblings.
- */
-static void bfq_update_active_tree(struct rb_node *node)
-{
-	struct rb_node *parent;
-
-up:
-	bfq_update_active_node(node);
-
-	parent = rb_parent(node);
-	if (!parent)
-		return;
-
-	if (node == parent->rb_left && parent->rb_right)
-		bfq_update_active_node(parent->rb_right);
-	else if (parent->rb_left)
-		bfq_update_active_node(parent->rb_left);
-
-	node = parent;
-	goto up;
-}
-
-static void bfq_weights_tree_add(struct bfq_data *bfqd,
-				 struct bfq_entity *entity,
-				 struct rb_root *root);
-
-static void bfq_weights_tree_remove(struct bfq_data *bfqd,
-				    struct bfq_entity *entity,
-				    struct rb_root *root);
-
-
-/**
- * bfq_active_insert - insert an entity in the active tree of its
- *                     group/device.
- * @st: the service tree of the entity.
- * @entity: the entity being inserted.
- *
- * The active tree is ordered by finish time, but an extra key is kept
- * per each node, containing the minimum value for the start times of
- * its children (and the node itself), so it's possible to search for
- * the eligible node with the lowest finish time in logarithmic time.
- */
-static void bfq_active_insert(struct bfq_service_tree *st,
-			      struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	struct rb_node *node = &entity->rb_node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	struct bfq_sched_data *sd = NULL;
-	struct bfq_group *bfqg = NULL;
-	struct bfq_data *bfqd = NULL;
-#endif
-
-	bfq_insert(&st->active, entity);
-
-	if (node->rb_left)
-		node = node->rb_left;
-	else if (node->rb_right)
-		node = node->rb_right;
-
-	bfq_update_active_tree(node);
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	sd = entity->sched_data;
-	bfqg = container_of(sd, struct bfq_group, sched_data);
-	bfqd = (struct bfq_data *)bfqg->bfqd;
-#endif
-	if (bfqq)
-		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	else /* bfq_group */
-		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
-
-	if (bfqg != bfqd->root_group)
-		bfqg->active_entities++;
-#endif
-}
-
-/**
- * bfq_ioprio_to_weight - calc a weight from an ioprio.
- * @ioprio: the ioprio value to convert.
- */
-static unsigned short bfq_ioprio_to_weight(int ioprio)
-{
-	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
-}
-
-/**
- * bfq_weight_to_ioprio - calc an ioprio from a weight.
- * @weight: the weight value to convert.
- *
- * To preserve as much as possible the old only-ioprio user interface,
- * 0 is used as an escape ioprio value for weights (numerically) equal or
- * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
- */
-static unsigned short bfq_weight_to_ioprio(int weight)
-{
-	return max_t(int, 0,
-		     IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
-}
-
-static void bfq_get_entity(struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	if (bfqq) {
-		bfqq->ref++;
-		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
-			     bfqq, bfqq->ref);
-	}
-}
-
-/**
- * bfq_find_deepest - find the deepest node that an extraction can modify.
- * @node: the node being removed.
- *
- * Do the first step of an extraction in an rb tree, looking for the
- * node that will replace @node, and returning the deepest node that
- * the following modifications to the tree can touch.  If @node is the
- * last node in the tree return %NULL.
- */
-static struct rb_node *bfq_find_deepest(struct rb_node *node)
-{
-	struct rb_node *deepest;
-
-	if (!node->rb_right && !node->rb_left)
-		deepest = rb_parent(node);
-	else if (!node->rb_right)
-		deepest = node->rb_left;
-	else if (!node->rb_left)
-		deepest = node->rb_right;
-	else {
-		deepest = rb_next(node);
-		if (deepest->rb_right)
-			deepest = deepest->rb_right;
-		else if (rb_parent(deepest) != node)
-			deepest = rb_parent(deepest);
-	}
-
-	return deepest;
-}
-
-/**
- * bfq_active_extract - remove an entity from the active tree.
- * @st: the service_tree containing the tree.
- * @entity: the entity being removed.
- */
-static void bfq_active_extract(struct bfq_service_tree *st,
-			       struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	struct rb_node *node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	struct bfq_sched_data *sd = NULL;
-	struct bfq_group *bfqg = NULL;
-	struct bfq_data *bfqd = NULL;
-#endif
-
-	node = bfq_find_deepest(&entity->rb_node);
-	bfq_extract(&st->active, entity);
-
-	if (node)
-		bfq_update_active_tree(node);
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	sd = entity->sched_data;
-	bfqg = container_of(sd, struct bfq_group, sched_data);
-	bfqd = (struct bfq_data *)bfqg->bfqd;
-#endif
-	if (bfqq)
-		list_del(&bfqq->bfqq_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	else /* bfq_group */
-		bfq_weights_tree_remove(bfqd, entity,
-					&bfqd->group_weights_tree);
-
-	if (bfqg != bfqd->root_group)
-		bfqg->active_entities--;
-#endif
-}
-
-/**
- * bfq_idle_insert - insert an entity into the idle tree.
- * @st: the service tree containing the tree.
- * @entity: the entity to insert.
- */
-static void bfq_idle_insert(struct bfq_service_tree *st,
-			    struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	struct bfq_entity *first_idle = st->first_idle;
-	struct bfq_entity *last_idle = st->last_idle;
-
-	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
-		st->first_idle = entity;
-	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
-		st->last_idle = entity;
-
-	bfq_insert(&st->idle, entity);
-
-	if (bfqq)
-		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
-}
-
-/**
- * bfq_forget_entity - do not consider entity any longer for scheduling
- * @st: the service tree.
- * @entity: the entity being removed.
- * @is_in_service: true if entity is currently the in-service entity.
- *
- * Forget everything about @entity. In addition, if entity represents
- * a queue, and the latter is not in service, then release the service
- * reference to the queue (the one taken through bfq_get_entity). In
- * fact, in this case, there is really no more service reference to
- * the queue, as the latter is also outside any service tree. If,
- * instead, the queue is in service, then __bfq_bfqd_reset_in_service
- * will take care of putting the reference when the queue finally
- * stops being served.
- */
-static void bfq_forget_entity(struct bfq_service_tree *st,
-			      struct bfq_entity *entity,
-			      bool is_in_service)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	entity->on_st = false;
-	st->wsum -= entity->weight;
-	if (bfqq && !is_in_service)
-		bfq_put_queue(bfqq);
-}
-
-/**
- * bfq_put_idle_entity - release the idle tree ref of an entity.
- * @st: service tree for the entity.
- * @entity: the entity being released.
- */
-static void bfq_put_idle_entity(struct bfq_service_tree *st,
-				struct bfq_entity *entity)
-{
-	bfq_idle_extract(st, entity);
-	bfq_forget_entity(st, entity,
-			  entity == entity->sched_data->in_service_entity);
-}
-
-/**
- * bfq_forget_idle - update the idle tree if necessary.
- * @st: the service tree to act upon.
- *
- * To preserve the global O(log N) complexity we only remove one entry here;
- * as the idle tree will not grow indefinitely this can be done safely.
- */
-static void bfq_forget_idle(struct bfq_service_tree *st)
-{
-	struct bfq_entity *first_idle = st->first_idle;
-	struct bfq_entity *last_idle = st->last_idle;
-
-	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
-	    !bfq_gt(last_idle->finish, st->vtime)) {
-		/*
-		 * Forget the whole idle tree, increasing the vtime past
-		 * the last finish time of idle entities.
-		 */
-		st->vtime = last_idle->finish;
-	}
-
-	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
-		bfq_put_idle_entity(st, first_idle);
-}
-
-static struct bfq_service_tree *
-__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
-				struct bfq_entity *entity)
-{
-	struct bfq_service_tree *new_st = old_st;
-
-	if (entity->prio_changed) {
-		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-		unsigned int prev_weight, new_weight;
-		struct bfq_data *bfqd = NULL;
-		struct rb_root *root;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		struct bfq_sched_data *sd;
-		struct bfq_group *bfqg;
-#endif
-
-		if (bfqq)
-			bfqd = bfqq->bfqd;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		else {
-			sd = entity->my_sched_data;
-			bfqg = container_of(sd, struct bfq_group, sched_data);
-			bfqd = (struct bfq_data *)bfqg->bfqd;
-		}
-#endif
-
-		old_st->wsum -= entity->weight;
-
-		if (entity->new_weight != entity->orig_weight) {
-			if (entity->new_weight < BFQ_MIN_WEIGHT ||
-			    entity->new_weight > BFQ_MAX_WEIGHT) {
-				pr_crit("update_weight_prio: new_weight %d\n",
-					entity->new_weight);
-				if (entity->new_weight < BFQ_MIN_WEIGHT)
-					entity->new_weight = BFQ_MIN_WEIGHT;
-				else
-					entity->new_weight = BFQ_MAX_WEIGHT;
-			}
-			entity->orig_weight = entity->new_weight;
-			if (bfqq)
-				bfqq->ioprio =
-				  bfq_weight_to_ioprio(entity->orig_weight);
-		}
-
-		if (bfqq)
-			bfqq->ioprio_class = bfqq->new_ioprio_class;
-		entity->prio_changed = 0;
-
-		/*
-		 * NOTE: here we may be changing the weight too early,
-		 * this will cause unfairness.  The correct approach
-		 * would have required additional complexity to defer
-		 * weight changes to the proper time instants (i.e.,
-		 * when entity->finish <= old_st->vtime).
-		 */
-		new_st = bfq_entity_service_tree(entity);
-
-		prev_weight = entity->weight;
-		new_weight = entity->orig_weight *
-			     (bfqq ? bfqq->wr_coeff : 1);
-		/*
-		 * If the weight of the entity changes, remove the entity
-		 * from its old weight counter (if there is a counter
-		 * associated with the entity), and add it to the counter
-		 * associated with its new weight.
-		 */
-		if (prev_weight != new_weight) {
-			root = bfqq ? &bfqd->queue_weights_tree :
-				      &bfqd->group_weights_tree;
-			bfq_weights_tree_remove(bfqd, entity, root);
-		}
-		entity->weight = new_weight;
-		/*
-		 * Add the entity to its weights tree only if it is
-		 * not associated with a weight-raised queue.
-		 */
-		if (prev_weight != new_weight &&
-		    (bfqq ? bfqq->wr_coeff == 1 : 1))
-			/* If we get here, root has been initialized. */
-			bfq_weights_tree_add(bfqd, entity, root);
-
-		new_st->wsum += entity->weight;
-
-		if (new_st != old_st)
-			entity->start = new_st->vtime;
-	}
-
-	return new_st;
-}
-
-static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
-
-/**
- * bfq_bfqq_served - update the scheduler status after selection for
- *                   service.
- * @bfqq: the queue being served.
- * @served: bytes to transfer.
- *
- * NOTE: this can be optimized, as the timestamps of upper level entities
- * are synchronized every time a new bfqq is selected for service.  By now,
- * we keep it to better check consistency.
- */
-static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-	struct bfq_service_tree *st;
-
-	for_each_entity(entity) {
-		st = bfq_entity_service_tree(entity);
-
-		entity->service += served;
-
-		st->vtime += bfq_delta(served, st->wsum);
-		bfq_forget_idle(st);
-	}
-	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
-	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
-}
-
-/**
- * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
- *			  of the time interval during which bfqq has been in
- *			  service.
- * @bfqd: the device
- * @bfqq: the queue that needs a service update.
- * @time_ms: the amount of time during which the queue has received service
- *
- * If a queue does not consume its budget fast enough, then providing
- * the queue with service fairness may impair throughput, more or less
- * severely. For this reason, queues that consume their budget slowly
- * are provided with time fairness instead of service fairness. This
- * goal is achieved through the BFQ scheduling engine, even if such an
- * engine works in the service, and not in the time domain. The trick
- * is charging these queues with an inflated amount of service, equal
- * to the amount of service that they would have received during their
- * service slot if they had been fast, i.e., if their requests had
- * been dispatched at a rate equal to the estimated peak rate.
- *
- * It is worth noting that time fairness can cause important
- * distortions in terms of bandwidth distribution, on devices with
- * internal queueing. The reason is that I/O requests dispatched
- * during the service slot of a queue may be served after that service
- * slot is finished, and may have a total processing time loosely
- * correlated with the duration of the service slot. This is
- * especially true for short service slots.
- */
-static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-				 unsigned long time_ms)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-	int tot_serv_to_charge = entity->service;
-	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
-
-	if (time_ms > 0 && time_ms < timeout_ms)
-		tot_serv_to_charge =
-			(bfqd->bfq_max_budget * time_ms) / timeout_ms;
-
-	if (tot_serv_to_charge < entity->service)
-		tot_serv_to_charge = entity->service;
-
-	/* Increase budget to avoid inconsistencies */
-	if (tot_serv_to_charge > entity->budget)
-		entity->budget = tot_serv_to_charge;
-
-	bfq_bfqq_served(bfqq,
-			max_t(int, 0, tot_serv_to_charge - entity->service));
-}
-
-static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
-					struct bfq_service_tree *st,
-					bool backshifted)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	st = __bfq_entity_update_weight_prio(st, entity);
-	bfq_calc_finish(entity, entity->budget);
-
-	/*
-	 * If some queues enjoy backshifting for a while, then their
-	 * (virtual) finish timestamps may happen to become lower and
-	 * lower than the system virtual time.	In particular, if
-	 * these queues often happen to be idle for short time
-	 * periods, and during such time periods other queues with
-	 * higher timestamps happen to be busy, then the backshifted
-	 * timestamps of the former queues can become much lower than
-	 * the system virtual time. In fact, to serve the queues with
-	 * higher timestamps while the ones with lower timestamps are
-	 * idle, the system virtual time may be pushed-up to much
-	 * higher values than the finish timestamps of the idle
-	 * queues. As a consequence, the finish timestamps of all new
-	 * or newly activated queues may end up being much larger than
-	 * those of lucky queues with backshifted timestamps. The
-	 * latter queues may then monopolize the device for a lot of
-	 * time. This would simply break service guarantees.
-	 *
-	 * To reduce this problem, push up a little bit the
-	 * backshifted timestamps of the queue associated with this
-	 * entity (only a queue can happen to have the backshifted
-	 * flag set): just enough to let the finish timestamp of the
-	 * queue be equal to the current value of the system virtual
-	 * time. This may introduce a little unfairness among queues
-	 * with backshifted timestamps, but it does not break
-	 * worst-case fairness guarantees.
-	 *
-	 * As a special case, if bfqq is weight-raised, push up
-	 * timestamps much less, to keep very low the probability that
-	 * this push up causes the backshifted finish timestamps of
-	 * weight-raised queues to become higher than the backshifted
-	 * finish timestamps of non weight-raised queues.
-	 */
-	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
-		unsigned long delta = st->vtime - entity->finish;
-
-		if (bfqq)
-			delta /= bfqq->wr_coeff;
-
-		entity->start += delta;
-		entity->finish += delta;
-	}
-
-	bfq_active_insert(st, entity);
-}
-
-/**
- * __bfq_activate_entity - handle activation of entity.
- * @entity: the entity being activated.
- * @non_blocking_wait_rq: true if entity was waiting for a request
- *
- * Called for a 'true' activation, i.e., if entity is not active and
- * one of its children receives a new request.
- *
- * Basically, this function updates the timestamps of entity and
- * inserts entity into its active tree, ater possible extracting it
- * from its idle tree.
- */
-static void __bfq_activate_entity(struct bfq_entity *entity,
-				  bool non_blocking_wait_rq)
-{
-	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-	bool backshifted = false;
-	unsigned long long min_vstart;
-
-	/* See comments on bfq_fqq_update_budg_for_activation */
-	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
-		backshifted = true;
-		min_vstart = entity->finish;
-	} else
-		min_vstart = st->vtime;
-
-	if (entity->tree == &st->idle) {
-		/*
-		 * Must be on the idle tree, bfq_idle_extract() will
-		 * check for that.
-		 */
-		bfq_idle_extract(st, entity);
-		entity->start = bfq_gt(min_vstart, entity->finish) ?
-			min_vstart : entity->finish;
-	} else {
-		/*
-		 * The finish time of the entity may be invalid, and
-		 * it is in the past for sure, otherwise the queue
-		 * would have been on the idle tree.
-		 */
-		entity->start = min_vstart;
-		st->wsum += entity->weight;
-		/*
-		 * entity is about to be inserted into a service tree,
-		 * and then set in service: get a reference to make
-		 * sure entity does not disappear until it is no
-		 * longer in service or scheduled for service.
-		 */
-		bfq_get_entity(entity);
-
-		entity->on_st = true;
-	}
-
-	bfq_update_fin_time_enqueue(entity, st, backshifted);
-}
-
-/**
- * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
- * @entity: the entity being requeued or repositioned.
- *
- * Requeueing is needed if this entity stops being served, which
- * happens if a leaf descendant entity has expired. On the other hand,
- * repositioning is needed if the next_inservice_entity for the child
- * entity has changed. See the comments inside the function for
- * details.
- *
- * Basically, this function: 1) removes entity from its active tree if
- * present there, 2) updates the timestamps of entity and 3) inserts
- * entity back into its active tree (in the new, right position for
- * the new values of the timestamps).
- */
-static void __bfq_requeue_entity(struct bfq_entity *entity)
-{
-	struct bfq_sched_data *sd = entity->sched_data;
-	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-
-	if (entity == sd->in_service_entity) {
-		/*
-		 * We are requeueing the current in-service entity,
-		 * which may have to be done for one of the following
-		 * reasons:
-		 * - entity represents the in-service queue, and the
-		 *   in-service queue is being requeued after an
-		 *   expiration;
-		 * - entity represents a group, and its budget has
-		 *   changed because one of its child entities has
-		 *   just been either activated or requeued for some
-		 *   reason; the timestamps of the entity need then to
-		 *   be updated, and the entity needs to be enqueued
-		 *   or repositioned accordingly.
-		 *
-		 * In particular, before requeueing, the start time of
-		 * the entity must be moved forward to account for the
-		 * service that the entity has received while in
-		 * service. This is done by the next instructions. The
-		 * finish time will then be updated according to this
-		 * new value of the start time, and to the budget of
-		 * the entity.
-		 */
-		bfq_calc_finish(entity, entity->service);
-		entity->start = entity->finish;
-		/*
-		 * In addition, if the entity had more than one child
-		 * when set in service, then was not extracted from
-		 * the active tree. This implies that the position of
-		 * the entity in the active tree may need to be
-		 * changed now, because we have just updated the start
-		 * time of the entity, and we will update its finish
-		 * time in a moment (the requeueing is then, more
-		 * precisely, a repositioning in this case). To
-		 * implement this repositioning, we: 1) dequeue the
-		 * entity here, 2) update the finish time and
-		 * requeue the entity according to the new
-		 * timestamps below.
-		 */
-		if (entity->tree)
-			bfq_active_extract(st, entity);
-	} else { /* The entity is already active, and not in service */
-		/*
-		 * In this case, this function gets called only if the
-		 * next_in_service entity below this entity has
-		 * changed, and this change has caused the budget of
-		 * this entity to change, which, finally implies that
-		 * the finish time of this entity must be
-		 * updated. Such an update may cause the scheduling,
-		 * i.e., the position in the active tree, of this
-		 * entity to change. We handle this change by: 1)
-		 * dequeueing the entity here, 2) updating the finish
-		 * time and requeueing the entity according to the new
-		 * timestamps below. This is the same approach as the
-		 * non-extracted-entity sub-case above.
-		 */
-		bfq_active_extract(st, entity);
-	}
-
-	bfq_update_fin_time_enqueue(entity, st, false);
-}
-
-static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
-					  struct bfq_sched_data *sd,
-					  bool non_blocking_wait_rq)
-{
-	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-
-	if (sd->in_service_entity == entity || entity->tree == &st->active)
-		 /*
-		  * in service or already queued on the active tree,
-		  * requeue or reposition
-		  */
-		__bfq_requeue_entity(entity);
-	else
-		/*
-		 * Not in service and not queued on its active tree:
-		 * the activity is idle and this is a true activation.
-		 */
-		__bfq_activate_entity(entity, non_blocking_wait_rq);
-}
-
-
-/**
- * bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
- *			 and activate, requeue or reposition all ancestors
- *			 for which such an update becomes necessary.
- * @entity: the entity to activate.
- * @non_blocking_wait_rq: true if this entity was waiting for a request
- * @requeue: true if this is a requeue, which implies that bfqq is
- *	     being expired; thus ALL its ancestors stop being served and must
- *	     therefore be requeued
- */
-static void bfq_activate_requeue_entity(struct bfq_entity *entity,
-					bool non_blocking_wait_rq,
-					bool requeue)
-{
-	struct bfq_sched_data *sd;
-
-	for_each_entity(entity) {
-		sd = entity->sched_data;
-		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
-
-		if (!bfq_update_next_in_service(sd, entity) && !requeue)
-			break;
-	}
-}
-
-/**
- * __bfq_deactivate_entity - deactivate an entity from its service tree.
- * @entity: the entity to deactivate.
- * @ins_into_idle_tree: if false, the entity will not be put into the
- *			idle tree.
- *
- * Deactivates an entity, independently from its previous state.  Must
- * be invoked only if entity is on a service tree. Extracts the entity
- * from that tree, and if necessary and allowed, puts it on the idle
- * tree.
- */
-static bool __bfq_deactivate_entity(struct bfq_entity *entity,
-				    bool ins_into_idle_tree)
-{
-	struct bfq_sched_data *sd = entity->sched_data;
-	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-	int is_in_service = entity == sd->in_service_entity;
-
-	if (!entity->on_st) /* entity never activated, or already inactive */
-		return false;
-
-	if (is_in_service)
-		bfq_calc_finish(entity, entity->service);
-
-	if (entity->tree == &st->active)
-		bfq_active_extract(st, entity);
-	else if (!is_in_service && entity->tree == &st->idle)
-		bfq_idle_extract(st, entity);
-
-	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
-		bfq_forget_entity(st, entity, is_in_service);
-	else
-		bfq_idle_insert(st, entity);
-
-	return true;
-}
-
-/**
- * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
- * @entity: the entity to deactivate.
- * @ins_into_idle_tree: true if the entity can be put on the idle tree
- */
-static void bfq_deactivate_entity(struct bfq_entity *entity,
-				  bool ins_into_idle_tree,
-				  bool expiration)
-{
-	struct bfq_sched_data *sd;
-	struct bfq_entity *parent = NULL;
-
-	for_each_entity_safe(entity, parent) {
-		sd = entity->sched_data;
-
-		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
-			/*
-			 * entity is not in any tree any more, so
-			 * this deactivation is a no-op, and there is
-			 * nothing to change for upper-level entities
-			 * (in case of expiration, this can never
-			 * happen).
-			 */
-			return;
-		}
-
-		if (sd->next_in_service == entity)
-			/*
-			 * entity was the next_in_service entity,
-			 * then, since entity has just been
-			 * deactivated, a new one must be found.
-			 */
-			bfq_update_next_in_service(sd, NULL);
-
-		if (sd->next_in_service)
-			/*
-			 * The parent entity is still backlogged,
-			 * because next_in_service is not NULL. So, no
-			 * further upwards deactivation must be
-			 * performed.  Yet, next_in_service has
-			 * changed.  Then the schedule does need to be
-			 * updated upwards.
-			 */
-			break;
-
-		/*
-		 * If we get here, then the parent is no more
-		 * backlogged and we need to propagate the
-		 * deactivation upwards. Thus let the loop go on.
-		 */
-
-		/*
-		 * Also let parent be queued into the idle tree on
-		 * deactivation, to preserve service guarantees, and
-		 * assuming that who invoked this function does not
-		 * need parent entities too to be removed completely.
-		 */
-		ins_into_idle_tree = true;
-	}
-
-	/*
-	 * If the deactivation loop is fully executed, then there are
-	 * no more entities to touch and next loop is not executed at
-	 * all. Otherwise, requeue remaining entities if they are
-	 * about to stop receiving service, or reposition them if this
-	 * is not the case.
-	 */
-	entity = parent;
-	for_each_entity(entity) {
-		/*
-		 * Invoke __bfq_requeue_entity on entity, even if
-		 * already active, to requeue/reposition it in the
-		 * active tree (because sd->next_in_service has
-		 * changed)
-		 */
-		__bfq_requeue_entity(entity);
-
-		sd = entity->sched_data;
-		if (!bfq_update_next_in_service(sd, entity) &&
-		    !expiration)
-			/*
-			 * next_in_service unchanged or not causing
-			 * any change in entity->parent->sd, and no
-			 * requeueing needed for expiration: stop
-			 * here.
-			 */
-			break;
-	}
-}
-
-/**
- * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
- *                       if needed, to have at least one entity eligible.
- * @st: the service tree to act upon.
- *
- * Assumes that st is not empty.
- */
-static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
-{
-	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
-
-	if (bfq_gt(root_entity->min_start, st->vtime))
-		return root_entity->min_start;
-
-	return st->vtime;
-}
-
-static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
-{
-	if (new_value > st->vtime) {
-		st->vtime = new_value;
-		bfq_forget_idle(st);
-	}
-}
-
-/**
- * bfq_first_active_entity - find the eligible entity with
- *                           the smallest finish time
- * @st: the service tree to select from.
- * @vtime: the system virtual to use as a reference for eligibility
- *
- * This function searches the first schedulable entity, starting from the
- * root of the tree and going on the left every time on this side there is
- * a subtree with at least one eligible (start >= vtime) entity. The path on
- * the right is followed only if a) the left subtree contains no eligible
- * entities and b) no eligible entity has been found yet.
- */
-static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
-						  u64 vtime)
-{
-	struct bfq_entity *entry, *first = NULL;
-	struct rb_node *node = st->active.rb_node;
-
-	while (node) {
-		entry = rb_entry(node, struct bfq_entity, rb_node);
-left:
-		if (!bfq_gt(entry->start, vtime))
-			first = entry;
-
-		if (node->rb_left) {
-			entry = rb_entry(node->rb_left,
-					 struct bfq_entity, rb_node);
-			if (!bfq_gt(entry->min_start, vtime)) {
-				node = node->rb_left;
-				goto left;
-			}
-		}
-		if (first)
-			break;
-		node = node->rb_right;
-	}
-
-	return first;
-}
-
-/**
- * __bfq_lookup_next_entity - return the first eligible entity in @st.
- * @st: the service tree.
- *
- * If there is no in-service entity for the sched_data st belongs to,
- * then return the entity that will be set in service if:
- * 1) the parent entity this st belongs to is set in service;
- * 2) no entity belonging to such parent entity undergoes a state change
- * that would influence the timestamps of the entity (e.g., becomes idle,
- * becomes backlogged, changes its budget, ...).
- *
- * In this first case, update the virtual time in @st too (see the
- * comments on this update inside the function).
- *
- * In constrast, if there is an in-service entity, then return the
- * entity that would be set in service if not only the above
- * conditions, but also the next one held true: the currently
- * in-service entity, on expiration,
- * 1) gets a finish time equal to the current one, or
- * 2) is not eligible any more, or
- * 3) is idle.
- */
-static struct bfq_entity *
-__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
-{
-	struct bfq_entity *entity;
-	u64 new_vtime;
-
-	if (RB_EMPTY_ROOT(&st->active))
-		return NULL;
-
-	/*
-	 * Get the value of the system virtual time for which at
-	 * least one entity is eligible.
-	 */
-	new_vtime = bfq_calc_vtime_jump(st);
-
-	/*
-	 * If there is no in-service entity for the sched_data this
-	 * active tree belongs to, then push the system virtual time
-	 * up to the value that guarantees that at least one entity is
-	 * eligible. If, instead, there is an in-service entity, then
-	 * do not make any such update, because there is already an
-	 * eligible entity, namely the in-service one (even if the
-	 * entity is not on st, because it was extracted when set in
-	 * service).
-	 */
-	if (!in_service)
-		bfq_update_vtime(st, new_vtime);
-
-	entity = bfq_first_active_entity(st, new_vtime);
-
-	return entity;
-}
-
-/**
- * bfq_lookup_next_entity - return the first eligible entity in @sd.
- * @sd: the sched_data.
- *
- * This function is invoked when there has been a change in the trees
- * for sd, and we need know what is the new next entity after this
- * change.
- */
-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
-{
-	struct bfq_service_tree *st = sd->service_tree;
-	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
-	struct bfq_entity *entity = NULL;
-	int class_idx = 0;
-
-	/*
-	 * Choose from idle class, if needed to guarantee a minimum
-	 * bandwidth to this class (and if there is some active entity
-	 * in idle class). This should also mitigate
-	 * priority-inversion problems in case a low priority task is
-	 * holding file system resources.
-	 */
-	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
-				   BFQ_CL_IDLE_TIMEOUT)) {
-		if (!RB_EMPTY_ROOT(&idle_class_st->active))
-			class_idx = BFQ_IOPRIO_CLASSES - 1;
-		/* About to be served if backlogged, or not yet backlogged */
-		sd->bfq_class_idle_last_service = jiffies;
-	}
-
-	/*
-	 * Find the next entity to serve for the highest-priority
-	 * class, unless the idle class needs to be served.
-	 */
-	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
-		entity = __bfq_lookup_next_entity(st + class_idx,
-						  sd->in_service_entity);
-
-		if (entity)
-			break;
-	}
-
-	if (!entity)
-		return NULL;
-
-	return entity;
-}
-
-static bool next_queue_may_preempt(struct bfq_data *bfqd)
-{
-	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
-
-	return sd->next_in_service != sd->in_service_entity;
-}
-
-/*
- * Get next queue for service.
- */
-static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
-{
-	struct bfq_entity *entity = NULL;
-	struct bfq_sched_data *sd;
-	struct bfq_queue *bfqq;
-
-	if (bfqd->busy_queues == 0)
-		return NULL;
-
-	/*
-	 * Traverse the path from the root to the leaf entity to
-	 * serve. Set in service all the entities visited along the
-	 * way.
-	 */
-	sd = &bfqd->root_group->sched_data;
-	for (; sd ; sd = entity->my_sched_data) {
-		/*
-		 * WARNING. We are about to set the in-service entity
-		 * to sd->next_in_service, i.e., to the (cached) value
-		 * returned by bfq_lookup_next_entity(sd) the last
-		 * time it was invoked, i.e., the last time when the
-		 * service order in sd changed as a consequence of the
-		 * activation or deactivation of an entity. In this
-		 * respect, if we execute bfq_lookup_next_entity(sd)
-		 * in this very moment, it may, although with low
-		 * probability, yield a different entity than that
-		 * pointed to by sd->next_in_service. This rare event
-		 * happens in case there was no CLASS_IDLE entity to
-		 * serve for sd when bfq_lookup_next_entity(sd) was
-		 * invoked for the last time, while there is now one
-		 * such entity.
-		 *
-		 * If the above event happens, then the scheduling of
-		 * such entity in CLASS_IDLE is postponed until the
-		 * service of the sd->next_in_service entity
-		 * finishes. In fact, when the latter is expired,
-		 * bfq_lookup_next_entity(sd) gets called again,
-		 * exactly to update sd->next_in_service.
-		 */
-
-		/* Make next_in_service entity become in_service_entity */
-		entity = sd->next_in_service;
-		sd->in_service_entity = entity;
-
-		/*
-		 * Reset the accumulator of the amount of service that
-		 * the entity is about to receive.
-		 */
-		entity->service = 0;
-
-		/*
-		 * If entity is no longer a candidate for next
-		 * service, then we extract it from its active tree,
-		 * for the following reason. To further boost the
-		 * throughput in some special case, BFQ needs to know
-		 * which is the next candidate entity to serve, while
-		 * there is already an entity in service. In this
-		 * respect, to make it easy to compute/update the next
-		 * candidate entity to serve after the current
-		 * candidate has been set in service, there is a case
-		 * where it is necessary to extract the current
-		 * candidate from its service tree. Such a case is
-		 * when the entity just set in service cannot be also
-		 * a candidate for next service. Details about when
-		 * this conditions holds are reported in the comments
-		 * on the function bfq_no_longer_next_in_service()
-		 * invoked below.
-		 */
-		if (bfq_no_longer_next_in_service(entity))
-			bfq_active_extract(bfq_entity_service_tree(entity),
-					   entity);
-
-		/*
-		 * For the same reason why we may have just extracted
-		 * entity from its active tree, we may need to update
-		 * next_in_service for the sched_data of entity too,
-		 * regardless of whether entity has been extracted.
-		 * In fact, even if entity has not been extracted, a
-		 * descendant entity may get extracted. Such an event
-		 * would cause a change in next_in_service for the
-		 * level of the descendant entity, and thus possibly
-		 * back to upper levels.
-		 *
-		 * We cannot perform the resulting needed update
-		 * before the end of this loop, because, to know which
-		 * is the correct next-to-serve candidate entity for
-		 * each level, we need first to find the leaf entity
-		 * to set in service. In fact, only after we know
-		 * which is the next-to-serve leaf entity, we can
-		 * discover whether the parent entity of the leaf
-		 * entity becomes the next-to-serve, and so on.
-		 */
-
-	}
-
-	bfqq = bfq_entity_to_bfqq(entity);
-
-	/*
-	 * We can finally update all next-to-serve entities along the
-	 * path from the leaf entity just set in service to the root.
-	 */
-	for_each_entity(entity) {
-		struct bfq_sched_data *sd = entity->sched_data;
-
-		if (!bfq_update_next_in_service(sd, NULL))
-			break;
-	}
-
-	return bfqq;
-}
-
-static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
-{
-	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
-	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
-	struct bfq_entity *entity = in_serv_entity;
-
-	bfq_clear_bfqq_wait_request(in_serv_bfqq);
-	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
-	bfqd->in_service_queue = NULL;
-
-	/*
-	 * When this function is called, all in-service entities have
-	 * been properly deactivated or requeued, so we can safely
-	 * execute the final step: reset in_service_entity along the
-	 * path from entity to the root.
-	 */
-	for_each_entity(entity)
-		entity->sched_data->in_service_entity = NULL;
-
-	/*
-	 * in_serv_entity is no longer in service, so, if it is in no
-	 * service tree either, then release the service reference to
-	 * the queue it represents (taken with bfq_get_entity).
-	 */
-	if (!in_serv_entity->on_st)
-		bfq_put_queue(in_serv_bfqq);
-}
-
-static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-				bool ins_into_idle_tree, bool expiration)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-
-	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
-}
-
-static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-
-	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
-				    false);
-	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
-}
-
-static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-
-	bfq_activate_requeue_entity(entity, false,
-				    bfqq == bfqd->in_service_queue);
-}
-
-static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
-
-/*
- * Called when the bfqq no longer has requests pending, remove it from
- * the service tree. As a special case, it can be invoked during an
- * expiration.
- */
-static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			      bool expiration)
-{
-	bfq_log_bfqq(bfqd, bfqq, "del from busy");
-
-	bfq_clear_bfqq_busy(bfqq);
-
-	bfqd->busy_queues--;
-
-	if (!bfqq->dispatched)
-		bfq_weights_tree_remove(bfqd, &bfqq->entity,
-					&bfqd->queue_weights_tree);
-
-	if (bfqq->wr_coeff > 1)
-		bfqd->wr_busy_queues--;
-
-	bfqg_stats_update_dequeue(bfqq_group(bfqq));
-
-	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
-}
-
-/*
- * Called when an inactive queue receives a new request.
- */
-static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-{
-	bfq_log_bfqq(bfqd, bfqq, "add to busy");
-
-	bfq_activate_bfqq(bfqd, bfqq);
-
-	bfq_mark_bfqq_busy(bfqq);
-	bfqd->busy_queues++;
-
-	if (!bfqq->dispatched)
-		if (bfqq->wr_coeff == 1)
-			bfq_weights_tree_add(bfqd, &bfqq->entity,
-					     &bfqd->queue_weights_tree);
-
-	if (bfqq->wr_coeff > 1)
-		bfqd->wr_busy_queues++;
-}
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-
-/* bfqg stats flags */
-enum bfqg_stats_flags {
-	BFQG_stats_waiting = 0,
-	BFQG_stats_idling,
-	BFQG_stats_empty,
-};
-
-#define BFQG_FLAG_FNS(name)						\
-static void bfqg_stats_mark_##name(struct bfqg_stats *stats)	\
-{									\
-	stats->flags |= (1 << BFQG_stats_##name);			\
-}									\
-static void bfqg_stats_clear_##name(struct bfqg_stats *stats)	\
-{									\
-	stats->flags &= ~(1 << BFQG_stats_##name);			\
-}									\
-static int bfqg_stats_##name(struct bfqg_stats *stats)		\
-{									\
-	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
-}									\
-
-BFQG_FLAG_FNS(waiting)
-BFQG_FLAG_FNS(idling)
-BFQG_FLAG_FNS(empty)
-#undef BFQG_FLAG_FNS
-
-/* This should be called with the queue_lock held. */
-static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
-{
-	unsigned long long now;
-
-	if (!bfqg_stats_waiting(stats))
-		return;
-
-	now = sched_clock();
-	if (time_after64(now, stats->start_group_wait_time))
-		blkg_stat_add(&stats->group_wait_time,
-			      now - stats->start_group_wait_time);
-	bfqg_stats_clear_waiting(stats);
-}
-
-/* This should be called with the queue_lock held. */
-static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
-						 struct bfq_group *curr_bfqg)
-{
-	struct bfqg_stats *stats = &bfqg->stats;
-
-	if (bfqg_stats_waiting(stats))
-		return;
-	if (bfqg == curr_bfqg)
-		return;
-	stats->start_group_wait_time = sched_clock();
-	bfqg_stats_mark_waiting(stats);
-}
-
-/* This should be called with the queue_lock held. */
-static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
-{
-	unsigned long long now;
-
-	if (!bfqg_stats_empty(stats))
-		return;
-
-	now = sched_clock();
-	if (time_after64(now, stats->start_empty_time))
-		blkg_stat_add(&stats->empty_time,
-			      now - stats->start_empty_time);
-	bfqg_stats_clear_empty(stats);
-}
-
-static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
-{
-	blkg_stat_add(&bfqg->stats.dequeue, 1);
-}
-
-static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
-{
-	struct bfqg_stats *stats = &bfqg->stats;
-
-	if (blkg_rwstat_total(&stats->queued))
-		return;
-
-	/*
-	 * group is already marked empty. This can happen if bfqq got new
-	 * request in parent group and moved to this group while being added
-	 * to service tree. Just ignore the event and move on.
-	 */
-	if (bfqg_stats_empty(stats))
-		return;
-
-	stats->start_empty_time = sched_clock();
-	bfqg_stats_mark_empty(stats);
-}
-
-static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
-{
-	struct bfqg_stats *stats = &bfqg->stats;
-
-	if (bfqg_stats_idling(stats)) {
-		unsigned long long now = sched_clock();
-
-		if (time_after64(now, stats->start_idle_time))
-			blkg_stat_add(&stats->idle_time,
-				      now - stats->start_idle_time);
-		bfqg_stats_clear_idling(stats);
-	}
-}
-
-static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
-{
-	struct bfqg_stats *stats = &bfqg->stats;
-
-	stats->start_idle_time = sched_clock();
-	bfqg_stats_mark_idling(stats);
-}
-
-static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
-{
-	struct bfqg_stats *stats = &bfqg->stats;
-
-	blkg_stat_add(&stats->avg_queue_size_sum,
-		      blkg_rwstat_total(&stats->queued));
-	blkg_stat_add(&stats->avg_queue_size_samples, 1);
-	bfqg_stats_update_group_wait_time(stats);
-}
-
-/*
- * blk-cgroup policy-related handlers
- * The following functions help in converting between blk-cgroup
- * internal structures and BFQ-specific structures.
- */
-
-static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
-{
-	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
-}
-
-static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
-{
-	return pd_to_blkg(&bfqg->pd);
-}
-
-static struct blkcg_policy blkcg_policy_bfq;
-
-static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
-{
-	return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
-}
-
-/*
- * bfq_group handlers
- * The following functions help in navigating the bfq_group hierarchy
- * by allowing to find the parent of a bfq_group or the bfq_group
- * associated to a bfq_queue.
- */
-
-static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
-{
-	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
-
-	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
-}
-
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
-{
-	struct bfq_entity *group_entity = bfqq->entity.parent;
-
-	return group_entity ? container_of(group_entity, struct bfq_group,
-					   entity) :
-			      bfqq->bfqd->root_group;
-}
-
-/*
- * The following two functions handle get and put of a bfq_group by
- * wrapping the related blk-cgroup hooks.
- */
-
-static void bfqg_get(struct bfq_group *bfqg)
-{
-	return blkg_get(bfqg_to_blkg(bfqg));
-}
-
-static void bfqg_put(struct bfq_group *bfqg)
-{
-	return blkg_put(bfqg_to_blkg(bfqg));
-}
-
-static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
-				     struct bfq_queue *bfqq,
-				     unsigned int op)
-{
-	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
-	bfqg_stats_end_empty_time(&bfqg->stats);
-	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
-		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
-}
-
-static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
-{
-	blkg_rwstat_add(&bfqg->stats.queued, op, -1);
-}
-
-static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
-{
-	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
-}
-
-static void bfqg_stats_update_completion(struct bfq_group *bfqg,
-			uint64_t start_time, uint64_t io_start_time,
-			unsigned int op)
-{
-	struct bfqg_stats *stats = &bfqg->stats;
-	unsigned long long now = sched_clock();
-
-	if (time_after64(now, io_start_time))
-		blkg_rwstat_add(&stats->service_time, op,
-				now - io_start_time);
-	if (time_after64(io_start_time, start_time))
-		blkg_rwstat_add(&stats->wait_time, op,
-				io_start_time - start_time);
-}
-
-/* @stats = 0 */
-static void bfqg_stats_reset(struct bfqg_stats *stats)
-{
-	/* queued stats shouldn't be cleared */
-	blkg_rwstat_reset(&stats->merged);
-	blkg_rwstat_reset(&stats->service_time);
-	blkg_rwstat_reset(&stats->wait_time);
-	blkg_stat_reset(&stats->time);
-	blkg_stat_reset(&stats->avg_queue_size_sum);
-	blkg_stat_reset(&stats->avg_queue_size_samples);
-	blkg_stat_reset(&stats->dequeue);
-	blkg_stat_reset(&stats->group_wait_time);
-	blkg_stat_reset(&stats->idle_time);
-	blkg_stat_reset(&stats->empty_time);
-}
-
-/* @to += @from */
-static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
-{
-	if (!to || !from)
-		return;
-
-	/* queued stats shouldn't be cleared */
-	blkg_rwstat_add_aux(&to->merged, &from->merged);
-	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
-	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
-	blkg_stat_add_aux(&from->time, &from->time);
-	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
-	blkg_stat_add_aux(&to->avg_queue_size_samples,
-			  &from->avg_queue_size_samples);
-	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
-	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
-	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
-	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
-}
-
-/*
- * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
- * recursive stats can still account for the amount used by this bfqg after
- * it's gone.
- */
-static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
-{
-	struct bfq_group *parent;
-
-	if (!bfqg) /* root_group */
-		return;
-
-	parent = bfqg_parent(bfqg);
-
-	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
-
-	if (unlikely(!parent))
-		return;
-
-	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
-	bfqg_stats_reset(&bfqg->stats);
-}
-
-static void bfq_init_entity(struct bfq_entity *entity,
-			    struct bfq_group *bfqg)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	entity->weight = entity->new_weight;
-	entity->orig_weight = entity->new_weight;
-	if (bfqq) {
-		bfqq->ioprio = bfqq->new_ioprio;
-		bfqq->ioprio_class = bfqq->new_ioprio_class;
-		bfqg_get(bfqg);
-	}
-	entity->parent = bfqg->my_entity; /* NULL for root group */
-	entity->sched_data = &bfqg->sched_data;
-}
-
-static void bfqg_stats_exit(struct bfqg_stats *stats)
-{
-	blkg_rwstat_exit(&stats->merged);
-	blkg_rwstat_exit(&stats->service_time);
-	blkg_rwstat_exit(&stats->wait_time);
-	blkg_rwstat_exit(&stats->queued);
-	blkg_stat_exit(&stats->time);
-	blkg_stat_exit(&stats->avg_queue_size_sum);
-	blkg_stat_exit(&stats->avg_queue_size_samples);
-	blkg_stat_exit(&stats->dequeue);
-	blkg_stat_exit(&stats->group_wait_time);
-	blkg_stat_exit(&stats->idle_time);
-	blkg_stat_exit(&stats->empty_time);
-}
-
-static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
-{
-	if (blkg_rwstat_init(&stats->merged, gfp) ||
-	    blkg_rwstat_init(&stats->service_time, gfp) ||
-	    blkg_rwstat_init(&stats->wait_time, gfp) ||
-	    blkg_rwstat_init(&stats->queued, gfp) ||
-	    blkg_stat_init(&stats->time, gfp) ||
-	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
-	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
-	    blkg_stat_init(&stats->dequeue, gfp) ||
-	    blkg_stat_init(&stats->group_wait_time, gfp) ||
-	    blkg_stat_init(&stats->idle_time, gfp) ||
-	    blkg_stat_init(&stats->empty_time, gfp)) {
-		bfqg_stats_exit(stats);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
-{
-	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
-}
-
-static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
-{
-	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
-}
-
-static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
-{
-	struct bfq_group_data *bgd;
-
-	bgd = kzalloc(sizeof(*bgd), gfp);
-	if (!bgd)
-		return NULL;
-	return &bgd->pd;
-}
-
-static void bfq_cpd_init(struct blkcg_policy_data *cpd)
-{
-	struct bfq_group_data *d = cpd_to_bfqgd(cpd);
-
-	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
-		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
-}
-
-static void bfq_cpd_free(struct blkcg_policy_data *cpd)
-{
-	kfree(cpd_to_bfqgd(cpd));
-}
-
-static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
-{
-	struct bfq_group *bfqg;
-
-	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
-	if (!bfqg)
-		return NULL;
-
-	if (bfqg_stats_init(&bfqg->stats, gfp)) {
-		kfree(bfqg);
-		return NULL;
-	}
-
-	return &bfqg->pd;
-}
-
-static void bfq_pd_init(struct blkg_policy_data *pd)
-{
-	struct blkcg_gq *blkg = pd_to_blkg(pd);
-	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
-	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
-	struct bfq_entity *entity = &bfqg->entity;
-	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
-
-	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
-	entity->my_sched_data = &bfqg->sched_data;
-	bfqg->my_entity = entity; /*
-				   * the root_group's will be set to NULL
-				   * in bfq_init_queue()
-				   */
-	bfqg->bfqd = bfqd;
-	bfqg->active_entities = 0;
-	bfqg->rq_pos_tree = RB_ROOT;
-}
-
-static void bfq_pd_free(struct blkg_policy_data *pd)
-{
-	struct bfq_group *bfqg = pd_to_bfqg(pd);
-
-	bfqg_stats_exit(&bfqg->stats);
-	return kfree(bfqg);
-}
-
-static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
-{
-	struct bfq_group *bfqg = pd_to_bfqg(pd);
-
-	bfqg_stats_reset(&bfqg->stats);
-}
-
-static void bfq_group_set_parent(struct bfq_group *bfqg,
-					struct bfq_group *parent)
-{
-	struct bfq_entity *entity;
-
-	entity = &bfqg->entity;
-	entity->parent = parent->my_entity;
-	entity->sched_data = &parent->sched_data;
-}
-
-static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
-					 struct blkcg *blkcg)
-{
-	struct blkcg_gq *blkg;
-
-	blkg = blkg_lookup(blkcg, bfqd->queue);
-	if (likely(blkg))
-		return blkg_to_bfqg(blkg);
-	return NULL;
-}
-
-static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
-					    struct blkcg *blkcg)
-{
-	struct bfq_group *bfqg, *parent;
-	struct bfq_entity *entity;
-
-	bfqg = bfq_lookup_bfqg(bfqd, blkcg);
-
-	if (unlikely(!bfqg))
-		return NULL;
-
-	/*
-	 * Update chain of bfq_groups as we might be handling a leaf group
-	 * which, along with some of its relatives, has not been hooked yet
-	 * to the private hierarchy of BFQ.
-	 */
-	entity = &bfqg->entity;
-	for_each_entity(entity) {
-		bfqg = container_of(entity, struct bfq_group, entity);
-		if (bfqg != bfqd->root_group) {
-			parent = bfqg_parent(bfqg);
-			if (!parent)
-				parent = bfqd->root_group;
-			bfq_group_set_parent(bfqg, parent);
-		}
-	}
-
-	return bfqg;
-}
-
-static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
-				  struct bfq_queue *bfqq);
-static void bfq_bfqq_expire(struct bfq_data *bfqd,
-			    struct bfq_queue *bfqq,
-			    bool compensate,
-			    enum bfqq_expiration reason);
-
-/**
- * bfq_bfqq_move - migrate @bfqq to @bfqg.
- * @bfqd: queue descriptor.
- * @bfqq: the queue to move.
- * @bfqg: the group to move to.
- *
- * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
- * it on the new one.  Avoid putting the entity on the old group idle tree.
- *
- * Must be called under the queue lock; the cgroup owning @bfqg must
- * not disappear (by now this just means that we are called under
- * rcu_read_lock()).
- */
-static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			  struct bfq_group *bfqg)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-
-	/* If bfqq is empty, then bfq_bfqq_expire also invokes
-	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
-	 * from data structures related to current group. Otherwise we
-	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
-	 * we do below.
-	 */
-	if (bfqq == bfqd->in_service_queue)
-		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
-				false, BFQQE_PREEMPTED);
-
-	if (bfq_bfqq_busy(bfqq))
-		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
-	else if (entity->on_st)
-		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
-	bfqg_put(bfqq_group(bfqq));
-
-	/*
-	 * Here we use a reference to bfqg.  We don't need a refcounter
-	 * as the cgroup reference will not be dropped, so that its
-	 * destroy() callback will not be invoked.
-	 */
-	entity->parent = bfqg->my_entity;
-	entity->sched_data = &bfqg->sched_data;
-	bfqg_get(bfqg);
-
-	if (bfq_bfqq_busy(bfqq)) {
-		bfq_pos_tree_add_move(bfqd, bfqq);
-		bfq_activate_bfqq(bfqd, bfqq);
-	}
-
-	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
-		bfq_schedule_dispatch(bfqd);
-}
-
-/**
- * __bfq_bic_change_cgroup - move @bic to @cgroup.
- * @bfqd: the queue descriptor.
- * @bic: the bic to move.
- * @blkcg: the blk-cgroup to move to.
- *
- * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
- * has to make sure that the reference to cgroup is valid across the call.
- *
- * NOTE: an alternative approach might have been to store the current
- * cgroup in bfqq and getting a reference to it, reducing the lookup
- * time here, at the price of slightly more complex code.
- */
-static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
-						struct bfq_io_cq *bic,
-						struct blkcg *blkcg)
-{
-	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
-	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
-	struct bfq_group *bfqg;
-	struct bfq_entity *entity;
-
-	bfqg = bfq_find_set_group(bfqd, blkcg);
-
-	if (unlikely(!bfqg))
-		bfqg = bfqd->root_group;
-
-	if (async_bfqq) {
-		entity = &async_bfqq->entity;
-
-		if (entity->sched_data != &bfqg->sched_data) {
-			bic_set_bfqq(bic, NULL, 0);
-			bfq_log_bfqq(bfqd, async_bfqq,
-				     "bic_change_group: %p %d",
-				     async_bfqq, async_bfqq->ref);
-			bfq_put_queue(async_bfqq);
-		}
-	}
-
-	if (sync_bfqq) {
-		entity = &sync_bfqq->entity;
-		if (entity->sched_data != &bfqg->sched_data)
-			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
-	}
-
-	return bfqg;
-}
-
-static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
-{
-	struct bfq_data *bfqd = bic_to_bfqd(bic);
-	struct bfq_group *bfqg = NULL;
-	uint64_t serial_nr;
-
-	rcu_read_lock();
-	serial_nr = bio_blkcg(bio)->css.serial_nr;
-
-	/*
-	 * Check whether blkcg has changed.  The condition may trigger
-	 * spuriously on a newly created cic but there's no harm.
-	 */
-	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
-		goto out;
-
-	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
-	bic->blkcg_serial_nr = serial_nr;
-out:
-	rcu_read_unlock();
-}
-
-/**
- * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
- * @st: the service tree being flushed.
- */
-static void bfq_flush_idle_tree(struct bfq_service_tree *st)
-{
-	struct bfq_entity *entity = st->first_idle;
-
-	for (; entity ; entity = st->first_idle)
-		__bfq_deactivate_entity(entity, false);
-}
-
-/**
- * bfq_reparent_leaf_entity - move leaf entity to the root_group.
- * @bfqd: the device data structure with the root group.
- * @entity: the entity to move.
- */
-static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
-				     struct bfq_entity *entity)
-{
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
-}
-
-/**
- * bfq_reparent_active_entities - move to the root group all active
- *                                entities.
- * @bfqd: the device data structure with the root group.
- * @bfqg: the group to move from.
- * @st: the service tree with the entities.
- *
- * Needs queue_lock to be taken and reference to be valid over the call.
- */
-static void bfq_reparent_active_entities(struct bfq_data *bfqd,
-					 struct bfq_group *bfqg,
-					 struct bfq_service_tree *st)
-{
-	struct rb_root *active = &st->active;
-	struct bfq_entity *entity = NULL;
-
-	if (!RB_EMPTY_ROOT(&st->active))
-		entity = bfq_entity_of(rb_first(active));
-
-	for (; entity ; entity = bfq_entity_of(rb_first(active)))
-		bfq_reparent_leaf_entity(bfqd, entity);
-
-	if (bfqg->sched_data.in_service_entity)
-		bfq_reparent_leaf_entity(bfqd,
-			bfqg->sched_data.in_service_entity);
-}
-
-/**
- * bfq_pd_offline - deactivate the entity associated with @pd,
- *		    and reparent its children entities.
- * @pd: descriptor of the policy going offline.
- *
- * blkio already grabs the queue_lock for us, so no need to use
- * RCU-based magic
- */
-static void bfq_pd_offline(struct blkg_policy_data *pd)
-{
-	struct bfq_service_tree *st;
-	struct bfq_group *bfqg = pd_to_bfqg(pd);
-	struct bfq_data *bfqd = bfqg->bfqd;
-	struct bfq_entity *entity = bfqg->my_entity;
-	unsigned long flags;
-	int i;
-
-	if (!entity) /* root group */
-		return;
-
-	spin_lock_irqsave(&bfqd->lock, flags);
-	/*
-	 * Empty all service_trees belonging to this group before
-	 * deactivating the group itself.
-	 */
-	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
-		st = bfqg->sched_data.service_tree + i;
-
-		/*
-		 * The idle tree may still contain bfq_queues belonging
-		 * to exited task because they never migrated to a different
-		 * cgroup from the one being destroyed now.  No one else
-		 * can access them so it's safe to act without any lock.
-		 */
-		bfq_flush_idle_tree(st);
-
-		/*
-		 * It may happen that some queues are still active
-		 * (busy) upon group destruction (if the corresponding
-		 * processes have been forced to terminate). We move
-		 * all the leaf entities corresponding to these queues
-		 * to the root_group.
-		 * Also, it may happen that the group has an entity
-		 * in service, which is disconnected from the active
-		 * tree: it must be moved, too.
-		 * There is no need to put the sync queues, as the
-		 * scheduler has taken no reference.
-		 */
-		bfq_reparent_active_entities(bfqd, bfqg, st);
-	}
-
-	__bfq_deactivate_entity(entity, false);
-	bfq_put_async_queues(bfqd, bfqg);
-
-	spin_unlock_irqrestore(&bfqd->lock, flags);
-	/*
-	 * @blkg is going offline and will be ignored by
-	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
-	 * that they don't get lost.  If IOs complete after this point, the
-	 * stats for them will be lost.  Oh well...
-	 */
-	bfqg_stats_xfer_dead(bfqg);
-}
-
-static void bfq_end_wr_async(struct bfq_data *bfqd)
-{
-	struct blkcg_gq *blkg;
-
-	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
-		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
-
-		bfq_end_wr_async_queues(bfqd, bfqg);
-	}
-	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+#define BFQ_BFQQ_FNS(name)						\
+void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)			\
+{									\
+	__set_bit(BFQQF_##name, &(bfqq)->flags);			\
+}									\
+void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)			\
+{									\
+	__clear_bit(BFQQF_##name, &(bfqq)->flags);		\
+}									\
+int bfq_bfqq_##name(const struct bfq_queue *bfqq)			\
+{									\
+	return test_bit(BFQQF_##name, &(bfqq)->flags);		\
 }
 
-static int bfq_io_show_weight(struct seq_file *sf, void *v)
-{
-	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
-	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
-	unsigned int val = 0;
+BFQ_BFQQ_FNS(just_created);
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(softrt_update);
+#undef BFQ_BFQQ_FNS						\
 
-	if (bfqgd)
-		val = bfqgd->weight;
+/* Expiration time of sync (0) and async (1) requests, in ns. */
+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
 
-	seq_printf(sf, "%u\n", val);
+/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
+static const int bfq_back_max = 16 * 1024;
 
-	return 0;
-}
+/* Penalty of a backwards seek, in number of sectors. */
+static const int bfq_back_penalty = 2;
 
-static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
-				    struct cftype *cftype,
-				    u64 val)
-{
-	struct blkcg *blkcg = css_to_blkcg(css);
-	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
-	struct blkcg_gq *blkg;
-	int ret = -ERANGE;
+/* Idling period duration, in ns. */
+static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
 
-	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
-		return ret;
+/* Minimum number of assigned budgets for which stats are safe to compute. */
+static const int bfq_stats_min_budgets = 194;
 
-	ret = 0;
-	spin_lock_irq(&blkcg->lock);
-	bfqgd->weight = (unsigned short)val;
-	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
-		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = 16 * 1024;
 
-		if (!bfqg)
-			continue;
-		/*
-		 * Setting the prio_changed flag of the entity
-		 * to 1 with new_weight == weight would re-set
-		 * the value of the weight to its ioprio mapping.
-		 * Set the flag only if necessary.
-		 */
-		if ((unsigned short)val != bfqg->entity.new_weight) {
-			bfqg->entity.new_weight = (unsigned short)val;
-			/*
-			 * Make sure that the above new value has been
-			 * stored in bfqg->entity.new_weight before
-			 * setting the prio_changed flag. In fact,
-			 * this flag may be read asynchronously (in
-			 * critical sections protected by a different
-			 * lock than that held here), and finding this
-			 * flag set may cause the execution of the code
-			 * for updating parameters whose value may
-			 * depend also on bfqg->entity.new_weight (in
-			 * __bfq_entity_update_weight_prio).
-			 * This barrier makes sure that the new value
-			 * of bfqg->entity.new_weight is correctly
-			 * seen in that code.
-			 */
-			smp_wmb();
-			bfqg->entity.prio_changed = 1;
-		}
-	}
-	spin_unlock_irq(&blkcg->lock);
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below
+ */
+static const int bfq_async_charge_factor = 10;
 
-	return ret;
-}
+/* Default timeout values, in jiffies, approximating CFQ defaults. */
+const int bfq_timeout = HZ / 8;
 
-static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
-				 char *buf, size_t nbytes,
-				 loff_t off)
-{
-	u64 weight;
-	/* First unsigned long found in the file is used */
-	int ret = kstrtoull(strim(buf), 0, &weight);
+static struct kmem_cache *bfq_pool;
 
-	if (ret)
-		return ret;
+/* Below this threshold (in ns), we consider thinktime immediate. */
+#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)
 
-	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
-}
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD	4
+#define BFQ_HW_QUEUE_SAMPLES	32
 
-static int bfqg_print_stat(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
-			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
-	return 0;
-}
+#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
+#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
+#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
+#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)
 
-static int bfqg_print_rwstat(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
-			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
-	return 0;
-}
+/* Min number of samples required to perform peak-rate update */
+#define BFQ_RATE_MIN_SAMPLES	32
+/* Min observation time interval required to perform a peak-rate update (ns) */
+#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
+/* Target observation time interval for a peak-rate update (ns) */
+#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC
 
-static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
-				      struct blkg_policy_data *pd, int off)
-{
-	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
-					  &blkcg_policy_bfq, off);
-	return __blkg_prfill_u64(sf, pd, sum);
-}
+/* Shift used for peak rate fixed precision calculations. */
+#define BFQ_RATE_SHIFT		16
 
-static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
-					struct blkg_policy_data *pd, int off)
-{
-	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
-							   &blkcg_policy_bfq,
-							   off);
-	return __blkg_prfill_rwstat(sf, pd, &sum);
-}
+/*
+ * By default, BFQ computes the duration of the weight raising for
+ * interactive applications automatically, using the following formula:
+ * duration = (R / r) * T, where r is the peak rate of the device, and
+ * R and T are two reference parameters.
+ * In particular, R is the peak rate of the reference device (see below),
+ * and T is a reference time: given the systems that are likely to be
+ * installed on the reference device according to its speed class, T is
+ * about the maximum time needed, under BFQ and while reading two files in
+ * parallel, to load typical large applications on these systems.
+ * In practice, the slower/faster the device at hand is, the more/less it
+ * takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
+ * applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+ * . whether the device is slow, such as old or portable HDDs, as well as
+ *   SD cards, or fast, such as newer HDDs and SSDs.
+ *
+ * The device's speed class is dynamically (re)detected in
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ *
+ * In the following definitions, R_slow[0]/R_fast[0] and
+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast
+ * rotational device, whereas R_slow[1]/R_fast[1] and
+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast
+ * non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes. The reference
+ * rates are not the actual peak rates of the devices used as a
+ * reference, but slightly lower values. The reason for using these
+ * slightly lower values is that the peak-rate estimator tends to
+ * yield slightly lower values than the actual peak rate (it can yield
+ * the actual peak rate only if there is only one process doing I/O,
+ * and the process does sequential I/O).
+ *
+ * Both the reference peak rates and the thresholds are measured in
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ */
+static int R_slow[2] = {1000, 10700};
+static int R_fast[2] = {14000, 33000};
+/*
+ * To improve readability, a conversion function is used to initialize the
+ * following arrays, which entails that they can be initialized only in a
+ * function.
+ */
+static int T_slow[2];
+static int T_fast[2];
+static int device_speed_thresh[2];
 
-static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
-			  seq_cft(sf)->private, false);
-	return 0;
-}
+#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
+#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
 
-static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
 {
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
-			  seq_cft(sf)->private, true);
-	return 0;
+	return bic->bfqq[is_sync];
 }
 
-static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
-			       int off)
+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
 {
-	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
-
-	return __blkg_prfill_u64(sf, pd, sum >> 9);
+	bic->bfqq[is_sync] = bfqq;
 }
 
-static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
+struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
 {
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
-	return 0;
+	return bic->icq.q->elevator->elevator_data;
 }
 
-static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
-					 struct blkg_policy_data *pd, int off)
+/**
+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
+ * @icq: the iocontext queue.
+ */
+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
 {
-	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
-					offsetof(struct blkcg_gq, stat_bytes));
-	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
-		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
-
-	return __blkg_prfill_u64(sf, pd, sum >> 9);
+	/* bic->icq is the first member, %NULL will convert to %NULL */
+	return container_of(icq, struct bfq_io_cq, icq);
 }
 
-static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+/**
+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
+ * @bfqd: the lookup key.
+ * @ioc: the io_context of the process doing I/O.
+ * @q: the request queue.
+ */
+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
+					struct io_context *ioc,
+					struct request_queue *q)
 {
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
-			  false);
-	return 0;
-}
+	if (ioc) {
+		unsigned long flags;
+		struct bfq_io_cq *icq;
 
-static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
-				      struct blkg_policy_data *pd, int off)
-{
-	struct bfq_group *bfqg = pd_to_bfqg(pd);
-	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
-	u64 v = 0;
+		spin_lock_irqsave(q->queue_lock, flags);
+		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
+		spin_unlock_irqrestore(q->queue_lock, flags);
 
-	if (samples) {
-		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
-		v = div64_u64(v, samples);
+		return icq;
 	}
-	__blkg_prfill_u64(sf, pd, v);
-	return 0;
-}
-
-/* print avg_queue_size */
-static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
-			  0, false);
-	return 0;
-}
-
-static struct bfq_group *
-bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
-{
-	int ret;
-
-	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
-	if (ret)
-		return NULL;
 
-	return blkg_to_bfqg(bfqd->queue->root_blkg);
+	return NULL;
 }
 
-static struct cftype bfq_blkcg_legacy_files[] = {
-	{
-		.name = "bfq.weight",
-		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = bfq_io_show_weight,
-		.write_u64 = bfq_io_set_weight_legacy,
-	},
-
-	/* statistics, covers only the tasks in the bfqg */
-	{
-		.name = "bfq.time",
-		.private = offsetof(struct bfq_group, stats.time),
-		.seq_show = bfqg_print_stat,
-	},
-	{
-		.name = "bfq.sectors",
-		.seq_show = bfqg_print_stat_sectors,
-	},
-	{
-		.name = "bfq.io_service_bytes",
-		.private = (unsigned long)&blkcg_policy_bfq,
-		.seq_show = blkg_print_stat_bytes,
-	},
-	{
-		.name = "bfq.io_serviced",
-		.private = (unsigned long)&blkcg_policy_bfq,
-		.seq_show = blkg_print_stat_ios,
-	},
-	{
-		.name = "bfq.io_service_time",
-		.private = offsetof(struct bfq_group, stats.service_time),
-		.seq_show = bfqg_print_rwstat,
-	},
-	{
-		.name = "bfq.io_wait_time",
-		.private = offsetof(struct bfq_group, stats.wait_time),
-		.seq_show = bfqg_print_rwstat,
-	},
-	{
-		.name = "bfq.io_merged",
-		.private = offsetof(struct bfq_group, stats.merged),
-		.seq_show = bfqg_print_rwstat,
-	},
-	{
-		.name = "bfq.io_queued",
-		.private = offsetof(struct bfq_group, stats.queued),
-		.seq_show = bfqg_print_rwstat,
-	},
-
-	/* the same statictics which cover the bfqg and its descendants */
-	{
-		.name = "bfq.time_recursive",
-		.private = offsetof(struct bfq_group, stats.time),
-		.seq_show = bfqg_print_stat_recursive,
-	},
-	{
-		.name = "bfq.sectors_recursive",
-		.seq_show = bfqg_print_stat_sectors_recursive,
-	},
-	{
-		.name = "bfq.io_service_bytes_recursive",
-		.private = (unsigned long)&blkcg_policy_bfq,
-		.seq_show = blkg_print_stat_bytes_recursive,
-	},
-	{
-		.name = "bfq.io_serviced_recursive",
-		.private = (unsigned long)&blkcg_policy_bfq,
-		.seq_show = blkg_print_stat_ios_recursive,
-	},
-	{
-		.name = "bfq.io_service_time_recursive",
-		.private = offsetof(struct bfq_group, stats.service_time),
-		.seq_show = bfqg_print_rwstat_recursive,
-	},
-	{
-		.name = "bfq.io_wait_time_recursive",
-		.private = offsetof(struct bfq_group, stats.wait_time),
-		.seq_show = bfqg_print_rwstat_recursive,
-	},
-	{
-		.name = "bfq.io_merged_recursive",
-		.private = offsetof(struct bfq_group, stats.merged),
-		.seq_show = bfqg_print_rwstat_recursive,
-	},
-	{
-		.name = "bfq.io_queued_recursive",
-		.private = offsetof(struct bfq_group, stats.queued),
-		.seq_show = bfqg_print_rwstat_recursive,
-	},
-	{
-		.name = "bfq.avg_queue_size",
-		.seq_show = bfqg_print_avg_queue_size,
-	},
-	{
-		.name = "bfq.group_wait_time",
-		.private = offsetof(struct bfq_group, stats.group_wait_time),
-		.seq_show = bfqg_print_stat,
-	},
-	{
-		.name = "bfq.idle_time",
-		.private = offsetof(struct bfq_group, stats.idle_time),
-		.seq_show = bfqg_print_stat,
-	},
-	{
-		.name = "bfq.empty_time",
-		.private = offsetof(struct bfq_group, stats.empty_time),
-		.seq_show = bfqg_print_stat,
-	},
-	{
-		.name = "bfq.dequeue",
-		.private = offsetof(struct bfq_group, stats.dequeue),
-		.seq_show = bfqg_print_stat,
-	},
-	{ }	/* terminate */
-};
-
-static struct cftype bfq_blkg_files[] = {
-	{
-		.name = "bfq.weight",
-		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = bfq_io_show_weight,
-		.write = bfq_io_set_weight,
-	},
-	{} /* terminate */
-};
-
-#else	/* CONFIG_BFQ_GROUP_IOSCHED */
-
-static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
-			struct bfq_queue *bfqq, unsigned int op) { }
-static inline void
-bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
-static inline void
-bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
-static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
-			uint64_t start_time, uint64_t io_start_time,
-			unsigned int op) { }
-static inline void
-bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
-				     struct bfq_group *curr_bfqg) { }
-static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
-static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
-static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
-static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
-static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
-static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
-
-static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			  struct bfq_group *bfqg) {}
-
-static void bfq_init_entity(struct bfq_entity *entity,
-			    struct bfq_group *bfqg)
+/*
+ * Scheduler run of queue, if there are requests pending and no one in the
+ * driver that will restart queueing.
+ */
+void bfq_schedule_dispatch(struct bfq_data *bfqd)
 {
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-
-	entity->weight = entity->new_weight;
-	entity->orig_weight = entity->new_weight;
-	if (bfqq) {
-		bfqq->ioprio = bfqq->new_ioprio;
-		bfqq->ioprio_class = bfqq->new_ioprio_class;
+	if (bfqd->queued != 0) {
+		bfq_log(bfqd, "schedule dispatch");
+		blk_mq_run_hw_queues(bfqd->queue, true);
 	}
-	entity->sched_data = &bfqg->sched_data;
-}
-
-static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
-
-static void bfq_end_wr_async(struct bfq_data *bfqd)
-{
-	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
-}
-
-static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
-					    struct blkcg *blkcg)
-{
-	return bfqd->root_group;
-}
-
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
-{
-	return bfqq->bfqd->root_group;
-}
-
-static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd,
-						    int node)
-{
-	struct bfq_group *bfqg;
-	int i;
-
-	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
-	if (!bfqg)
-		return NULL;
-
-	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
-		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
-
-	return bfqg;
 }
-#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
 
 #define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
 #define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
@@ -4002,7 +438,7 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
 	return bfqq;
 }
 
-static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
 	struct rb_node **p, *parent;
 	struct bfq_queue *__bfqq;
@@ -4091,9 +527,8 @@ static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
  * In most scenarios, the rate at which nodes are created/destroyed
  * should be low too.
  */
-static void bfq_weights_tree_add(struct bfq_data *bfqd,
-				 struct bfq_entity *entity,
-				 struct rb_root *root)
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+			  struct rb_root *root)
 {
 	struct rb_node **new = &(root->rb_node), *parent = NULL;
 
@@ -4145,9 +580,8 @@ static void bfq_weights_tree_add(struct bfq_data *bfqd,
  * See the comments to the function bfq_weights_tree_add() for considerations
  * about overhead.
  */
-static void bfq_weights_tree_remove(struct bfq_data *bfqd,
-				    struct bfq_entity *entity,
-				    struct rb_root *root)
+void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
+			     struct rb_root *root)
 {
 	if (!entity->weight_counter)
 		return;
@@ -4564,11 +998,6 @@ static int bfq_min_budget(struct bfq_data *bfqd)
 		return bfqd->bfq_max_budget / 32;
 }
 
-static void bfq_bfqq_expire(struct bfq_data *bfqd,
-			    struct bfq_queue *bfqq,
-			    bool compensate,
-			    enum bfqq_expiration reason);
-
 /*
  * The next function, invoked after the input queue bfqq switches from
  * idle to busy, updates the budget of bfqq. The function also tells
@@ -5259,8 +1688,8 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
 	bfqq->entity.prio_changed = 1;
 }
 
-static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
-				    struct bfq_group *bfqg)
+void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+			     struct bfq_group *bfqg)
 {
 	int i, j;
 
@@ -6479,10 +2908,10 @@ static unsigned long bfq_smallest_from_now(void)
  * former on a timeslice basis, without violating service domain
  * guarantees among the latter.
  */
-static void bfq_bfqq_expire(struct bfq_data *bfqd,
-			    struct bfq_queue *bfqq,
-			    bool compensate,
-			    enum bfqq_expiration reason)
+void bfq_bfqq_expire(struct bfq_data *bfqd,
+		     struct bfq_queue *bfqq,
+		     bool compensate,
+		     enum bfqq_expiration reason)
 {
 	bool slow;
 	unsigned long delta = 0;
@@ -7188,7 +3617,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
  * Scheduler lock must be held here. Recall not to use bfqq after calling
  * this function on it.
  */
-static void bfq_put_queue(struct bfq_queue *bfqq)
+void bfq_put_queue(struct bfq_queue *bfqq)
 {
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	struct bfq_group *bfqg = bfqq_group(bfqq);
@@ -7329,6 +3758,10 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 	bfqq->entity.prio_changed = 1;
 }
 
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+				       struct bio *bio, bool is_sync,
+				       struct bfq_io_cq *bic);
+
 static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
 {
 	struct bfq_data *bfqd = bic_to_bfqd(bic);
@@ -8104,7 +4537,7 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
  * we reparent them to the root cgroup (i.e., the only one that will
  * exist for sure until all the requests on a device are gone).
  */
-static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
+void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
 {
 	int i, j;
 
@@ -8520,24 +4953,6 @@ static struct elevator_type iosched_bfq_mq = {
 	.elevator_owner =	THIS_MODULE,
 };
 
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static struct blkcg_policy blkcg_policy_bfq = {
-	.dfl_cftypes		= bfq_blkg_files,
-	.legacy_cftypes		= bfq_blkcg_legacy_files,
-
-	.cpd_alloc_fn		= bfq_cpd_alloc,
-	.cpd_init_fn		= bfq_cpd_init,
-	.cpd_bind_fn	        = bfq_cpd_init,
-	.cpd_free_fn		= bfq_cpd_free,
-
-	.pd_alloc_fn		= bfq_pd_alloc,
-	.pd_init_fn		= bfq_pd_init,
-	.pd_offline_fn		= bfq_pd_offline,
-	.pd_free_fn		= bfq_pd_free,
-	.pd_reset_stats_fn	= bfq_pd_reset_stats,
-};
-#endif
-
 static int __init bfq_init(void)
 {
 	int ret;
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
new file mode 100644
index 0000000..4ce7915
--- /dev/null
+++ b/block/bfq-iosched.h
@@ -0,0 +1,942 @@
+/*
+ * Header file for the BFQ I/O scheduler: data structures and
+ * prototypes of interface functions among BFQ components.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation; either version 2 of the
+ *  License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ */
+#ifndef _BFQ_H
+#define _BFQ_H
+
+#include <linux/blktrace_api.h>
+#include <linux/hrtimer.h>
+#include <linux/blk-cgroup.h>
+
+#define BFQ_IOPRIO_CLASSES	3
+#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)
+
+#define BFQ_MIN_WEIGHT			1
+#define BFQ_MAX_WEIGHT			1000
+#define BFQ_WEIGHT_CONVERSION_COEFF	10
+
+#define BFQ_DEFAULT_QUEUE_IOPRIO	4
+
+#define BFQ_WEIGHT_LEGACY_DFL	100
+#define BFQ_DEFAULT_GRP_IOPRIO	0
+#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE
+
+/*
+ * Soft real-time applications are extremely more latency sensitive
+ * than interactive ones. Over-raise the weight of the former to
+ * privilege them against the latter.
+ */
+#define BFQ_SOFTRT_WEIGHT_FACTOR	100
+
+struct bfq_entity;
+
+/**
+ * struct bfq_service_tree - per ioprio_class service tree.
+ *
+ * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
+ * ioprio_class has its own independent scheduler, and so its own
+ * bfq_service_tree.  All the fields are protected by the queue lock
+ * of the containing bfqd.
+ */
+struct bfq_service_tree {
+	/* tree for active entities (i.e., those backlogged) */
+	struct rb_root active;
+	/* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
+	struct rb_root idle;
+
+	/* idle entity with minimum F_i */
+	struct bfq_entity *first_idle;
+	/* idle entity with maximum F_i */
+	struct bfq_entity *last_idle;
+
+	/* scheduler virtual time */
+	u64 vtime;
+	/* scheduler weight sum; active and idle entities contribute to it */
+	unsigned long wsum;
+};
+
+/**
+ * struct bfq_sched_data - multi-class scheduler.
+ *
+ * bfq_sched_data is the basic scheduler queue.  It supports three
+ * ioprio_classes, and can be used either as a toplevel queue or as an
+ * intermediate queue on a hierarchical setup.  @next_in_service
+ * points to the active entity of the sched_data service trees that
+ * will be scheduled next. It is used to reduce the number of steps
+ * needed for each hierarchical-schedule update.
+ *
+ * The supported ioprio_classes are the same as in CFQ, in descending
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
+ * Requests from higher priority queues are served before all the
+ * requests from lower priority queues; among requests of the same
+ * queue requests are served according to B-WF2Q+.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_sched_data {
+	/* entity in service */
+	struct bfq_entity *in_service_entity;
+	/* head-of-line entity (see comments above) */
+	struct bfq_entity *next_in_service;
+	/* array of service trees, one per ioprio_class */
+	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
+	/* last time CLASS_IDLE was served */
+	unsigned long bfq_class_idle_last_service;
+
+};
+
+/**
+ * struct bfq_weight_counter - counter of the number of all active entities
+ *                             with a given weight.
+ */
+struct bfq_weight_counter {
+	unsigned int weight; /* weight of the entities this counter refers to */
+	unsigned int num_active; /* nr of active entities with this weight */
+	/*
+	 * Weights tree member (see bfq_data's @queue_weights_tree and
+	 * @group_weights_tree)
+	 */
+	struct rb_node weights_node;
+};
+
+/**
+ * struct bfq_entity - schedulable entity.
+ *
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
+ * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
+ * entity belongs to the sched_data of the parent group in the cgroup
+ * hierarchy.  Non-leaf entities have also their own sched_data, stored
+ * in @my_sched_data.
+ *
+ * Each entity stores independently its priority values; this would
+ * allow different weights on different devices, but this
+ * functionality is not exported to userspace by now.  Priorities and
+ * weights are updated lazily, first storing the new values into the
+ * new_* fields, then setting the @prio_changed flag.  As soon as
+ * there is a transition in the entity state that allows the priority
+ * update to take place the effective and the requested priority
+ * values are synchronized.
+ *
+ * Unless cgroups are used, the weight value is calculated from the
+ * ioprio to export the same interface as CFQ.  When dealing with
+ * ``well-behaved'' queues (i.e., queues that do not spend too much
+ * time to consume their budget and have true sequential behavior, and
+ * when there are no external factors breaking anticipation) the
+ * relative weights at each level of the cgroups hierarchy should be
+ * guaranteed.  All the fields are protected by the queue lock of the
+ * containing bfqd.
+ */
+struct bfq_entity {
+	/* service_tree member */
+	struct rb_node rb_node;
+	/* pointer to the weight counter associated with this entity */
+	struct bfq_weight_counter *weight_counter;
+
+	/*
+	 * Flag, true if the entity is on a tree (either the active or
+	 * the idle one of its service_tree) or is in service.
+	 */
+	bool on_st;
+
+	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
+	u64 start, finish;
+
+	/* tree the entity is enqueued into; %NULL if not on a tree */
+	struct rb_root *tree;
+
+	/*
+	 * minimum start time of the (active) subtree rooted at this
+	 * entity; used for O(log N) lookups into active trees
+	 */
+	u64 min_start;
+
+	/* amount of service received during the last service slot */
+	int service;
+
+	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
+	int budget;
+
+	/* weight of the queue */
+	int weight;
+	/* next weight if a change is in progress */
+	int new_weight;
+
+	/* original weight, used to implement weight boosting */
+	int orig_weight;
+
+	/* parent entity, for hierarchical scheduling */
+	struct bfq_entity *parent;
+
+	/*
+	 * For non-leaf nodes in the hierarchy, the associated
+	 * scheduler queue, %NULL on leaf nodes.
+	 */
+	struct bfq_sched_data *my_sched_data;
+	/* the scheduler queue this entity belongs to */
+	struct bfq_sched_data *sched_data;
+
+	/* flag, set to request a weight, ioprio or ioprio_class change  */
+	int prio_changed;
+};
+
+struct bfq_group;
+
+/**
+ * struct bfq_ttime - per process thinktime stats.
+ */
+struct bfq_ttime {
+	/* completion time of the last request */
+	u64 last_end_request;
+
+	/* total process thinktime */
+	u64 ttime_total;
+	/* number of thinktime samples */
+	unsigned long ttime_samples;
+	/* average process thinktime */
+	u64 ttime_mean;
+};
+
+/**
+ * struct bfq_queue - leaf schedulable entity.
+ *
+ * A bfq_queue is a leaf request queue; it can be associated with an
+ * io_context or more, if it  is  async or shared  between  cooperating
+ * processes. @cgroup holds a reference to the cgroup, to be sure that it
+ * does not disappear while a bfqq still references it (mostly to avoid
+ * races between request issuing and task migration followed by cgroup
+ * destruction).
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_queue {
+	/* reference counter */
+	int ref;
+	/* parent bfq_data */
+	struct bfq_data *bfqd;
+
+	/* current ioprio and ioprio class */
+	unsigned short ioprio, ioprio_class;
+	/* next ioprio and ioprio class if a change is in progress */
+	unsigned short new_ioprio, new_ioprio_class;
+
+	/*
+	 * Shared bfq_queue if queue is cooperating with one or more
+	 * other queues.
+	 */
+	struct bfq_queue *new_bfqq;
+	/* request-position tree member (see bfq_group's @rq_pos_tree) */
+	struct rb_node pos_node;
+	/* request-position tree root (see bfq_group's @rq_pos_tree) */
+	struct rb_root *pos_root;
+
+	/* sorted list of pending requests */
+	struct rb_root sort_list;
+	/* if fifo isn't expired, next request to serve */
+	struct request *next_rq;
+	/* number of sync and async requests queued */
+	int queued[2];
+	/* number of requests currently allocated */
+	int allocated;
+	/* number of pending metadata requests */
+	int meta_pending;
+	/* fifo list of requests in sort_list */
+	struct list_head fifo;
+
+	/* entity representing this queue in the scheduler */
+	struct bfq_entity entity;
+
+	/* maximum budget allowed from the feedback mechanism */
+	int max_budget;
+	/* budget expiration (in jiffies) */
+	unsigned long budget_timeout;
+
+	/* number of requests on the dispatch list or inside driver */
+	int dispatched;
+
+	/* status flags */
+	unsigned long flags;
+
+	/* node for active/idle bfqq list inside parent bfqd */
+	struct list_head bfqq_list;
+
+	/* associated @bfq_ttime struct */
+	struct bfq_ttime ttime;
+
+	/* bit vector: a 1 for each seeky requests in history */
+	u32 seek_history;
+
+	/* node for the device's burst list */
+	struct hlist_node burst_list_node;
+
+	/* position of the last request enqueued */
+	sector_t last_request_pos;
+
+	/* Number of consecutive pairs of request completion and
+	 * arrival, such that the queue becomes idle after the
+	 * completion, but the next request arrives within an idle
+	 * time slice; used only if the queue's IO_bound flag has been
+	 * cleared.
+	 */
+	unsigned int requests_within_timer;
+
+	/* pid of the process owning the queue, used for logging purposes */
+	pid_t pid;
+
+	/*
+	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
+	 * if the queue is shared.
+	 */
+	struct bfq_io_cq *bic;
+
+	/* current maximum weight-raising time for this queue */
+	unsigned long wr_cur_max_time;
+	/*
+	 * Minimum time instant such that, only if a new request is
+	 * enqueued after this time instant in an idle @bfq_queue with
+	 * no outstanding requests, then the task associated with the
+	 * queue it is deemed as soft real-time (see the comments on
+	 * the function bfq_bfqq_softrt_next_start())
+	 */
+	unsigned long soft_rt_next_start;
+	/*
+	 * Start time of the current weight-raising period if
+	 * the @bfq-queue is being weight-raised, otherwise
+	 * finish time of the last weight-raising period.
+	 */
+	unsigned long last_wr_start_finish;
+	/* factor by which the weight of this queue is multiplied */
+	unsigned int wr_coeff;
+	/*
+	 * Time of the last transition of the @bfq_queue from idle to
+	 * backlogged.
+	 */
+	unsigned long last_idle_bklogged;
+	/*
+	 * Cumulative service received from the @bfq_queue since the
+	 * last transition from idle to backlogged.
+	 */
+	unsigned long service_from_backlogged;
+
+	/*
+	 * Value of wr start time when switching to soft rt
+	 */
+	unsigned long wr_start_at_switch_to_srt;
+
+	unsigned long split_time; /* time of last split */
+};
+
+/**
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
+ */
+struct bfq_io_cq {
+	/* associated io_cq structure */
+	struct io_cq icq; /* must be the first member */
+	/* array of two process queues, the sync and the async */
+	struct bfq_queue *bfqq[2];
+	/* per (request_queue, blkcg) ioprio */
+	int ioprio;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	uint64_t blkcg_serial_nr; /* the current blkcg serial */
+#endif
+	/*
+	 * Snapshot of the idle window before merging; taken to
+	 * remember this value while the queue is merged, so as to be
+	 * able to restore it in case of split.
+	 */
+	bool saved_idle_window;
+	/*
+	 * Same purpose as the previous two fields for the I/O bound
+	 * classification of a queue.
+	 */
+	bool saved_IO_bound;
+
+	/*
+	 * Same purpose as the previous fields for the value of the
+	 * field keeping the queue's belonging to a large burst
+	 */
+	bool saved_in_large_burst;
+	/*
+	 * True if the queue belonged to a burst list before its merge
+	 * with another cooperating queue.
+	 */
+	bool was_in_burst_list;
+
+	/*
+	 * Similar to previous fields: save wr information.
+	 */
+	unsigned long saved_wr_coeff;
+	unsigned long saved_last_wr_start_finish;
+	unsigned long saved_wr_start_at_switch_to_srt;
+	unsigned int saved_wr_cur_max_time;
+	struct bfq_ttime saved_ttime;
+};
+
+enum bfq_device_speed {
+	BFQ_BFQD_FAST,
+	BFQ_BFQD_SLOW,
+};
+
+/**
+ * struct bfq_data - per-device data structure.
+ *
+ * All the fields are protected by @lock.
+ */
+struct bfq_data {
+	/* device request queue */
+	struct request_queue *queue;
+	/* dispatch queue */
+	struct list_head dispatch;
+
+	/* root bfq_group for the device */
+	struct bfq_group *root_group;
+
+	/*
+	 * rbtree of weight counters of @bfq_queues, sorted by
+	 * weight. Used to keep track of whether all @bfq_queues have
+	 * the same weight. The tree contains one counter for each
+	 * distinct weight associated to some active and not
+	 * weight-raised @bfq_queue (see the comments to the functions
+	 * bfq_weights_tree_[add|remove] for further details).
+	 */
+	struct rb_root queue_weights_tree;
+	/*
+	 * rbtree of non-queue @bfq_entity weight counters, sorted by
+	 * weight. Used to keep track of whether all @bfq_groups have
+	 * the same weight. The tree contains one counter for each
+	 * distinct weight associated to some active @bfq_group (see
+	 * the comments to the functions bfq_weights_tree_[add|remove]
+	 * for further details).
+	 */
+	struct rb_root group_weights_tree;
+
+	/*
+	 * Number of bfq_queues containing requests (including the
+	 * queue in service, even if it is idling).
+	 */
+	int busy_queues;
+	/* number of weight-raised busy @bfq_queues */
+	int wr_busy_queues;
+	/* number of queued requests */
+	int queued;
+	/* number of requests dispatched and waiting for completion */
+	int rq_in_driver;
+
+	/*
+	 * Maximum number of requests in driver in the last
+	 * @hw_tag_samples completed requests.
+	 */
+	int max_rq_in_driver;
+	/* number of samples used to calculate hw_tag */
+	int hw_tag_samples;
+	/* flag set to one if the driver is showing a queueing behavior */
+	int hw_tag;
+
+	/* number of budgets assigned */
+	int budgets_assigned;
+
+	/*
+	 * Timer set when idling (waiting) for the next request from
+	 * the queue in service.
+	 */
+	struct hrtimer idle_slice_timer;
+
+	/* bfq_queue in service */
+	struct bfq_queue *in_service_queue;
+
+	/* on-disk position of the last served request */
+	sector_t last_position;
+
+	/* time of last request completion (ns) */
+	u64 last_completion;
+
+	/* time of first rq dispatch in current observation interval (ns) */
+	u64 first_dispatch;
+	/* time of last rq dispatch in current observation interval (ns) */
+	u64 last_dispatch;
+
+	/* beginning of the last budget */
+	ktime_t last_budget_start;
+	/* beginning of the last idle slice */
+	ktime_t last_idling_start;
+
+	/* number of samples in current observation interval */
+	int peak_rate_samples;
+	/* num of samples of seq dispatches in current observation interval */
+	u32 sequential_samples;
+	/* total num of sectors transferred in current observation interval */
+	u64 tot_sectors_dispatched;
+	/* max rq size seen during current observation interval (sectors) */
+	u32 last_rq_max_size;
+	/* time elapsed from first dispatch in current observ. interval (us) */
+	u64 delta_from_first;
+	/*
+	 * Current estimate of the device peak rate, measured in
+	 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
+	 * BFQ_RATE_SHIFT is performed to increase precision in
+	 * fixed-point calculations.
+	 */
+	u32 peak_rate;
+
+	/* maximum budget allotted to a bfq_queue before rescheduling */
+	int bfq_max_budget;
+
+	/* list of all the bfq_queues active on the device */
+	struct list_head active_list;
+	/* list of all the bfq_queues idle on the device */
+	struct list_head idle_list;
+
+	/*
+	 * Timeout for async/sync requests; when it fires, requests
+	 * are served in fifo order.
+	 */
+	u64 bfq_fifo_expire[2];
+	/* weight of backward seeks wrt forward ones */
+	unsigned int bfq_back_penalty;
+	/* maximum allowed backward seek */
+	unsigned int bfq_back_max;
+	/* maximum idling time */
+	u32 bfq_slice_idle;
+
+	/* user-configured max budget value (0 for auto-tuning) */
+	int bfq_user_max_budget;
+	/*
+	 * Timeout for bfq_queues to consume their budget; used to
+	 * prevent seeky queues from imposing long latencies to
+	 * sequential or quasi-sequential ones (this also implies that
+	 * seeky queues cannot receive guarantees in the service
+	 * domain; after a timeout they are charged for the time they
+	 * have been in service, to preserve fairness among them, but
+	 * without service-domain guarantees).
+	 */
+	unsigned int bfq_timeout;
+
+	/*
+	 * Number of consecutive requests that must be issued within
+	 * the idle time slice to set again idling to a queue which
+	 * was marked as non-I/O-bound (see the definition of the
+	 * IO_bound flag for further details).
+	 */
+	unsigned int bfq_requests_within_timer;
+
+	/*
+	 * Force device idling whenever needed to provide accurate
+	 * service guarantees, without caring about throughput
+	 * issues. CAVEAT: this may even increase latencies, in case
+	 * of useless idling for processes that did stop doing I/O.
+	 */
+	bool strict_guarantees;
+
+	/*
+	 * Last time at which a queue entered the current burst of
+	 * queues being activated shortly after each other; for more
+	 * details about this and the following parameters related to
+	 * a burst of activations, see the comments on the function
+	 * bfq_handle_burst.
+	 */
+	unsigned long last_ins_in_burst;
+	/*
+	 * Reference time interval used to decide whether a queue has
+	 * been activated shortly after @last_ins_in_burst.
+	 */
+	unsigned long bfq_burst_interval;
+	/* number of queues in the current burst of queue activations */
+	int burst_size;
+
+	/* common parent entity for the queues in the burst */
+	struct bfq_entity *burst_parent_entity;
+	/* Maximum burst size above which the current queue-activation
+	 * burst is deemed as 'large'.
+	 */
+	unsigned long bfq_large_burst_thresh;
+	/* true if a large queue-activation burst is in progress */
+	bool large_burst;
+	/*
+	 * Head of the burst list (as for the above fields, more
+	 * details in the comments on the function bfq_handle_burst).
+	 */
+	struct hlist_head burst_list;
+
+	/* if set to true, low-latency heuristics are enabled */
+	bool low_latency;
+	/*
+	 * Maximum factor by which the weight of a weight-raised queue
+	 * is multiplied.
+	 */
+	unsigned int bfq_wr_coeff;
+	/* maximum duration of a weight-raising period (jiffies) */
+	unsigned int bfq_wr_max_time;
+
+	/* Maximum weight-raising duration for soft real-time processes */
+	unsigned int bfq_wr_rt_max_time;
+	/*
+	 * Minimum idle period after which weight-raising may be
+	 * reactivated for a queue (in jiffies).
+	 */
+	unsigned int bfq_wr_min_idle_time;
+	/*
+	 * Minimum period between request arrivals after which
+	 * weight-raising may be reactivated for an already busy async
+	 * queue (in jiffies).
+	 */
+	unsigned long bfq_wr_min_inter_arr_async;
+
+	/* Max service-rate for a soft real-time queue, in sectors/sec */
+	unsigned int bfq_wr_max_softrt_rate;
+	/*
+	 * Cached value of the product R*T, used for computing the
+	 * maximum duration of weight raising automatically.
+	 */
+	u64 RT_prod;
+	/* device-speed class for the low-latency heuristic */
+	enum bfq_device_speed device_speed;
+
+	/* fallback dummy bfqq for extreme OOM conditions */
+	struct bfq_queue oom_bfqq;
+
+	spinlock_t lock;
+
+	/*
+	 * bic associated with the task issuing current bio for
+	 * merging. This and the next field are used as a support to
+	 * be able to perform the bic lookup, needed by bio-merge
+	 * functions, before the scheduler lock is taken, and thus
+	 * avoid taking the request-queue lock while the scheduler
+	 * lock is being held.
+	 */
+	struct bfq_io_cq *bio_bic;
+	/* bfqq associated with the task issuing current bio for merging */
+	struct bfq_queue *bio_bfqq;
+};
+
+enum bfqq_state_flags {
+	BFQQF_just_created = 0,	/* queue just allocated */
+	BFQQF_busy,		/* has requests or is in service */
+	BFQQF_wait_request,	/* waiting for a request */
+	BFQQF_non_blocking_wait_rq, /*
+				     * waiting for a request
+				     * without idling the device
+				     */
+	BFQQF_fifo_expire,	/* FIFO checked in this slice */
+	BFQQF_idle_window,	/* slice idling enabled */
+	BFQQF_sync,		/* synchronous queue */
+	BFQQF_IO_bound,		/*
+				 * bfqq has timed-out at least once
+				 * having consumed at most 2/10 of
+				 * its budget
+				 */
+	BFQQF_in_large_burst,	/*
+				 * bfqq activated in a large burst,
+				 * see comments to bfq_handle_burst.
+				 */
+	BFQQF_softrt_update,	/*
+				 * may need softrt-next-start
+				 * update
+				 */
+	BFQQF_coop,		/* bfqq is shared */
+	BFQQF_split_coop	/* shared bfqq will be split */
+};
+
+#define BFQ_BFQQ_FNS(name)						\
+void bfq_mark_bfqq_##name(struct bfq_queue *bfqq);			\
+void bfq_clear_bfqq_##name(struct bfq_queue *bfqq);			\
+int bfq_bfqq_##name(const struct bfq_queue *bfqq);
+
+BFQ_BFQQ_FNS(just_created);
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(softrt_update);
+#undef BFQ_BFQQ_FNS
+
+/* Expiration reasons. */
+enum bfqq_expiration {
+	BFQQE_TOO_IDLE = 0,		/*
+					 * queue has been idling for
+					 * too long
+					 */
+	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
+	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
+	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
+	BFQQE_PREEMPTED		/* preemption in progress */
+};
+
+struct bfqg_stats {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	/* number of ios merged */
+	struct blkg_rwstat		merged;
+	/* total time spent on device in ns, may not be accurate w/ queueing */
+	struct blkg_rwstat		service_time;
+	/* total time spent waiting in scheduler queue in ns */
+	struct blkg_rwstat		wait_time;
+	/* number of IOs queued up */
+	struct blkg_rwstat		queued;
+	/* total disk time and nr sectors dispatched by this group */
+	struct blkg_stat		time;
+	/* sum of number of ios queued across all samples */
+	struct blkg_stat		avg_queue_size_sum;
+	/* count of samples taken for average */
+	struct blkg_stat		avg_queue_size_samples;
+	/* how many times this group has been removed from service tree */
+	struct blkg_stat		dequeue;
+	/* total time spent waiting for it to be assigned a timeslice. */
+	struct blkg_stat		group_wait_time;
+	/* time spent idling for this blkcg_gq */
+	struct blkg_stat		idle_time;
+	/* total time with empty current active q with other requests queued */
+	struct blkg_stat		empty_time;
+	/* fields after this shouldn't be cleared on stat reset */
+	uint64_t			start_group_wait_time;
+	uint64_t			start_idle_time;
+	uint64_t			start_empty_time;
+	uint16_t			flags;
+#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
+};
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/*
+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
+ *
+ * @ps: @blkcg_policy_storage that this structure inherits
+ * @weight: weight of the bfq_group
+ */
+struct bfq_group_data {
+	/* must be the first member */
+	struct blkcg_policy_data pd;
+
+	unsigned int weight;
+};
+
+/**
+ * struct bfq_group - per (device, cgroup) data structure.
+ * @entity: schedulable entity to insert into the parent group sched_data.
+ * @sched_data: own sched_data, to contain child entities (they may be
+ *              both bfq_queues and bfq_groups).
+ * @bfqd: the bfq_data for the device this group acts upon.
+ * @async_bfqq: array of async queues for all the tasks belonging to
+ *              the group, one queue per ioprio value per ioprio_class,
+ *              except for the idle class that has only one queue.
+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
+ *             to avoid too many special cases during group creation/
+ *             migration.
+ * @stats: stats for this bfqg.
+ * @active_entities: number of active entities belonging to the group;
+ *                   unused for the root group. Used to know whether there
+ *                   are groups with more than one active @bfq_entity
+ *                   (see the comments to the function
+ *                   bfq_bfqq_may_idle()).
+ * @rq_pos_tree: rbtree sorted by next_request position, used when
+ *               determining if two or more queues have interleaving
+ *               requests (see bfq_find_close_cooperator()).
+ *
+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
+ * there is a set of bfq_groups, each one collecting the lower-level
+ * entities belonging to the group that are acting on the same device.
+ *
+ * Locking works as follows:
+ *    o @bfqd is protected by the queue lock, RCU is used to access it
+ *      from the readers.
+ *    o All the other fields are protected by the @bfqd queue lock.
+ */
+struct bfq_group {
+	/* must be the first member */
+	struct blkg_policy_data pd;
+
+	struct bfq_entity entity;
+	struct bfq_sched_data sched_data;
+
+	void *bfqd;
+
+	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+	struct bfq_queue *async_idle_bfqq;
+
+	struct bfq_entity *my_entity;
+
+	int active_entities;
+
+	struct rb_root rq_pos_tree;
+
+	struct bfqg_stats stats;
+};
+
+#else
+struct bfq_group {
+	struct bfq_sched_data sched_data;
+
+	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+	struct bfq_queue *async_idle_bfqq;
+
+	struct rb_root rq_pos_tree;
+};
+#endif
+
+struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+
+/* --------------- main algorithm interface ----------------- */
+
+#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
+				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+extern const int bfq_timeout;
+
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
+struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+			  struct rb_root *root);
+void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
+			     struct rb_root *root);
+void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		     bool compensate, enum bfqq_expiration reason);
+void bfq_put_queue(struct bfq_queue *bfqq);
+void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+void bfq_schedule_dispatch(struct bfq_data *bfqd);
+void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+
+/* ------------ end of main algorithm interface -------------- */
+
+/* ---------------- cgroups-support interface ---------------- */
+
+extern struct cftype bfq_blkcg_legacy_files[];
+extern struct cftype bfq_blkg_files[];
+
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+			      unsigned int op);
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op);
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+				  uint64_t io_start_time, unsigned int op);
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg);
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
+void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		   struct bfq_group *bfqg);
+
+void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg);
+void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio);
+void bfq_end_wr_async(struct bfq_data *bfqd);
+struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+				     struct blkcg *blkcg);
+struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
+void bfqg_put(struct bfq_group *bfqg);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+extern struct blkcg_policy blkcg_policy_bfq;
+#endif
+
+/* ------------- end of cgroups-support interface ------------- */
+
+/* - interface of the internal hierarchical B-WF2Q+ scheduler - */
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+/* both next loops stop at one of the child entities of the root group */
+#define for_each_entity(entity)	\
+	for (; entity ; entity = entity->parent)
+
+/*
+ * For each iteration, compute parent in advance, so as to be safe if
+ * entity is deallocated during the iteration. Such a deallocation may
+ * happen as a consequence of a bfq_put_queue that frees the bfq_queue
+ * containing entity.
+ */
+#define for_each_entity_safe(entity, parent) \
+	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+/*
+ * Next two macros are fake loops when cgroups support is not
+ * enabled. I fact, in such a case, there is only one level to go up
+ * (to reach the root group).
+ */
+#define for_each_entity(entity)	\
+	for (; entity ; entity = NULL)
+
+#define for_each_entity_safe(entity, parent) \
+	for (parent = NULL; entity ; entity = parent)
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
+struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
+struct bfq_entity *bfq_entity_of(struct rb_node *node);
+unsigned short bfq_ioprio_to_weight(int ioprio);
+void bfq_put_idle_entity(struct bfq_service_tree *st,
+			 struct bfq_entity *entity);
+struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+				struct bfq_entity *entity);
+void bfq_bfqq_served(struct bfq_queue *bfqq, int served);
+void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  unsigned long time_ms);
+bool __bfq_deactivate_entity(struct bfq_entity *entity,
+			     bool ins_into_idle_tree);
+bool next_queue_may_preempt(struct bfq_data *bfqd);
+struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd);
+void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
+void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			 bool ins_into_idle_tree, bool expiration);
+void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		       bool expiration);
+void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+
+/* --------------- end of interface of B-WF2Q+ ---------------- */
+
+/* Logging facilities. */
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
+	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
+			bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
+			  __pbuf, ##args);				\
+} while (0)
+
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
+	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
+} while (0)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
+	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
+			bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
+				##args)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log(bfqd, fmt, args...) \
+	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
+
+#endif /* _BFQ_H */
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
new file mode 100644
index 0000000..b4fc3e4
--- /dev/null
+++ b/block/bfq-wf2q.c
@@ -0,0 +1,1616 @@
+/*
+ * Hierarchical Budget Worst-case Fair Weighted Fair Queueing
+ * (B-WF2Q+): hierarchical scheduling algorithm by which the BFQ I/O
+ * scheduler schedules generic entities. The latter can represent
+ * either single bfq queues (associated with processes) or groups of
+ * bfq queues (associated with cgroups).
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation; either version 2 of the
+ *  License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ */
+#include "bfq-iosched.h"
+
+/**
+ * bfq_gt - compare two timestamps.
+ * @a: first ts.
+ * @b: second ts.
+ *
+ * Return @a > @b, dealing with wrapping correctly.
+ */
+static int bfq_gt(u64 a, u64 b)
+{
+	return (s64)(a - b) > 0;
+}
+
+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
+{
+	struct rb_node *node = tree->rb_node;
+
+	return rb_entry(node, struct bfq_entity, rb_node);
+}
+
+static unsigned int bfq_class_idx(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	return bfqq ? bfqq->ioprio_class - 1 :
+		BFQ_DEFAULT_GRP_CLASS - 1;
+}
+
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
+
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
+
+/**
+ * bfq_update_next_in_service - update sd->next_in_service
+ * @sd: sched_data for which to perform the update.
+ * @new_entity: if not NULL, pointer to the entity whose activation,
+ *		requeueing or repositionig triggered the invocation of
+ *		this function.
+ *
+ * This function is called to update sd->next_in_service, which, in
+ * its turn, may change as a consequence of the insertion or
+ * extraction of an entity into/from one of the active trees of
+ * sd. These insertions/extractions occur as a consequence of
+ * activations/deactivations of entities, with some activations being
+ * 'true' activations, and other activations being requeueings (i.e.,
+ * implementing the second, requeueing phase of the mechanism used to
+ * reposition an entity in its active tree; see comments on
+ * __bfq_activate_entity and __bfq_requeue_entity for details). In
+ * both the last two activation sub-cases, new_entity points to the
+ * just activated or requeued entity.
+ *
+ * Returns true if sd->next_in_service changes in such a way that
+ * entity->parent may become the next_in_service for its parent
+ * entity.
+ */
+static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
+				       struct bfq_entity *new_entity)
+{
+	struct bfq_entity *next_in_service = sd->next_in_service;
+	bool parent_sched_may_change = false;
+
+	/*
+	 * If this update is triggered by the activation, requeueing
+	 * or repositiong of an entity that does not coincide with
+	 * sd->next_in_service, then a full lookup in the active tree
+	 * can be avoided. In fact, it is enough to check whether the
+	 * just-modified entity has a higher priority than
+	 * sd->next_in_service, or, even if it has the same priority
+	 * as sd->next_in_service, is eligible and has a lower virtual
+	 * finish time than sd->next_in_service. If this compound
+	 * condition holds, then the new entity becomes the new
+	 * next_in_service. Otherwise no change is needed.
+	 */
+	if (new_entity && new_entity != sd->next_in_service) {
+		/*
+		 * Flag used to decide whether to replace
+		 * sd->next_in_service with new_entity. Tentatively
+		 * set to true, and left as true if
+		 * sd->next_in_service is NULL.
+		 */
+		bool replace_next = true;
+
+		/*
+		 * If there is already a next_in_service candidate
+		 * entity, then compare class priorities or timestamps
+		 * to decide whether to replace sd->service_tree with
+		 * new_entity.
+		 */
+		if (next_in_service) {
+			unsigned int new_entity_class_idx =
+				bfq_class_idx(new_entity);
+			struct bfq_service_tree *st =
+				sd->service_tree + new_entity_class_idx;
+
+			/*
+			 * For efficiency, evaluate the most likely
+			 * sub-condition first.
+			 */
+			replace_next =
+				(new_entity_class_idx ==
+				 bfq_class_idx(next_in_service)
+				 &&
+				 !bfq_gt(new_entity->start, st->vtime)
+				 &&
+				 bfq_gt(next_in_service->finish,
+					new_entity->finish))
+				||
+				new_entity_class_idx <
+				bfq_class_idx(next_in_service);
+		}
+
+		if (replace_next)
+			next_in_service = new_entity;
+	} else /* invoked because of a deactivation: lookup needed */
+		next_in_service = bfq_lookup_next_entity(sd);
+
+	if (next_in_service) {
+		parent_sched_may_change = !sd->next_in_service ||
+			bfq_update_parent_budget(next_in_service);
+	}
+
+	sd->next_in_service = next_in_service;
+
+	if (!next_in_service)
+		return parent_sched_may_change;
+
+	return parent_sched_may_change;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *group_entity = bfqq->entity.parent;
+
+	if (!group_entity)
+		group_entity = &bfqq->bfqd->root_group->entity;
+
+	return container_of(group_entity, struct bfq_group, entity);
+}
+
+/*
+ * Returns true if this budget changes may let next_in_service->parent
+ * become the next_in_service entity for its parent entity.
+ */
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
+{
+	struct bfq_entity *bfqg_entity;
+	struct bfq_group *bfqg;
+	struct bfq_sched_data *group_sd;
+	bool ret = false;
+
+	group_sd = next_in_service->sched_data;
+
+	bfqg = container_of(group_sd, struct bfq_group, sched_data);
+	/*
+	 * bfq_group's my_entity field is not NULL only if the group
+	 * is not the root group. We must not touch the root entity
+	 * as it must never become an in-service entity.
+	 */
+	bfqg_entity = bfqg->my_entity;
+	if (bfqg_entity) {
+		if (bfqg_entity->budget > next_in_service->budget)
+			ret = true;
+		bfqg_entity->budget = next_in_service->budget;
+	}
+
+	return ret;
+}
+
+/*
+ * This function tells whether entity stops being a candidate for next
+ * service, according to the following logic.
+ *
+ * This function is invoked for an entity that is about to be set in
+ * service. If such an entity is a queue, then the entity is no longer
+ * a candidate for next service (i.e, a candidate entity to serve
+ * after the in-service entity is expired). The function then returns
+ * true.
+ *
+ * In contrast, the entity could stil be a candidate for next service
+ * if it is not a queue, and has more than one child. In fact, even if
+ * one of its children is about to be set in service, other children
+ * may still be the next to serve. As a consequence, a non-queue
+ * entity is not a candidate for next-service only if it has only one
+ * child. And only if this condition holds, then the function returns
+ * true for a non-queue entity.
+ */
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
+{
+	struct bfq_group *bfqg;
+
+	if (bfq_entity_to_bfqq(entity))
+		return true;
+
+	bfqg = container_of(entity, struct bfq_group, entity);
+
+	if (bfqg->active_entities == 1)
+		return true;
+
+	return false;
+}
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+	return bfqq->bfqd->root_group;
+}
+
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
+{
+	return false;
+}
+
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
+{
+	return true;
+}
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+/*
+ * Shift for timestamp calculations.  This actually limits the maximum
+ * service allowed in one timestamp delta (small shift values increase it),
+ * the maximum total weight that can be used for the queues in the system
+ * (big shift values increase it), and the period of virtual time
+ * wraparounds.
+ */
+#define WFQ_SERVICE_SHIFT	22
+
+struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = NULL;
+
+	if (!entity->my_sched_data)
+		bfqq = container_of(entity, struct bfq_queue, entity);
+
+	return bfqq;
+}
+
+
+/**
+ * bfq_delta - map service into the virtual time domain.
+ * @service: amount of service.
+ * @weight: scale factor (weight of an entity or weight sum).
+ */
+static u64 bfq_delta(unsigned long service, unsigned long weight)
+{
+	u64 d = (u64)service << WFQ_SERVICE_SHIFT;
+
+	do_div(d, weight);
+	return d;
+}
+
+/**
+ * bfq_calc_finish - assign the finish time to an entity.
+ * @entity: the entity to act upon.
+ * @service: the service to be charged to the entity.
+ */
+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	entity->finish = entity->start +
+		bfq_delta(service, entity->weight);
+
+	if (bfqq) {
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			"calc_finish: serv %lu, w %d",
+			service, entity->weight);
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			"calc_finish: start %llu, finish %llu, delta %llu",
+			entity->start, entity->finish,
+			bfq_delta(service, entity->weight));
+	}
+}
+
+/**
+ * bfq_entity_of - get an entity from a node.
+ * @node: the node field of the entity.
+ *
+ * Convert a node pointer to the relative entity.  This is used only
+ * to simplify the logic of some functions and not as the generic
+ * conversion mechanism because, e.g., in the tree walking functions,
+ * the check for a %NULL value would be redundant.
+ */
+struct bfq_entity *bfq_entity_of(struct rb_node *node)
+{
+	struct bfq_entity *entity = NULL;
+
+	if (node)
+		entity = rb_entry(node, struct bfq_entity, rb_node);
+
+	return entity;
+}
+
+/**
+ * bfq_extract - remove an entity from a tree.
+ * @root: the tree root.
+ * @entity: the entity to remove.
+ */
+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
+{
+	entity->tree = NULL;
+	rb_erase(&entity->rb_node, root);
+}
+
+/**
+ * bfq_idle_extract - extract an entity from the idle tree.
+ * @st: the service tree of the owning @entity.
+ * @entity: the entity being removed.
+ */
+static void bfq_idle_extract(struct bfq_service_tree *st,
+			     struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *next;
+
+	if (entity == st->first_idle) {
+		next = rb_next(&entity->rb_node);
+		st->first_idle = bfq_entity_of(next);
+	}
+
+	if (entity == st->last_idle) {
+		next = rb_prev(&entity->rb_node);
+		st->last_idle = bfq_entity_of(next);
+	}
+
+	bfq_extract(&st->idle, entity);
+
+	if (bfqq)
+		list_del(&bfqq->bfqq_list);
+}
+
+/**
+ * bfq_insert - generic tree insertion.
+ * @root: tree root.
+ * @entity: entity to insert.
+ *
+ * This is used for the idle and the active tree, since they are both
+ * ordered by finish time.
+ */
+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
+{
+	struct bfq_entity *entry;
+	struct rb_node **node = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*node) {
+		parent = *node;
+		entry = rb_entry(parent, struct bfq_entity, rb_node);
+
+		if (bfq_gt(entry->finish, entity->finish))
+			node = &parent->rb_left;
+		else
+			node = &parent->rb_right;
+	}
+
+	rb_link_node(&entity->rb_node, parent, node);
+	rb_insert_color(&entity->rb_node, root);
+
+	entity->tree = root;
+}
+
+/**
+ * bfq_update_min - update the min_start field of a entity.
+ * @entity: the entity to update.
+ * @node: one of its children.
+ *
+ * This function is called when @entity may store an invalid value for
+ * min_start due to updates to the active tree.  The function  assumes
+ * that the subtree rooted at @node (which may be its left or its right
+ * child) has a valid min_start value.
+ */
+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
+{
+	struct bfq_entity *child;
+
+	if (node) {
+		child = rb_entry(node, struct bfq_entity, rb_node);
+		if (bfq_gt(entity->min_start, child->min_start))
+			entity->min_start = child->min_start;
+	}
+}
+
+/**
+ * bfq_update_active_node - recalculate min_start.
+ * @node: the node to update.
+ *
+ * @node may have changed position or one of its children may have moved,
+ * this function updates its min_start value.  The left and right subtrees
+ * are assumed to hold a correct min_start value.
+ */
+static void bfq_update_active_node(struct rb_node *node)
+{
+	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
+
+	entity->min_start = entity->start;
+	bfq_update_min(entity, node->rb_right);
+	bfq_update_min(entity, node->rb_left);
+}
+
+/**
+ * bfq_update_active_tree - update min_start for the whole active tree.
+ * @node: the starting node.
+ *
+ * @node must be the deepest modified node after an update.  This function
+ * updates its min_start using the values held by its children, assuming
+ * that they did not change, and then updates all the nodes that may have
+ * changed in the path to the root.  The only nodes that may have changed
+ * are the ones in the path or their siblings.
+ */
+static void bfq_update_active_tree(struct rb_node *node)
+{
+	struct rb_node *parent;
+
+up:
+	bfq_update_active_node(node);
+
+	parent = rb_parent(node);
+	if (!parent)
+		return;
+
+	if (node == parent->rb_left && parent->rb_right)
+		bfq_update_active_node(parent->rb_right);
+	else if (parent->rb_left)
+		bfq_update_active_node(parent->rb_left);
+
+	node = parent;
+	goto up;
+}
+
+/**
+ * bfq_active_insert - insert an entity in the active tree of its
+ *                     group/device.
+ * @st: the service tree of the entity.
+ * @entity: the entity being inserted.
+ *
+ * The active tree is ordered by finish time, but an extra key is kept
+ * per each node, containing the minimum value for the start times of
+ * its children (and the node itself), so it's possible to search for
+ * the eligible node with the lowest finish time in logarithmic time.
+ */
+static void bfq_active_insert(struct bfq_service_tree *st,
+			      struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *node = &entity->rb_node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_sched_data *sd = NULL;
+	struct bfq_group *bfqg = NULL;
+	struct bfq_data *bfqd = NULL;
+#endif
+
+	bfq_insert(&st->active, entity);
+
+	if (node->rb_left)
+		node = node->rb_left;
+	else if (node->rb_right)
+		node = node->rb_right;
+
+	bfq_update_active_tree(node);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	sd = entity->sched_data;
+	bfqg = container_of(sd, struct bfq_group, sched_data);
+	bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+	if (bfqq)
+		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else /* bfq_group */
+		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
+
+	if (bfqg != bfqd->root_group)
+		bfqg->active_entities++;
+#endif
+}
+
+/**
+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
+ * @ioprio: the ioprio value to convert.
+ */
+unsigned short bfq_ioprio_to_weight(int ioprio)
+{
+	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
+}
+
+/**
+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
+ * @weight: the weight value to convert.
+ *
+ * To preserve as much as possible the old only-ioprio user interface,
+ * 0 is used as an escape ioprio value for weights (numerically) equal or
+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
+ */
+static unsigned short bfq_weight_to_ioprio(int weight)
+{
+	return max_t(int, 0,
+		     IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
+}
+
+static void bfq_get_entity(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	if (bfqq) {
+		bfqq->ref++;
+		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
+			     bfqq, bfqq->ref);
+	}
+}
+
+/**
+ * bfq_find_deepest - find the deepest node that an extraction can modify.
+ * @node: the node being removed.
+ *
+ * Do the first step of an extraction in an rb tree, looking for the
+ * node that will replace @node, and returning the deepest node that
+ * the following modifications to the tree can touch.  If @node is the
+ * last node in the tree return %NULL.
+ */
+static struct rb_node *bfq_find_deepest(struct rb_node *node)
+{
+	struct rb_node *deepest;
+
+	if (!node->rb_right && !node->rb_left)
+		deepest = rb_parent(node);
+	else if (!node->rb_right)
+		deepest = node->rb_left;
+	else if (!node->rb_left)
+		deepest = node->rb_right;
+	else {
+		deepest = rb_next(node);
+		if (deepest->rb_right)
+			deepest = deepest->rb_right;
+		else if (rb_parent(deepest) != node)
+			deepest = rb_parent(deepest);
+	}
+
+	return deepest;
+}
+
+/**
+ * bfq_active_extract - remove an entity from the active tree.
+ * @st: the service_tree containing the tree.
+ * @entity: the entity being removed.
+ */
+static void bfq_active_extract(struct bfq_service_tree *st,
+			       struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_sched_data *sd = NULL;
+	struct bfq_group *bfqg = NULL;
+	struct bfq_data *bfqd = NULL;
+#endif
+
+	node = bfq_find_deepest(&entity->rb_node);
+	bfq_extract(&st->active, entity);
+
+	if (node)
+		bfq_update_active_tree(node);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	sd = entity->sched_data;
+	bfqg = container_of(sd, struct bfq_group, sched_data);
+	bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+	if (bfqq)
+		list_del(&bfqq->bfqq_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else /* bfq_group */
+		bfq_weights_tree_remove(bfqd, entity,
+					&bfqd->group_weights_tree);
+
+	if (bfqg != bfqd->root_group)
+		bfqg->active_entities--;
+#endif
+}
+
+/**
+ * bfq_idle_insert - insert an entity into the idle tree.
+ * @st: the service tree containing the tree.
+ * @entity: the entity to insert.
+ */
+static void bfq_idle_insert(struct bfq_service_tree *st,
+			    struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct bfq_entity *first_idle = st->first_idle;
+	struct bfq_entity *last_idle = st->last_idle;
+
+	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
+		st->first_idle = entity;
+	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
+		st->last_idle = entity;
+
+	bfq_insert(&st->idle, entity);
+
+	if (bfqq)
+		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
+}
+
+/**
+ * bfq_forget_entity - do not consider entity any longer for scheduling
+ * @st: the service tree.
+ * @entity: the entity being removed.
+ * @is_in_service: true if entity is currently the in-service entity.
+ *
+ * Forget everything about @entity. In addition, if entity represents
+ * a queue, and the latter is not in service, then release the service
+ * reference to the queue (the one taken through bfq_get_entity). In
+ * fact, in this case, there is really no more service reference to
+ * the queue, as the latter is also outside any service tree. If,
+ * instead, the queue is in service, then __bfq_bfqd_reset_in_service
+ * will take care of putting the reference when the queue finally
+ * stops being served.
+ */
+static void bfq_forget_entity(struct bfq_service_tree *st,
+			      struct bfq_entity *entity,
+			      bool is_in_service)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	entity->on_st = false;
+	st->wsum -= entity->weight;
+	if (bfqq && !is_in_service)
+		bfq_put_queue(bfqq);
+}
+
+/**
+ * bfq_put_idle_entity - release the idle tree ref of an entity.
+ * @st: service tree for the entity.
+ * @entity: the entity being released.
+ */
+void bfq_put_idle_entity(struct bfq_service_tree *st, struct bfq_entity *entity)
+{
+	bfq_idle_extract(st, entity);
+	bfq_forget_entity(st, entity,
+			  entity == entity->sched_data->in_service_entity);
+}
+
+/**
+ * bfq_forget_idle - update the idle tree if necessary.
+ * @st: the service tree to act upon.
+ *
+ * To preserve the global O(log N) complexity we only remove one entry here;
+ * as the idle tree will not grow indefinitely this can be done safely.
+ */
+static void bfq_forget_idle(struct bfq_service_tree *st)
+{
+	struct bfq_entity *first_idle = st->first_idle;
+	struct bfq_entity *last_idle = st->last_idle;
+
+	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
+	    !bfq_gt(last_idle->finish, st->vtime)) {
+		/*
+		 * Forget the whole idle tree, increasing the vtime past
+		 * the last finish time of idle entities.
+		 */
+		st->vtime = last_idle->finish;
+	}
+
+	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
+		bfq_put_idle_entity(st, first_idle);
+}
+
+struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity)
+{
+	struct bfq_sched_data *sched_data = entity->sched_data;
+	unsigned int idx = bfq_class_idx(entity);
+
+	return sched_data->service_tree + idx;
+}
+
+
+struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+				struct bfq_entity *entity)
+{
+	struct bfq_service_tree *new_st = old_st;
+
+	if (entity->prio_changed) {
+		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+		unsigned int prev_weight, new_weight;
+		struct bfq_data *bfqd = NULL;
+		struct rb_root *root;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		struct bfq_sched_data *sd;
+		struct bfq_group *bfqg;
+#endif
+
+		if (bfqq)
+			bfqd = bfqq->bfqd;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		else {
+			sd = entity->my_sched_data;
+			bfqg = container_of(sd, struct bfq_group, sched_data);
+			bfqd = (struct bfq_data *)bfqg->bfqd;
+		}
+#endif
+
+		old_st->wsum -= entity->weight;
+
+		if (entity->new_weight != entity->orig_weight) {
+			if (entity->new_weight < BFQ_MIN_WEIGHT ||
+			    entity->new_weight > BFQ_MAX_WEIGHT) {
+				pr_crit("update_weight_prio: new_weight %d\n",
+					entity->new_weight);
+				if (entity->new_weight < BFQ_MIN_WEIGHT)
+					entity->new_weight = BFQ_MIN_WEIGHT;
+				else
+					entity->new_weight = BFQ_MAX_WEIGHT;
+			}
+			entity->orig_weight = entity->new_weight;
+			if (bfqq)
+				bfqq->ioprio =
+				  bfq_weight_to_ioprio(entity->orig_weight);
+		}
+
+		if (bfqq)
+			bfqq->ioprio_class = bfqq->new_ioprio_class;
+		entity->prio_changed = 0;
+
+		/*
+		 * NOTE: here we may be changing the weight too early,
+		 * this will cause unfairness.  The correct approach
+		 * would have required additional complexity to defer
+		 * weight changes to the proper time instants (i.e.,
+		 * when entity->finish <= old_st->vtime).
+		 */
+		new_st = bfq_entity_service_tree(entity);
+
+		prev_weight = entity->weight;
+		new_weight = entity->orig_weight *
+			     (bfqq ? bfqq->wr_coeff : 1);
+		/*
+		 * If the weight of the entity changes, remove the entity
+		 * from its old weight counter (if there is a counter
+		 * associated with the entity), and add it to the counter
+		 * associated with its new weight.
+		 */
+		if (prev_weight != new_weight) {
+			root = bfqq ? &bfqd->queue_weights_tree :
+				      &bfqd->group_weights_tree;
+			bfq_weights_tree_remove(bfqd, entity, root);
+		}
+		entity->weight = new_weight;
+		/*
+		 * Add the entity to its weights tree only if it is
+		 * not associated with a weight-raised queue.
+		 */
+		if (prev_weight != new_weight &&
+		    (bfqq ? bfqq->wr_coeff == 1 : 1))
+			/* If we get here, root has been initialized. */
+			bfq_weights_tree_add(bfqd, entity, root);
+
+		new_st->wsum += entity->weight;
+
+		if (new_st != old_st)
+			entity->start = new_st->vtime;
+	}
+
+	return new_st;
+}
+
+/**
+ * bfq_bfqq_served - update the scheduler status after selection for
+ *                   service.
+ * @bfqq: the queue being served.
+ * @served: bytes to transfer.
+ *
+ * NOTE: this can be optimized, as the timestamps of upper level entities
+ * are synchronized every time a new bfqq is selected for service.  By now,
+ * we keep it to better check consistency.
+ */
+void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_service_tree *st;
+
+	for_each_entity(entity) {
+		st = bfq_entity_service_tree(entity);
+
+		entity->service += served;
+
+		st->vtime += bfq_delta(served, st->wsum);
+		bfq_forget_idle(st);
+	}
+	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
+}
+
+/**
+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
+ *			  of the time interval during which bfqq has been in
+ *			  service.
+ * @bfqd: the device
+ * @bfqq: the queue that needs a service update.
+ * @time_ms: the amount of time during which the queue has received service
+ *
+ * If a queue does not consume its budget fast enough, then providing
+ * the queue with service fairness may impair throughput, more or less
+ * severely. For this reason, queues that consume their budget slowly
+ * are provided with time fairness instead of service fairness. This
+ * goal is achieved through the BFQ scheduling engine, even if such an
+ * engine works in the service, and not in the time domain. The trick
+ * is charging these queues with an inflated amount of service, equal
+ * to the amount of service that they would have received during their
+ * service slot if they had been fast, i.e., if their requests had
+ * been dispatched at a rate equal to the estimated peak rate.
+ *
+ * It is worth noting that time fairness can cause important
+ * distortions in terms of bandwidth distribution, on devices with
+ * internal queueing. The reason is that I/O requests dispatched
+ * during the service slot of a queue may be served after that service
+ * slot is finished, and may have a total processing time loosely
+ * correlated with the duration of the service slot. This is
+ * especially true for short service slots.
+ */
+void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  unsigned long time_ms)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	int tot_serv_to_charge = entity->service;
+	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
+
+	if (time_ms > 0 && time_ms < timeout_ms)
+		tot_serv_to_charge =
+			(bfqd->bfq_max_budget * time_ms) / timeout_ms;
+
+	if (tot_serv_to_charge < entity->service)
+		tot_serv_to_charge = entity->service;
+
+	/* Increase budget to avoid inconsistencies */
+	if (tot_serv_to_charge > entity->budget)
+		entity->budget = tot_serv_to_charge;
+
+	bfq_bfqq_served(bfqq,
+			max_t(int, 0, tot_serv_to_charge - entity->service));
+}
+
+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
+					struct bfq_service_tree *st,
+					bool backshifted)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	st = __bfq_entity_update_weight_prio(st, entity);
+	bfq_calc_finish(entity, entity->budget);
+
+	/*
+	 * If some queues enjoy backshifting for a while, then their
+	 * (virtual) finish timestamps may happen to become lower and
+	 * lower than the system virtual time.	In particular, if
+	 * these queues often happen to be idle for short time
+	 * periods, and during such time periods other queues with
+	 * higher timestamps happen to be busy, then the backshifted
+	 * timestamps of the former queues can become much lower than
+	 * the system virtual time. In fact, to serve the queues with
+	 * higher timestamps while the ones with lower timestamps are
+	 * idle, the system virtual time may be pushed-up to much
+	 * higher values than the finish timestamps of the idle
+	 * queues. As a consequence, the finish timestamps of all new
+	 * or newly activated queues may end up being much larger than
+	 * those of lucky queues with backshifted timestamps. The
+	 * latter queues may then monopolize the device for a lot of
+	 * time. This would simply break service guarantees.
+	 *
+	 * To reduce this problem, push up a little bit the
+	 * backshifted timestamps of the queue associated with this
+	 * entity (only a queue can happen to have the backshifted
+	 * flag set): just enough to let the finish timestamp of the
+	 * queue be equal to the current value of the system virtual
+	 * time. This may introduce a little unfairness among queues
+	 * with backshifted timestamps, but it does not break
+	 * worst-case fairness guarantees.
+	 *
+	 * As a special case, if bfqq is weight-raised, push up
+	 * timestamps much less, to keep very low the probability that
+	 * this push up causes the backshifted finish timestamps of
+	 * weight-raised queues to become higher than the backshifted
+	 * finish timestamps of non weight-raised queues.
+	 */
+	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
+		unsigned long delta = st->vtime - entity->finish;
+
+		if (bfqq)
+			delta /= bfqq->wr_coeff;
+
+		entity->start += delta;
+		entity->finish += delta;
+	}
+
+	bfq_active_insert(st, entity);
+}
+
+/**
+ * __bfq_activate_entity - handle activation of entity.
+ * @entity: the entity being activated.
+ * @non_blocking_wait_rq: true if entity was waiting for a request
+ *
+ * Called for a 'true' activation, i.e., if entity is not active and
+ * one of its children receives a new request.
+ *
+ * Basically, this function updates the timestamps of entity and
+ * inserts entity into its active tree, ater possible extracting it
+ * from its idle tree.
+ */
+static void __bfq_activate_entity(struct bfq_entity *entity,
+				  bool non_blocking_wait_rq)
+{
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	bool backshifted = false;
+	unsigned long long min_vstart;
+
+	/* See comments on bfq_fqq_update_budg_for_activation */
+	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
+		backshifted = true;
+		min_vstart = entity->finish;
+	} else
+		min_vstart = st->vtime;
+
+	if (entity->tree == &st->idle) {
+		/*
+		 * Must be on the idle tree, bfq_idle_extract() will
+		 * check for that.
+		 */
+		bfq_idle_extract(st, entity);
+		entity->start = bfq_gt(min_vstart, entity->finish) ?
+			min_vstart : entity->finish;
+	} else {
+		/*
+		 * The finish time of the entity may be invalid, and
+		 * it is in the past for sure, otherwise the queue
+		 * would have been on the idle tree.
+		 */
+		entity->start = min_vstart;
+		st->wsum += entity->weight;
+		/*
+		 * entity is about to be inserted into a service tree,
+		 * and then set in service: get a reference to make
+		 * sure entity does not disappear until it is no
+		 * longer in service or scheduled for service.
+		 */
+		bfq_get_entity(entity);
+
+		entity->on_st = true;
+	}
+
+	bfq_update_fin_time_enqueue(entity, st, backshifted);
+}
+
+/**
+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
+ * @entity: the entity being requeued or repositioned.
+ *
+ * Requeueing is needed if this entity stops being served, which
+ * happens if a leaf descendant entity has expired. On the other hand,
+ * repositioning is needed if the next_inservice_entity for the child
+ * entity has changed. See the comments inside the function for
+ * details.
+ *
+ * Basically, this function: 1) removes entity from its active tree if
+ * present there, 2) updates the timestamps of entity and 3) inserts
+ * entity back into its active tree (in the new, right position for
+ * the new values of the timestamps).
+ */
+static void __bfq_requeue_entity(struct bfq_entity *entity)
+{
+	struct bfq_sched_data *sd = entity->sched_data;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+	if (entity == sd->in_service_entity) {
+		/*
+		 * We are requeueing the current in-service entity,
+		 * which may have to be done for one of the following
+		 * reasons:
+		 * - entity represents the in-service queue, and the
+		 *   in-service queue is being requeued after an
+		 *   expiration;
+		 * - entity represents a group, and its budget has
+		 *   changed because one of its child entities has
+		 *   just been either activated or requeued for some
+		 *   reason; the timestamps of the entity need then to
+		 *   be updated, and the entity needs to be enqueued
+		 *   or repositioned accordingly.
+		 *
+		 * In particular, before requeueing, the start time of
+		 * the entity must be moved forward to account for the
+		 * service that the entity has received while in
+		 * service. This is done by the next instructions. The
+		 * finish time will then be updated according to this
+		 * new value of the start time, and to the budget of
+		 * the entity.
+		 */
+		bfq_calc_finish(entity, entity->service);
+		entity->start = entity->finish;
+		/*
+		 * In addition, if the entity had more than one child
+		 * when set in service, then was not extracted from
+		 * the active tree. This implies that the position of
+		 * the entity in the active tree may need to be
+		 * changed now, because we have just updated the start
+		 * time of the entity, and we will update its finish
+		 * time in a moment (the requeueing is then, more
+		 * precisely, a repositioning in this case). To
+		 * implement this repositioning, we: 1) dequeue the
+		 * entity here, 2) update the finish time and
+		 * requeue the entity according to the new
+		 * timestamps below.
+		 */
+		if (entity->tree)
+			bfq_active_extract(st, entity);
+	} else { /* The entity is already active, and not in service */
+		/*
+		 * In this case, this function gets called only if the
+		 * next_in_service entity below this entity has
+		 * changed, and this change has caused the budget of
+		 * this entity to change, which, finally implies that
+		 * the finish time of this entity must be
+		 * updated. Such an update may cause the scheduling,
+		 * i.e., the position in the active tree, of this
+		 * entity to change. We handle this change by: 1)
+		 * dequeueing the entity here, 2) updating the finish
+		 * time and requeueing the entity according to the new
+		 * timestamps below. This is the same approach as the
+		 * non-extracted-entity sub-case above.
+		 */
+		bfq_active_extract(st, entity);
+	}
+
+	bfq_update_fin_time_enqueue(entity, st, false);
+}
+
+static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
+					  struct bfq_sched_data *sd,
+					  bool non_blocking_wait_rq)
+{
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+	if (sd->in_service_entity == entity || entity->tree == &st->active)
+		 /*
+		  * in service or already queued on the active tree,
+		  * requeue or reposition
+		  */
+		__bfq_requeue_entity(entity);
+	else
+		/*
+		 * Not in service and not queued on its active tree:
+		 * the activity is idle and this is a true activation.
+		 */
+		__bfq_activate_entity(entity, non_blocking_wait_rq);
+}
+
+
+/**
+ * bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
+ *			 and activate, requeue or reposition all ancestors
+ *			 for which such an update becomes necessary.
+ * @entity: the entity to activate.
+ * @non_blocking_wait_rq: true if this entity was waiting for a request
+ * @requeue: true if this is a requeue, which implies that bfqq is
+ *	     being expired; thus ALL its ancestors stop being served and must
+ *	     therefore be requeued
+ */
+static void bfq_activate_requeue_entity(struct bfq_entity *entity,
+					bool non_blocking_wait_rq,
+					bool requeue)
+{
+	struct bfq_sched_data *sd;
+
+	for_each_entity(entity) {
+		sd = entity->sched_data;
+		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
+
+		if (!bfq_update_next_in_service(sd, entity) && !requeue)
+			break;
+	}
+}
+
+/**
+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
+ * @entity: the entity to deactivate.
+ * @ins_into_idle_tree: if false, the entity will not be put into the
+ *			idle tree.
+ *
+ * Deactivates an entity, independently from its previous state.  Must
+ * be invoked only if entity is on a service tree. Extracts the entity
+ * from that tree, and if necessary and allowed, puts it on the idle
+ * tree.
+ */
+bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
+{
+	struct bfq_sched_data *sd = entity->sched_data;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	int is_in_service = entity == sd->in_service_entity;
+
+	if (!entity->on_st) /* entity never activated, or already inactive */
+		return false;
+
+	if (is_in_service)
+		bfq_calc_finish(entity, entity->service);
+
+	if (entity->tree == &st->active)
+		bfq_active_extract(st, entity);
+	else if (!is_in_service && entity->tree == &st->idle)
+		bfq_idle_extract(st, entity);
+
+	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
+		bfq_forget_entity(st, entity, is_in_service);
+	else
+		bfq_idle_insert(st, entity);
+
+	return true;
+}
+
+/**
+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
+ * @entity: the entity to deactivate.
+ * @ins_into_idle_tree: true if the entity can be put on the idle tree
+ */
+static void bfq_deactivate_entity(struct bfq_entity *entity,
+				  bool ins_into_idle_tree,
+				  bool expiration)
+{
+	struct bfq_sched_data *sd;
+	struct bfq_entity *parent = NULL;
+
+	for_each_entity_safe(entity, parent) {
+		sd = entity->sched_data;
+
+		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
+			/*
+			 * entity is not in any tree any more, so
+			 * this deactivation is a no-op, and there is
+			 * nothing to change for upper-level entities
+			 * (in case of expiration, this can never
+			 * happen).
+			 */
+			return;
+		}
+
+		if (sd->next_in_service == entity)
+			/*
+			 * entity was the next_in_service entity,
+			 * then, since entity has just been
+			 * deactivated, a new one must be found.
+			 */
+			bfq_update_next_in_service(sd, NULL);
+
+		if (sd->next_in_service)
+			/*
+			 * The parent entity is still backlogged,
+			 * because next_in_service is not NULL. So, no
+			 * further upwards deactivation must be
+			 * performed.  Yet, next_in_service has
+			 * changed.  Then the schedule does need to be
+			 * updated upwards.
+			 */
+			break;
+
+		/*
+		 * If we get here, then the parent is no more
+		 * backlogged and we need to propagate the
+		 * deactivation upwards. Thus let the loop go on.
+		 */
+
+		/*
+		 * Also let parent be queued into the idle tree on
+		 * deactivation, to preserve service guarantees, and
+		 * assuming that who invoked this function does not
+		 * need parent entities too to be removed completely.
+		 */
+		ins_into_idle_tree = true;
+	}
+
+	/*
+	 * If the deactivation loop is fully executed, then there are
+	 * no more entities to touch and next loop is not executed at
+	 * all. Otherwise, requeue remaining entities if they are
+	 * about to stop receiving service, or reposition them if this
+	 * is not the case.
+	 */
+	entity = parent;
+	for_each_entity(entity) {
+		/*
+		 * Invoke __bfq_requeue_entity on entity, even if
+		 * already active, to requeue/reposition it in the
+		 * active tree (because sd->next_in_service has
+		 * changed)
+		 */
+		__bfq_requeue_entity(entity);
+
+		sd = entity->sched_data;
+		if (!bfq_update_next_in_service(sd, entity) &&
+		    !expiration)
+			/*
+			 * next_in_service unchanged or not causing
+			 * any change in entity->parent->sd, and no
+			 * requeueing needed for expiration: stop
+			 * here.
+			 */
+			break;
+	}
+}
+
+/**
+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
+ *                       if needed, to have at least one entity eligible.
+ * @st: the service tree to act upon.
+ *
+ * Assumes that st is not empty.
+ */
+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
+{
+	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
+
+	if (bfq_gt(root_entity->min_start, st->vtime))
+		return root_entity->min_start;
+
+	return st->vtime;
+}
+
+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
+{
+	if (new_value > st->vtime) {
+		st->vtime = new_value;
+		bfq_forget_idle(st);
+	}
+}
+
+/**
+ * bfq_first_active_entity - find the eligible entity with
+ *                           the smallest finish time
+ * @st: the service tree to select from.
+ * @vtime: the system virtual to use as a reference for eligibility
+ *
+ * This function searches the first schedulable entity, starting from the
+ * root of the tree and going on the left every time on this side there is
+ * a subtree with at least one eligible (start >= vtime) entity. The path on
+ * the right is followed only if a) the left subtree contains no eligible
+ * entities and b) no eligible entity has been found yet.
+ */
+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
+						  u64 vtime)
+{
+	struct bfq_entity *entry, *first = NULL;
+	struct rb_node *node = st->active.rb_node;
+
+	while (node) {
+		entry = rb_entry(node, struct bfq_entity, rb_node);
+left:
+		if (!bfq_gt(entry->start, vtime))
+			first = entry;
+
+		if (node->rb_left) {
+			entry = rb_entry(node->rb_left,
+					 struct bfq_entity, rb_node);
+			if (!bfq_gt(entry->min_start, vtime)) {
+				node = node->rb_left;
+				goto left;
+			}
+		}
+		if (first)
+			break;
+		node = node->rb_right;
+	}
+
+	return first;
+}
+
+/**
+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
+ * @st: the service tree.
+ *
+ * If there is no in-service entity for the sched_data st belongs to,
+ * then return the entity that will be set in service if:
+ * 1) the parent entity this st belongs to is set in service;
+ * 2) no entity belonging to such parent entity undergoes a state change
+ * that would influence the timestamps of the entity (e.g., becomes idle,
+ * becomes backlogged, changes its budget, ...).
+ *
+ * In this first case, update the virtual time in @st too (see the
+ * comments on this update inside the function).
+ *
+ * In constrast, if there is an in-service entity, then return the
+ * entity that would be set in service if not only the above
+ * conditions, but also the next one held true: the currently
+ * in-service entity, on expiration,
+ * 1) gets a finish time equal to the current one, or
+ * 2) is not eligible any more, or
+ * 3) is idle.
+ */
+static struct bfq_entity *
+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
+{
+	struct bfq_entity *entity;
+	u64 new_vtime;
+
+	if (RB_EMPTY_ROOT(&st->active))
+		return NULL;
+
+	/*
+	 * Get the value of the system virtual time for which at
+	 * least one entity is eligible.
+	 */
+	new_vtime = bfq_calc_vtime_jump(st);
+
+	/*
+	 * If there is no in-service entity for the sched_data this
+	 * active tree belongs to, then push the system virtual time
+	 * up to the value that guarantees that at least one entity is
+	 * eligible. If, instead, there is an in-service entity, then
+	 * do not make any such update, because there is already an
+	 * eligible entity, namely the in-service one (even if the
+	 * entity is not on st, because it was extracted when set in
+	 * service).
+	 */
+	if (!in_service)
+		bfq_update_vtime(st, new_vtime);
+
+	entity = bfq_first_active_entity(st, new_vtime);
+
+	return entity;
+}
+
+/**
+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
+ * @sd: the sched_data.
+ *
+ * This function is invoked when there has been a change in the trees
+ * for sd, and we need know what is the new next entity after this
+ * change.
+ */
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
+{
+	struct bfq_service_tree *st = sd->service_tree;
+	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
+	struct bfq_entity *entity = NULL;
+	int class_idx = 0;
+
+	/*
+	 * Choose from idle class, if needed to guarantee a minimum
+	 * bandwidth to this class (and if there is some active entity
+	 * in idle class). This should also mitigate
+	 * priority-inversion problems in case a low priority task is
+	 * holding file system resources.
+	 */
+	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
+				   BFQ_CL_IDLE_TIMEOUT)) {
+		if (!RB_EMPTY_ROOT(&idle_class_st->active))
+			class_idx = BFQ_IOPRIO_CLASSES - 1;
+		/* About to be served if backlogged, or not yet backlogged */
+		sd->bfq_class_idle_last_service = jiffies;
+	}
+
+	/*
+	 * Find the next entity to serve for the highest-priority
+	 * class, unless the idle class needs to be served.
+	 */
+	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
+		entity = __bfq_lookup_next_entity(st + class_idx,
+						  sd->in_service_entity);
+
+		if (entity)
+			break;
+	}
+
+	if (!entity)
+		return NULL;
+
+	return entity;
+}
+
+bool next_queue_may_preempt(struct bfq_data *bfqd)
+{
+	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
+
+	return sd->next_in_service != sd->in_service_entity;
+}
+
+/*
+ * Get next queue for service.
+ */
+struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
+{
+	struct bfq_entity *entity = NULL;
+	struct bfq_sched_data *sd;
+	struct bfq_queue *bfqq;
+
+	if (bfqd->busy_queues == 0)
+		return NULL;
+
+	/*
+	 * Traverse the path from the root to the leaf entity to
+	 * serve. Set in service all the entities visited along the
+	 * way.
+	 */
+	sd = &bfqd->root_group->sched_data;
+	for (; sd ; sd = entity->my_sched_data) {
+		/*
+		 * WARNING. We are about to set the in-service entity
+		 * to sd->next_in_service, i.e., to the (cached) value
+		 * returned by bfq_lookup_next_entity(sd) the last
+		 * time it was invoked, i.e., the last time when the
+		 * service order in sd changed as a consequence of the
+		 * activation or deactivation of an entity. In this
+		 * respect, if we execute bfq_lookup_next_entity(sd)
+		 * in this very moment, it may, although with low
+		 * probability, yield a different entity than that
+		 * pointed to by sd->next_in_service. This rare event
+		 * happens in case there was no CLASS_IDLE entity to
+		 * serve for sd when bfq_lookup_next_entity(sd) was
+		 * invoked for the last time, while there is now one
+		 * such entity.
+		 *
+		 * If the above event happens, then the scheduling of
+		 * such entity in CLASS_IDLE is postponed until the
+		 * service of the sd->next_in_service entity
+		 * finishes. In fact, when the latter is expired,
+		 * bfq_lookup_next_entity(sd) gets called again,
+		 * exactly to update sd->next_in_service.
+		 */
+
+		/* Make next_in_service entity become in_service_entity */
+		entity = sd->next_in_service;
+		sd->in_service_entity = entity;
+
+		/*
+		 * Reset the accumulator of the amount of service that
+		 * the entity is about to receive.
+		 */
+		entity->service = 0;
+
+		/*
+		 * If entity is no longer a candidate for next
+		 * service, then we extract it from its active tree,
+		 * for the following reason. To further boost the
+		 * throughput in some special case, BFQ needs to know
+		 * which is the next candidate entity to serve, while
+		 * there is already an entity in service. In this
+		 * respect, to make it easy to compute/update the next
+		 * candidate entity to serve after the current
+		 * candidate has been set in service, there is a case
+		 * where it is necessary to extract the current
+		 * candidate from its service tree. Such a case is
+		 * when the entity just set in service cannot be also
+		 * a candidate for next service. Details about when
+		 * this conditions holds are reported in the comments
+		 * on the function bfq_no_longer_next_in_service()
+		 * invoked below.
+		 */
+		if (bfq_no_longer_next_in_service(entity))
+			bfq_active_extract(bfq_entity_service_tree(entity),
+					   entity);
+
+		/*
+		 * For the same reason why we may have just extracted
+		 * entity from its active tree, we may need to update
+		 * next_in_service for the sched_data of entity too,
+		 * regardless of whether entity has been extracted.
+		 * In fact, even if entity has not been extracted, a
+		 * descendant entity may get extracted. Such an event
+		 * would cause a change in next_in_service for the
+		 * level of the descendant entity, and thus possibly
+		 * back to upper levels.
+		 *
+		 * We cannot perform the resulting needed update
+		 * before the end of this loop, because, to know which
+		 * is the correct next-to-serve candidate entity for
+		 * each level, we need first to find the leaf entity
+		 * to set in service. In fact, only after we know
+		 * which is the next-to-serve leaf entity, we can
+		 * discover whether the parent entity of the leaf
+		 * entity becomes the next-to-serve, and so on.
+		 */
+
+	}
+
+	bfqq = bfq_entity_to_bfqq(entity);
+
+	/*
+	 * We can finally update all next-to-serve entities along the
+	 * path from the leaf entity just set in service to the root.
+	 */
+	for_each_entity(entity) {
+		struct bfq_sched_data *sd = entity->sched_data;
+
+		if (!bfq_update_next_in_service(sd, NULL))
+			break;
+	}
+
+	return bfqq;
+}
+
+void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+{
+	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
+	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
+	struct bfq_entity *entity = in_serv_entity;
+
+	bfq_clear_bfqq_wait_request(in_serv_bfqq);
+	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+	bfqd->in_service_queue = NULL;
+
+	/*
+	 * When this function is called, all in-service entities have
+	 * been properly deactivated or requeued, so we can safely
+	 * execute the final step: reset in_service_entity along the
+	 * path from entity to the root.
+	 */
+	for_each_entity(entity)
+		entity->sched_data->in_service_entity = NULL;
+
+	/*
+	 * in_serv_entity is no longer in service, so, if it is in no
+	 * service tree either, then release the service reference to
+	 * the queue it represents (taken with bfq_get_entity).
+	 */
+	if (!in_serv_entity->on_st)
+		bfq_put_queue(in_serv_bfqq);
+}
+
+void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			 bool ins_into_idle_tree, bool expiration)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
+}
+
+void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
+				    false);
+	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_activate_requeue_entity(entity, false,
+				    bfqq == bfqd->in_service_queue);
+}
+
+/*
+ * Called when the bfqq no longer has requests pending, remove it from
+ * the service tree. As a special case, it can be invoked during an
+ * expiration.
+ */
+void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		       bool expiration)
+{
+	bfq_log_bfqq(bfqd, bfqq, "del from busy");
+
+	bfq_clear_bfqq_busy(bfqq);
+
+	bfqd->busy_queues--;
+
+	if (!bfqq->dispatched)
+		bfq_weights_tree_remove(bfqd, &bfqq->entity,
+					&bfqd->queue_weights_tree);
+
+	if (bfqq->wr_coeff > 1)
+		bfqd->wr_busy_queues--;
+
+	bfqg_stats_update_dequeue(bfqq_group(bfqq));
+
+	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
+}
+
+/*
+ * Called when an inactive queue receives a new request.
+ */
+void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	bfq_log_bfqq(bfqd, bfqq, "add to busy");
+
+	bfq_activate_bfqq(bfqd, bfqq);
+
+	bfq_mark_bfqq_busy(bfqq);
+	bfqd->busy_queues++;
+
+	if (!bfqq->dispatched)
+		if (bfqq->wr_coeff == 1)
+			bfq_weights_tree_add(bfqd, &bfqq->entity,
+					     &bfqd->queue_weights_tree);
+
+	if (bfqq->wr_coeff > 1)
+		bfqd->wr_busy_queues++;
+}
-- 
2.10.0

^ permalink raw reply related

* [PATCH V2 15/16] block, bfq: remove all get and put of I/O contexts
From: Paolo Valente @ 2017-03-31 12:47 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo
  Cc: Fabio Checconi, Arianna Avanzini, linux-block, linux-kernel,
	ulf.hansson, linus.walleij, broonie, Paolo Valente
In-Reply-To: <20170331124743.3530-1-paolo.valente@linaro.org>

When a bfq queue is set in service and when it is merged, a reference
to the I/O context associated with the queue is taken. This reference
is then released when the queue is deselected from service or
split. More precisely, the release of the reference is postponed to
when the scheduler lock is released, to avoid nesting between the
scheduler and the I/O-context lock. In fact, such nesting would lead
to deadlocks, because of other code paths that take the same locks in
the opposite order. This postponing of I/O-context releases does
complicate code.

This commit addresses these issue by modifying involved operations in
such a way to not need to get the above I/O-context references any
more. Then it also removes any get and release of these references.

Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
---
 block/bfq-iosched.c | 143 +++++++++-------------------------------------------
 1 file changed, 23 insertions(+), 120 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 8565784..831b406 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -538,8 +538,6 @@ struct bfq_data {
 
 	/* bfq_queue in service */
 	struct bfq_queue *in_service_queue;
-	/* bfq_io_cq (bic) associated with the @in_service_queue */
-	struct bfq_io_cq *in_service_bic;
 
 	/* on-disk position of the last served request */
 	sector_t last_position;
@@ -704,15 +702,6 @@ struct bfq_data {
 	struct bfq_io_cq *bio_bic;
 	/* bfqq associated with the task issuing current bio for merging */
 	struct bfq_queue *bio_bfqq;
-
-	/*
-	 * io context to put right after bfqd->lock is released. This
-	 * filed is used to perform put_io_context, when needed, to
-	 * after the scheduler lock has been released, and thus
-	 * prevent an ioc->lock from being possibly taken while the
-	 * scheduler lock is being held.
-	 */
-	struct io_context *ioc_to_put;
 };
 
 enum bfqq_state_flags {
@@ -1148,34 +1137,6 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd)
 	}
 }
 
-/*
- * Next two functions release bfqd->lock and put the io context
- * pointed by bfqd->ioc_to_put. This delayed put is used to not risk
- * to take an ioc->lock while the scheduler lock is being held.
- */
-static void bfq_unlock_put_ioc(struct bfq_data *bfqd)
-{
-	struct io_context *ioc_to_put = bfqd->ioc_to_put;
-
-	bfqd->ioc_to_put = NULL;
-	spin_unlock_irq(&bfqd->lock);
-
-	if (ioc_to_put)
-		put_io_context(ioc_to_put);
-}
-
-static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd,
-				       unsigned long flags)
-{
-	struct io_context *ioc_to_put = bfqd->ioc_to_put;
-
-	bfqd->ioc_to_put = NULL;
-	spin_unlock_irqrestore(&bfqd->lock, flags);
-
-	if (ioc_to_put)
-		put_io_context(ioc_to_put);
-}
-
 /**
  * bfq_gt - compare two timestamps.
  * @a: first ts.
@@ -2684,18 +2645,6 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
 	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
 	struct bfq_entity *entity = in_serv_entity;
 
-	if (bfqd->in_service_bic) {
-		/*
-		 * Schedule the release of a reference to
-		 * bfqd->in_service_bic->icq.ioc to right after the
-		 * scheduler lock is released. This ioc is not
-		 * released immediately, to not risk to possibly take
-		 * an ioc->lock while holding the scheduler lock.
-		 */
-		bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc;
-		bfqd->in_service_bic = NULL;
-	}
-
 	bfq_clear_bfqq_wait_request(in_serv_bfqq);
 	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
 	bfqd->in_service_queue = NULL;
@@ -3495,7 +3444,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
 	__bfq_deactivate_entity(entity, false);
 	bfq_put_async_queues(bfqd, bfqg);
 
-	bfq_unlock_put_ioc_restore(bfqd, flags);
+	spin_unlock_irqrestore(&bfqd->lock, flags);
 	/*
 	 * @blkg is going offline and will be ignored by
 	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
@@ -5456,20 +5405,18 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
 	 * first time that the requests of some process are redirected to
 	 * it.
 	 *
-	 * We redirect bfqq to new_bfqq and not the opposite, because we
-	 * are in the context of the process owning bfqq, hence we have
-	 * the io_cq of this process. So we can immediately configure this
-	 * io_cq to redirect the requests of the process to new_bfqq.
+	 * We redirect bfqq to new_bfqq and not the opposite, because
+	 * we are in the context of the process owning bfqq, thus we
+	 * have the io_cq of this process. So we can immediately
+	 * configure this io_cq to redirect the requests of the
+	 * process to new_bfqq. In contrast, the io_cq of new_bfqq is
+	 * not available any more (new_bfqq->bic == NULL).
 	 *
-	 * NOTE, even if new_bfqq coincides with the in-service queue, the
-	 * io_cq of new_bfqq is not available, because, if the in-service
-	 * queue is shared, bfqd->in_service_bic may not point to the
-	 * io_cq of the in-service queue.
-	 * Redirecting the requests of the process owning bfqq to the
-	 * currently in-service queue is in any case the best option, as
-	 * we feed the in-service queue with new requests close to the
-	 * last request served and, by doing so, hopefully increase the
-	 * throughput.
+	 * Anyway, even in case new_bfqq coincides with the in-service
+	 * queue, redirecting requests the in-service queue is the
+	 * best option, as we feed the in-service queue with new
+	 * requests close to the last request served and, by doing so,
+	 * are likely to increase the throughput.
 	 */
 	bfqq->new_bfqq = new_bfqq;
 	new_bfqq->ref += process_refs;
@@ -5561,8 +5508,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	in_service_bfqq = bfqd->in_service_queue;
 
-	if (!in_service_bfqq || in_service_bfqq == bfqq ||
-	    !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
+	if (!in_service_bfqq || in_service_bfqq == bfqq
+	    || wr_from_too_long(in_service_bfqq) ||
 	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))
 		goto check_scheduled;
 
@@ -5613,16 +5560,6 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
 	bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
 }
 
-static void bfq_get_bic_reference(struct bfq_queue *bfqq)
-{
-	/*
-	 * If bfqq->bic has a non-NULL value, the bic to which it belongs
-	 * is about to begin using a shared bfq_queue.
-	 */
-	if (bfqq->bic)
-		atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
-}
-
 static void
 bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
 		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
@@ -5667,12 +5604,6 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
 		     bfqd->wr_busy_queues);
 
 	/*
-	 * Grab a reference to the bic, to prevent it from being destroyed
-	 * before being possibly touched by a bfq_split_bfqq().
-	 */
-	bfq_get_bic_reference(bfqq);
-	bfq_get_bic_reference(new_bfqq);
-	/*
 	 * Merge queues (that is, let bic redirect its requests to new_bfqq)
 	 */
 	bic_set_bfqq(bic, new_bfqq, 1);
@@ -5837,14 +5768,8 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
 static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 {
 	struct bfq_queue *bfqq = bfqd->in_service_queue;
-	struct bfq_io_cq *bic;
 	u32 sl;
 
-	/* Processes have exited, don't wait. */
-	bic = bfqd->in_service_bic;
-	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
-		return;
-
 	bfq_mark_bfqq_wait_request(bfqq);
 
 	/*
@@ -7131,11 +7056,6 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
 	 */
 	bfq_update_wr_data(bfqd, bfqq);
 
-	if (!bfqd->in_service_bic) {
-		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
-		bfqd->in_service_bic = RQ_BIC(rq);
-	}
-
 	/*
 	 * Expire bfqq, pretending that its budget expired, if bfqq
 	 * belongs to CLASS_IDLE and other queues are waiting for
@@ -7256,7 +7176,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	spin_lock_irq(&bfqd->lock);
 
 	rq = __bfq_dispatch_request(hctx);
-	bfq_unlock_put_ioc(bfqd);
+	spin_unlock_irq(&bfqd->lock);
 
 	return rq;
 }
@@ -7344,20 +7264,9 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
 		unsigned long flags;
 
 		spin_lock_irqsave(&bfqd->lock, flags);
-		/*
-		 * If the bic is using a shared queue, put the
-		 * reference taken on the io_context when the bic
-		 * started using a shared bfq_queue. This put cannot
-		 * make ioc->ref_count reach 0, then no ioc->lock
-		 * risks to be taken (leading to possible deadlock
-		 * scenarios).
-		 */
-		if (is_sync && bfq_bfqq_coop(bfqq))
-			put_io_context(bic->icq.ioc);
-
 		bfq_exit_bfqq(bfqd, bfqq);
 		bic_set_bfqq(bic, NULL, is_sync);
-		bfq_unlock_put_ioc_restore(bfqd, flags);
+		spin_unlock_irqrestore(&bfqd->lock, flags);
 	}
 }
 
@@ -7791,7 +7700,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 		}
 	}
 
-	bfq_unlock_put_ioc(bfqd);
+	spin_unlock_irq(&bfqd->lock);
 }
 
 static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
@@ -7945,7 +7854,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
 		bfq_completed_request(bfqq, bfqd);
 		bfq_put_rq_priv_body(bfqq);
 
-		bfq_unlock_put_ioc_restore(bfqd, flags);
+		spin_unlock_irqrestore(&bfqd->lock, flags);
 	} else {
 		/*
 		 * Request rq may be still/already in the scheduler,
@@ -8038,6 +7947,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
 	const int is_sync = rq_is_sync(rq);
 	struct bfq_queue *bfqq;
 	bool new_queue = false;
+	bool split = false;
 
 	spin_lock_irq(&bfqd->lock);
 
@@ -8061,14 +7971,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
 				bic->saved_in_large_burst = true;
 
 			bfqq = bfq_split_bfqq(bic, bfqq);
-			/*
-			 * A reference to bic->icq.ioc needs to be
-			 * released after a queue split. Do not do it
-			 * immediately, to not risk to possibly take
-			 * an ioc->lock while holding the scheduler
-			 * lock.
-			 */
-			bfqd->ioc_to_put = bic->icq.ioc;
+			split = true;
 
 			if (!bfqq)
 				bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
@@ -8093,7 +7996,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
 	 */
 	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
 		bfqq->bic = bic;
-		if (bfqd->ioc_to_put) { /* if true, there has been a split */
+		if (split) {
 			/*
 			 * The queue has just been split from a shared
 			 * queue: restore the idle window and the
@@ -8106,7 +8009,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
 	if (unlikely(bfq_bfqq_just_created(bfqq)))
 		bfq_handle_burst(bfqd, bfqq);
 
-	bfq_unlock_put_ioc(bfqd);
+	spin_unlock_irq(&bfqd->lock);
 
 	return 0;
 
@@ -8151,7 +8054,7 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
 	bfq_bfqq_expire(bfqd, bfqq, true, reason);
 
 schedule_dispatch:
-	bfq_unlock_put_ioc_restore(bfqd, flags);
+	spin_unlock_irqrestore(&bfqd->lock, flags);
 	bfq_schedule_dispatch(bfqd);
 }
 
-- 
2.10.0

^ permalink raw reply related

* [PATCH V2 14/16] block, bfq: handle bursts of queue activations
From: Paolo Valente @ 2017-03-31 12:47 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo
  Cc: Fabio Checconi, Arianna Avanzini, linux-block, linux-kernel,
	ulf.hansson, linus.walleij, broonie, Paolo Valente
In-Reply-To: <20170331124743.3530-1-paolo.valente@linaro.org>

From: Arianna Avanzini <avanzini.arianna@gmail.com>

Many popular I/O-intensive services or applications spawn or
reactivate many parallel threads/processes during short time
intervals. Examples are systemd during boot or git grep.  These
services or applications benefit mostly from a high throughput: the
quicker the I/O generated by their processes is cumulatively served,
the sooner the target job of these services or applications gets
completed. As a consequence, it is almost always counterproductive to
weight-raise any of the queues associated to the processes of these
services or applications: in most cases it would just lower the
throughput, mainly because weight-raising also implies device idling.

To address this issue, an I/O scheduler needs, first, to detect which
queues are associated with these services or applications. In this
respect, we have that, from the I/O-scheduler standpoint, these
services or applications cause bursts of activations, i.e.,
activations of different queues occurring shortly after each
other. However, a shorter burst of activations may be caused also by
the start of an application that does not consist in a lot of parallel
I/O-bound threads (see the comments on the function bfq_handle_burst
for details).

In view of these facts, this commit introduces:
1) an heuristic to detect (only) bursts of queue activations caused by
   services or applications consisting in many parallel I/O-bound
   threads;
2) the prevention of device idling and weight-raising for the queues
   belonging to these bursts.

Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
---
 block/bfq-iosched.c | 403 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 388 insertions(+), 15 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 738fd5f..8565784 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -360,6 +360,10 @@ struct bfq_queue {
 
 	/* bit vector: a 1 for each seeky requests in history */
 	u32 seek_history;
+
+	/* node for the device's burst list */
+	struct hlist_node burst_list_node;
+
 	/* position of the last request enqueued */
 	sector_t last_request_pos;
 
@@ -443,6 +447,17 @@ struct bfq_io_cq {
 	bool saved_IO_bound;
 
 	/*
+	 * Same purpose as the previous fields for the value of the
+	 * field keeping the queue's belonging to a large burst
+	 */
+	bool saved_in_large_burst;
+	/*
+	 * True if the queue belonged to a burst list before its merge
+	 * with another cooperating queue.
+	 */
+	bool was_in_burst_list;
+
+	/*
 	 * Similar to previous fields: save wr information.
 	 */
 	unsigned long saved_wr_coeff;
@@ -609,6 +624,36 @@ struct bfq_data {
 	 */
 	bool strict_guarantees;
 
+	/*
+	 * Last time at which a queue entered the current burst of
+	 * queues being activated shortly after each other; for more
+	 * details about this and the following parameters related to
+	 * a burst of activations, see the comments on the function
+	 * bfq_handle_burst.
+	 */
+	unsigned long last_ins_in_burst;
+	/*
+	 * Reference time interval used to decide whether a queue has
+	 * been activated shortly after @last_ins_in_burst.
+	 */
+	unsigned long bfq_burst_interval;
+	/* number of queues in the current burst of queue activations */
+	int burst_size;
+
+	/* common parent entity for the queues in the burst */
+	struct bfq_entity *burst_parent_entity;
+	/* Maximum burst size above which the current queue-activation
+	 * burst is deemed as 'large'.
+	 */
+	unsigned long bfq_large_burst_thresh;
+	/* true if a large queue-activation burst is in progress */
+	bool large_burst;
+	/*
+	 * Head of the burst list (as for the above fields, more
+	 * details in the comments on the function bfq_handle_burst).
+	 */
+	struct hlist_head burst_list;
+
 	/* if set to true, low-latency heuristics are enabled */
 	bool low_latency;
 	/*
@@ -671,7 +716,8 @@ struct bfq_data {
 };
 
 enum bfqq_state_flags {
-	BFQQF_busy = 0,		/* has requests or is in service */
+	BFQQF_just_created = 0,	/* queue just allocated */
+	BFQQF_busy,		/* has requests or is in service */
 	BFQQF_wait_request,	/* waiting for a request */
 	BFQQF_non_blocking_wait_rq, /*
 				     * waiting for a request
@@ -685,6 +731,10 @@ enum bfqq_state_flags {
 				 * having consumed at most 2/10 of
 				 * its budget
 				 */
+	BFQQF_in_large_burst,	/*
+				 * bfqq activated in a large burst,
+				 * see comments to bfq_handle_burst.
+				 */
 	BFQQF_softrt_update,	/*
 				 * may need softrt-next-start
 				 * update
@@ -707,6 +757,7 @@ static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
 	return test_bit(BFQQF_##name, &(bfqq)->flags);		\
 }
 
+BFQ_BFQQ_FNS(just_created);
 BFQ_BFQQ_FNS(busy);
 BFQ_BFQQ_FNS(wait_request);
 BFQ_BFQQ_FNS(non_blocking_wait_rq);
@@ -714,6 +765,7 @@ BFQ_BFQQ_FNS(fifo_expire);
 BFQ_BFQQ_FNS(idle_window);
 BFQ_BFQQ_FNS(sync);
 BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
 BFQ_BFQQ_FNS(coop);
 BFQ_BFQQ_FNS(split_coop);
 BFQ_BFQQ_FNS(softrt_update);
@@ -4287,9 +4339,9 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
 	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
 
-	if (bfqq->wr_coeff > 1 &&
+	if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
 	    time_is_before_jiffies(bfqq->last_wr_start_finish +
-				   bfqq->wr_cur_max_time)) {
+				   bfqq->wr_cur_max_time))) {
 		bfq_log_bfqq(bfqq->bfqd, bfqq,
 		    "resume state: switching off wr");
 
@@ -4305,6 +4357,232 @@ static int bfqq_process_refs(struct bfq_queue *bfqq)
 	return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
 }
 
+/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_queue *item;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
+		hlist_del_init(&item->burst_list_node);
+	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+	bfqd->burst_size = 1;
+	bfqd->burst_parent_entity = bfqq->entity.parent;
+}
+
+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	/* Increment burst size to take into account also bfqq */
+	bfqd->burst_size++;
+
+	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
+		struct bfq_queue *pos, *bfqq_item;
+		struct hlist_node *n;
+
+		/*
+		 * Enough queues have been activated shortly after each
+		 * other to consider this burst as large.
+		 */
+		bfqd->large_burst = true;
+
+		/*
+		 * We can now mark all queues in the burst list as
+		 * belonging to a large burst.
+		 */
+		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
+				     burst_list_node)
+			bfq_mark_bfqq_in_large_burst(bfqq_item);
+		bfq_mark_bfqq_in_large_burst(bfqq);
+
+		/*
+		 * From now on, and until the current burst finishes, any
+		 * new queue being activated shortly after the last queue
+		 * was inserted in the burst can be immediately marked as
+		 * belonging to a large burst. So the burst list is not
+		 * needed any more. Remove it.
+		 */
+		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
+					  burst_list_node)
+			hlist_del_init(&pos->burst_list_node);
+	} else /*
+		* Burst not yet large: add bfqq to the burst list. Do
+		* not increment the ref counter for bfqq, because bfqq
+		* is removed from the burst list before freeing bfqq
+		* in put_queue.
+		*/
+		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * If many queues belonging to the same group happen to be created
+ * shortly after each other, then the processes associated with these
+ * queues have typically a common goal. In particular, bursts of queue
+ * creations are usually caused by services or applications that spawn
+ * many parallel threads/processes. Examples are systemd during boot,
+ * or git grep. To help these processes get their job done as soon as
+ * possible, it is usually better to not grant either weight-raising
+ * or device idling to their queues.
+ *
+ * In this comment we describe, firstly, the reasons why this fact
+ * holds, and, secondly, the next function, which implements the main
+ * steps needed to properly mark these queues so that they can then be
+ * treated in a different way.
+ *
+ * The above services or applications benefit mostly from a high
+ * throughput: the quicker the requests of the activated queues are
+ * cumulatively served, the sooner the target job of these queues gets
+ * completed. As a consequence, weight-raising any of these queues,
+ * which also implies idling the device for it, is almost always
+ * counterproductive. In most cases it just lowers throughput.
+ *
+ * On the other hand, a burst of queue creations may be caused also by
+ * the start of an application that does not consist of a lot of
+ * parallel I/O-bound threads. In fact, with a complex application,
+ * several short processes may need to be executed to start-up the
+ * application. In this respect, to start an application as quickly as
+ * possible, the best thing to do is in any case to privilege the I/O
+ * related to the application with respect to all other
+ * I/O. Therefore, the best strategy to start as quickly as possible
+ * an application that causes a burst of queue creations is to
+ * weight-raise all the queues created during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the
+ * two types of bursts need to be distinguished. Fortunately, this
+ * seems relatively easy, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that only bursts with a
+ * larger size than that threshold are apparently caused by
+ * services or commands such as systemd or git grep. For brevity,
+ * hereafter we call just 'large' these bursts. BFQ *does not*
+ * weight-raise queues whose creation occurs in a large burst. In
+ * addition, for each of these queues BFQ performs or does not perform
+ * idling depending on which choice boosts the throughput more. The
+ * exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Unfortunately, false positives may occur while an interactive task
+ * is starting (e.g., an application is being started). The
+ * consequence is that the queues associated with the task do not
+ * enjoy weight raising as expected. Fortunately these false positives
+ * are very rare. They typically occur if some service happens to
+ * start doing I/O exactly when the interactive task starts.
+ *
+ * Turning back to the next function, it implements all the steps
+ * needed to detect the occurrence of a large burst and to properly
+ * mark all the queues belonging to it (so that they can then be
+ * treated in a different way). This goal is achieved by maintaining a
+ * "burst list" that holds, temporarily, the queues that belong to the
+ * burst in progress. The list is then used to mark these queues as
+ * belonging to a large burst if the burst does become large. The main
+ * steps are the following.
+ *
+ * . when the very first queue is created, the queue is inserted into the
+ *   list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ *   not yet belong to the burst is activated shortly after the last time
+ *   at which a new queue entered the burst list, then the function appends
+ *   Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ *   the large-burst threshold, then
+ *
+ *     . all the queues in the burst list are marked as belonging to a
+ *       large burst
+ *
+ *     . the burst list is deleted; in fact, the burst list already served
+ *       its purpose (keeping temporarily track of the queues in a burst,
+ *       so as to be able to mark them as belonging to a large burst in the
+ *       previous sub-step), and now is not needed any more
+ *
+ *     . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is created while
+ *   the device is in large-burst mode and shortly after the last time
+ *   at which a queue either entered the burst list or was marked as
+ *   belonging to the current large burst, then Q is immediately marked
+ *   as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is created a while
+ *   later, i.e., not shortly after, than the last time at which a queue
+ *   either entered the burst list or was marked as belonging to the
+ *   current large burst, then the current burst is deemed as finished and:
+ *
+ *        . the large-burst mode is reset if set
+ *
+ *        . the burst list is emptied
+ *
+ *        . Q is inserted in the burst list, as Q may be the first queue
+ *          in a possible new burst (then the burst list contains just Q
+ *          after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	/*
+	 * If bfqq is already in the burst list or is part of a large
+	 * burst, or finally has just been split, then there is
+	 * nothing else to do.
+	 */
+	if (!hlist_unhashed(&bfqq->burst_list_node) ||
+	    bfq_bfqq_in_large_burst(bfqq) ||
+	    time_is_after_eq_jiffies(bfqq->split_time +
+				     msecs_to_jiffies(10)))
+		return;
+
+	/*
+	 * If bfqq's creation happens late enough, or bfqq belongs to
+	 * a different group than the burst group, then the current
+	 * burst is finished, and related data structures must be
+	 * reset.
+	 *
+	 * In this respect, consider the special case where bfqq is
+	 * the very first queue created after BFQ is selected for this
+	 * device. In this case, last_ins_in_burst and
+	 * burst_parent_entity are not yet significant when we get
+	 * here. But it is easy to verify that, whether or not the
+	 * following condition is true, bfqq will end up being
+	 * inserted into the burst list. In particular the list will
+	 * happen to contain only bfqq. And this is exactly what has
+	 * to happen, as bfqq may be the first queue of the first
+	 * burst.
+	 */
+	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+	    bfqd->bfq_burst_interval) ||
+	    bfqq->entity.parent != bfqd->burst_parent_entity) {
+		bfqd->large_burst = false;
+		bfq_reset_burst_list(bfqd, bfqq);
+		goto end;
+	}
+
+	/*
+	 * If we get here, then bfqq is being activated shortly after the
+	 * last queue. So, if the current burst is also large, we can mark
+	 * bfqq as belonging to this large burst immediately.
+	 */
+	if (bfqd->large_burst) {
+		bfq_mark_bfqq_in_large_burst(bfqq);
+		goto end;
+	}
+
+	/*
+	 * If we get here, then a large-burst state has not yet been
+	 * reached, but bfqq is being activated shortly after the last
+	 * queue. Then we add bfqq to the burst.
+	 */
+	bfq_add_to_burst(bfqd, bfqq);
+end:
+	/*
+	 * At this point, bfqq either has been added to the current
+	 * burst or has caused the current burst to terminate and a
+	 * possible new burst to start. In particular, in the second
+	 * case, bfqq has become the first queue in the possible new
+	 * burst.  In both cases last_ins_in_burst needs to be moved
+	 * forward.
+	 */
+	bfqd->last_ins_in_burst = jiffies;
+}
+
 static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
 {
 	struct bfq_entity *entity = &bfqq->entity;
@@ -4518,6 +4796,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
 					     unsigned int old_wr_coeff,
 					     bool wr_or_deserves_wr,
 					     bool interactive,
+					     bool in_burst,
 					     bool soft_rt)
 {
 	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
@@ -4549,7 +4828,9 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
 		if (interactive) { /* update wr coeff and duration */
 			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
 			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
-		} else if (soft_rt) {
+		} else if (in_burst)
+			bfqq->wr_coeff = 1;
+		else if (soft_rt) {
 			/*
 			 * The application is now or still meeting the
 			 * requirements for being deemed soft rt.  We
@@ -4609,7 +4890,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 					     struct request *rq,
 					     bool *interactive)
 {
-	bool soft_rt, wr_or_deserves_wr, bfqq_wants_to_preempt,
+	bool soft_rt, in_burst,	wr_or_deserves_wr,
+		bfqq_wants_to_preempt,
 		idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
 		/*
 		 * See the comments on
@@ -4625,12 +4907,15 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 	/*
 	 * bfqq deserves to be weight-raised if:
 	 * - it is sync,
+	 * - it does not belong to a large burst,
 	 * - it has been idle for enough time or is soft real-time,
 	 * - is linked to a bfq_io_cq (it is not shared in any sense).
 	 */
+	in_burst = bfq_bfqq_in_large_burst(bfqq);
 	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+		!in_burst &&
 		time_is_before_jiffies(bfqq->soft_rt_next_start);
-	*interactive = idle_for_long_time;
+	*interactive = !in_burst && idle_for_long_time;
 	wr_or_deserves_wr = bfqd->low_latency &&
 		(bfqq->wr_coeff > 1 ||
 		 (bfq_bfqq_sync(bfqq) &&
@@ -4645,6 +4930,31 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 						    arrived_in_time,
 						    wr_or_deserves_wr);
 
+	/*
+	 * If bfqq happened to be activated in a burst, but has been
+	 * idle for much more than an interactive queue, then we
+	 * assume that, in the overall I/O initiated in the burst, the
+	 * I/O associated with bfqq is finished. So bfqq does not need
+	 * to be treated as a queue belonging to a burst
+	 * anymore. Accordingly, we reset bfqq's in_large_burst flag
+	 * if set, and remove bfqq from the burst list if it's
+	 * there. We do not decrement burst_size, because the fact
+	 * that bfqq does not need to belong to the burst list any
+	 * more does not invalidate the fact that bfqq was created in
+	 * a burst.
+	 */
+	if (likely(!bfq_bfqq_just_created(bfqq)) &&
+	    idle_for_long_time &&
+	    time_is_before_jiffies(
+		    bfqq->budget_timeout +
+		    msecs_to_jiffies(10000))) {
+		hlist_del_init(&bfqq->burst_list_node);
+		bfq_clear_bfqq_in_large_burst(bfqq);
+	}
+
+	bfq_clear_bfqq_just_created(bfqq);
+
+
 	if (!bfq_bfqq_IO_bound(bfqq)) {
 		if (arrived_in_time) {
 			bfqq->requests_within_timer++;
@@ -4667,6 +4977,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 							 old_wr_coeff,
 							 wr_or_deserves_wr,
 							 *interactive,
+							 in_burst,
 							 soft_rt);
 
 			if (old_wr_coeff != bfqq->wr_coeff)
@@ -5294,6 +5605,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
 	bic->saved_ttime = bfqq->ttime;
 	bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
 	bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+	bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+	bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
 	bic->saved_wr_coeff = bfqq->wr_coeff;
 	bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
 	bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
@@ -5329,7 +5642,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
 	 * where bfqq has just been created, but has not yet made it
 	 * to be weight-raised (which may happen because EQM may merge
 	 * bfqq even before bfq_add_request is executed for the first
-	 * time for bfqq).
+	 * time for bfqq). Handling this case would however be very
+	 * easy, thanks to the flag just_created.
 	 */
 	if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
 		new_bfqq->wr_coeff = bfqq->wr_coeff;
@@ -6414,6 +6728,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 {
 	struct bfq_data *bfqd = bfqq->bfqd;
 	bool idling_boosts_thr, idling_boosts_thr_without_issues,
+		idling_needed_for_service_guarantees,
 		asymmetric_scenario;
 
 	if (bfqd->strict_guarantees)
@@ -6594,6 +6909,23 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 		!bfq_symmetric_scenario(bfqd);
 
 	/*
+	 * Finally, there is a case where maximizing throughput is the
+	 * best choice even if it may cause unfairness toward
+	 * bfqq. Such a case is when bfqq became active in a burst of
+	 * queue activations. Queues that became active during a large
+	 * burst benefit only from throughput, as discussed in the
+	 * comments on bfq_handle_burst. Thus, if bfqq became active
+	 * in a burst and not idling the device maximizes throughput,
+	 * then the device must no be idled, because not idling the
+	 * device provides bfqq and all other queues in the burst with
+	 * maximum benefit. Combining this and the above case, we can
+	 * now establish when idling is actually needed to preserve
+	 * service guarantees.
+	 */
+	idling_needed_for_service_guarantees =
+		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
+
+	/*
 	 * We have now all the components we need to compute the return
 	 * value of the function, which is true only if both the following
 	 * conditions hold:
@@ -6602,7 +6934,8 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 *    is necessary to preserve service guarantees.
 	 */
 	return bfq_bfqq_sync(bfqq) &&
-		(idling_boosts_thr_without_issues || asymmetric_scenario);
+		(idling_boosts_thr_without_issues ||
+		 idling_needed_for_service_guarantees);
 }
 
 /*
@@ -6741,14 +7074,17 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
 
 		/*
-		 * If too much time has elapsed from the beginning of
-		 * this weight-raising period, then end weight raising.
+		 * If the queue was activated in a burst, or too much
+		 * time has elapsed from the beginning of this
+		 * weight-raising period, then end weight raising.
 		 */
-		if (time_is_before_jiffies(bfqq->last_wr_start_finish +
-					   bfqq->wr_cur_max_time)) {
+		if (bfq_bfqq_in_large_burst(bfqq))
+			bfq_bfqq_end_wr(bfqq);
+		else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
+						bfqq->wr_cur_max_time)) {
 			if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
 			time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
-						   bfq_wr_duration(bfqd)))
+					       bfq_wr_duration(bfqd)))
 				bfq_bfqq_end_wr(bfqq);
 			else {
 				/* switch back to interactive wr */
@@ -6946,7 +7282,16 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
 	if (bfqq->ref)
 		return;
 
-	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
+	if (bfq_bfqq_sync(bfqq))
+		/*
+		 * The fact that this queue is being destroyed does not
+		 * invalidate the fact that this queue may have been
+		 * activated during the current burst. As a consequence,
+		 * although the queue does not exist anymore, and hence
+		 * needs to be removed from the burst list if there,
+		 * the burst size has not to be decremented.
+		 */
+		hlist_del_init(&bfqq->burst_list_node);
 
 	kmem_cache_free(bfq_pool, bfqq);
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -7108,6 +7453,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 {
 	RB_CLEAR_NODE(&bfqq->entity.rb_node);
 	INIT_LIST_HEAD(&bfqq->fifo);
+	INIT_HLIST_NODE(&bfqq->burst_list_node);
 
 	bfqq->ref = 0;
 	bfqq->bfqd = bfqd;
@@ -7119,6 +7465,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		if (!bfq_class_idle(bfqq))
 			bfq_mark_bfqq_idle_window(bfqq);
 		bfq_mark_bfqq_sync(bfqq);
+		bfq_mark_bfqq_just_created(bfqq);
 	} else
 		bfq_clear_bfqq_sync(bfqq);
 
@@ -7664,8 +8011,18 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
 	bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
 
 	bic_set_bfqq(bic, bfqq, is_sync);
-	if (split && is_sync)
+	if (split && is_sync) {
+		if ((bic->was_in_burst_list && bfqd->large_burst) ||
+		    bic->saved_in_large_burst)
+			bfq_mark_bfqq_in_large_burst(bfqq);
+		else {
+			bfq_clear_bfqq_in_large_burst(bfqq);
+			if (bic->was_in_burst_list)
+				hlist_add_head(&bfqq->burst_list_node,
+					       &bfqd->burst_list);
+		}
 		bfqq->split_time = jiffies;
+	}
 
 	return bfqq;
 }
@@ -7698,6 +8055,11 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
 		/* If the queue was seeky for too long, break it apart. */
 		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
 			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+
+			/* Update bic before losing reference to bfqq */
+			if (bfq_bfqq_in_large_burst(bfqq))
+				bic->saved_in_large_burst = true;
+
 			bfqq = bfq_split_bfqq(bic, bfqq);
 			/*
 			 * A reference to bic->icq.ioc needs to be
@@ -7741,6 +8103,9 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
 		}
 	}
 
+	if (unlikely(bfq_bfqq_just_created(bfqq)))
+		bfq_handle_burst(bfqd, bfqq);
+
 	bfq_unlock_put_ioc(bfqd);
 
 	return 0;
@@ -7920,6 +8285,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
 	bfqd->oom_bfqq.entity.new_weight =
 		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+
+	/* oom_bfqq does not participate to bursts */
+	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
+
 	/*
 	 * Trigger weight initialization, according to ioprio, at the
 	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
@@ -7940,6 +8309,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 
 	INIT_LIST_HEAD(&bfqd->active_list);
 	INIT_LIST_HEAD(&bfqd->idle_list);
+	INIT_HLIST_HEAD(&bfqd->burst_list);
 
 	bfqd->hw_tag = -1;
 
@@ -7954,6 +8324,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 
 	bfqd->bfq_requests_within_timer = 120;
 
+	bfqd->bfq_large_burst_thresh = 8;
+	bfqd->bfq_burst_interval = msecs_to_jiffies(180);
+
 	bfqd->low_latency = true;
 
 	/*
-- 
2.10.0

^ permalink raw reply related

* [PATCH V2 13/16] block, bfq: boost the throughput with random I/O on NCQ-capable HDDs
From: Paolo Valente @ 2017-03-31 12:47 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo
  Cc: Fabio Checconi, Arianna Avanzini, linux-block, linux-kernel,
	ulf.hansson, linus.walleij, broonie, Paolo Valente
In-Reply-To: <20170331124743.3530-1-paolo.valente@linaro.org>

This patch is basically the counterpart, for NCQ-capable rotational
devices, of the previous patch. Exactly as the previous patch does on
flash-based devices and for any workload, this patch disables device
idling on rotational devices, but only for random I/O. In fact, only
with these queues disabling idling boosts the throughput on
NCQ-capable rotational devices. To not break service guarantees,
idling is disabled for NCQ-enabled rotational devices only when the
same symmetry conditions considered in the previous patches hold.

Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
---
 block/bfq-iosched.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index ca1ae24..738fd5f 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6423,20 +6423,15 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 * The next variable takes into account the cases where idling
 	 * boosts the throughput.
 	 *
-	 * The value of the variable is computed considering that
-	 * idling is usually beneficial for the throughput if:
+	 * The value of the variable is computed considering, first, that
+	 * idling is virtually always beneficial for the throughput if:
 	 * (a) the device is not NCQ-capable, or
 	 * (b) regardless of the presence of NCQ, the device is rotational
-	 *     and the request pattern for bfqq is I/O-bound (possible
-	 *     throughput losses caused by granting idling to seeky queues
-	 *     are mitigated by the fact that, in all scenarios where
-	 *     boosting throughput is the best thing to do, i.e., in all
-	 *     symmetric scenarios, only a minimal idle time is allowed to
-	 *     seeky queues).
+	 *     and the request pattern for bfqq is I/O-bound and sequential.
 	 *
 	 * Secondly, and in contrast to the above item (b), idling an
 	 * NCQ-capable flash-based device would not boost the
-	 * throughput even with intense I/O; rather it would lower
+	 * throughput even with sequential I/O; rather it would lower
 	 * the throughput in proportion to how fast the device
 	 * is. Accordingly, the next variable is true if any of the
 	 * above conditions (a) and (b) is true, and, in particular,
@@ -6444,7 +6439,8 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 * device.
 	 */
 	idling_boosts_thr = !bfqd->hw_tag ||
-		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq));
+		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
+		 bfq_bfqq_idle_window(bfqq));
 
 	/*
 	 * The value of the next variable,
-- 
2.10.0

^ permalink raw reply related

* [PATCH V2 12/16] block, bfq: boost the throughput on NCQ-capable flash-based devices
From: Paolo Valente @ 2017-03-31 12:47 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo
  Cc: Fabio Checconi, Arianna Avanzini, linux-block, linux-kernel,
	ulf.hansson, linus.walleij, broonie, Paolo Valente
In-Reply-To: <20170331124743.3530-1-paolo.valente@linaro.org>

This patch boosts the throughput on NCQ-capable flash-based devices,
while still preserving latency guarantees for interactive and soft
real-time applications. The throughput is boosted by just not idling
the device when the in-service queue remains empty, even if the queue
is sync and has a non-null idle window. This helps to keep the drive's
internal queue full, which is necessary to achieve maximum
performance. This solution to boost the throughput is a port of
commits a68bbdd and f7d7b7a for CFQ.

As already highlighted in a previous patch, allowing the device to
prefetch and internally reorder requests trivially causes loss of
control on the request service order, and hence on service guarantees.
Fortunately, as discussed in detail in the comments on the function
bfq_bfqq_may_idle(), if every process has to receive the same
fraction of the throughput, then the service order enforced by the
internal scheduler of a flash-based device is relatively close to that
enforced by BFQ. In particular, it is close enough to let service
guarantees be substantially preserved.

Things change in an asymmetric scenario, i.e., if not every process
has to receive the same fraction of the throughput. In this case, to
guarantee the desired throughput distribution, the device must be
prevented from prefetching requests. This is exactly what this patch
does in asymmetric scenarios.

Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
---
 block/bfq-iosched.c | 154 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 106 insertions(+), 48 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index ec87cff..ca1ae24 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6426,15 +6426,25 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 * The value of the variable is computed considering that
 	 * idling is usually beneficial for the throughput if:
 	 * (a) the device is not NCQ-capable, or
-	 * (b) regardless of the presence of NCQ, the request pattern
-	 *     for bfqq is I/O-bound (possible throughput losses
-	 *     caused by granting idling to seeky queues are mitigated
-	 *     by the fact that, in all scenarios where boosting
-	 *     throughput is the best thing to do, i.e., in all
-	 *     symmetric scenarios, only a minimal idle time is
-	 *     allowed to seeky queues).
+	 * (b) regardless of the presence of NCQ, the device is rotational
+	 *     and the request pattern for bfqq is I/O-bound (possible
+	 *     throughput losses caused by granting idling to seeky queues
+	 *     are mitigated by the fact that, in all scenarios where
+	 *     boosting throughput is the best thing to do, i.e., in all
+	 *     symmetric scenarios, only a minimal idle time is allowed to
+	 *     seeky queues).
+	 *
+	 * Secondly, and in contrast to the above item (b), idling an
+	 * NCQ-capable flash-based device would not boost the
+	 * throughput even with intense I/O; rather it would lower
+	 * the throughput in proportion to how fast the device
+	 * is. Accordingly, the next variable is true if any of the
+	 * above conditions (a) and (b) is true, and, in particular,
+	 * happens to be false if bfqd is an NCQ-capable flash-based
+	 * device.
 	 */
-	idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);
+	idling_boosts_thr = !bfqd->hw_tag ||
+		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq));
 
 	/*
 	 * The value of the next variable,
@@ -6475,14 +6485,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 		bfqd->wr_busy_queues == 0;
 
 	/*
-	 * There is then a case where idling must be performed not for
-	 * throughput concerns, but to preserve service guarantees. To
-	 * introduce it, we can note that allowing the drive to
-	 * enqueue more than one request at a time, and hence
+	 * There is then a case where idling must be performed not
+	 * for throughput concerns, but to preserve service
+	 * guarantees.
+	 *
+	 * To introduce this case, we can note that allowing the drive
+	 * to enqueue more than one request at a time, and hence
 	 * delegating de facto final scheduling decisions to the
-	 * drive's internal scheduler, causes loss of control on the
+	 * drive's internal scheduler, entails loss of control on the
 	 * actual request service order. In particular, the critical
-	 * situation is when requests from different processes happens
+	 * situation is when requests from different processes happen
 	 * to be present, at the same time, in the internal queue(s)
 	 * of the drive. In such a situation, the drive, by deciding
 	 * the service order of the internally-queued requests, does
@@ -6493,51 +6505,97 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 * the service distribution enforced by the drive's internal
 	 * scheduler is likely to coincide with the desired
 	 * device-throughput distribution only in a completely
-	 * symmetric scenario where: (i) each of these processes must
-	 * get the same throughput as the others; (ii) all these
-	 * processes have the same I/O pattern (either sequential or
-	 * random).  In fact, in such a scenario, the drive will tend
-	 * to treat the requests of each of these processes in about
-	 * the same way as the requests of the others, and thus to
-	 * provide each of these processes with about the same
-	 * throughput (which is exactly the desired throughput
-	 * distribution). In contrast, in any asymmetric scenario,
-	 * device idling is certainly needed to guarantee that bfqq
-	 * receives its assigned fraction of the device throughput
-	 * (see [1] for details).
+	 * symmetric scenario where:
+	 * (i)  each of these processes must get the same throughput as
+	 *      the others;
+	 * (ii) all these processes have the same I/O pattern
+		(either sequential or random).
+	 * In fact, in such a scenario, the drive will tend to treat
+	 * the requests of each of these processes in about the same
+	 * way as the requests of the others, and thus to provide
+	 * each of these processes with about the same throughput
+	 * (which is exactly the desired throughput distribution). In
+	 * contrast, in any asymmetric scenario, device idling is
+	 * certainly needed to guarantee that bfqq receives its
+	 * assigned fraction of the device throughput (see [1] for
+	 * details).
+	 *
+	 * We address this issue by controlling, actually, only the
+	 * symmetry sub-condition (i), i.e., provided that
+	 * sub-condition (i) holds, idling is not performed,
+	 * regardless of whether sub-condition (ii) holds. In other
+	 * words, only if sub-condition (i) holds, then idling is
+	 * allowed, and the device tends to be prevented from queueing
+	 * many requests, possibly of several processes. The reason
+	 * for not controlling also sub-condition (ii) is that we
+	 * exploit preemption to preserve guarantees in case of
+	 * symmetric scenarios, even if (ii) does not hold, as
+	 * explained in the next two paragraphs.
+	 *
+	 * Even if a queue, say Q, is expired when it remains idle, Q
+	 * can still preempt the new in-service queue if the next
+	 * request of Q arrives soon (see the comments on
+	 * bfq_bfqq_update_budg_for_activation). If all queues and
+	 * groups have the same weight, this form of preemption,
+	 * combined with the hole-recovery heuristic described in the
+	 * comments on function bfq_bfqq_update_budg_for_activation,
+	 * are enough to preserve a correct bandwidth distribution in
+	 * the mid term, even without idling. In fact, even if not
+	 * idling allows the internal queues of the device to contain
+	 * many requests, and thus to reorder requests, we can rather
+	 * safely assume that the internal scheduler still preserves a
+	 * minimum of mid-term fairness. The motivation for using
+	 * preemption instead of idling is that, by not idling,
+	 * service guarantees are preserved without minimally
+	 * sacrificing throughput. In other words, both a high
+	 * throughput and its desired distribution are obtained.
+	 *
+	 * More precisely, this preemption-based, idleless approach
+	 * provides fairness in terms of IOPS, and not sectors per
+	 * second. This can be seen with a simple example. Suppose
+	 * that there are two queues with the same weight, but that
+	 * the first queue receives requests of 8 sectors, while the
+	 * second queue receives requests of 1024 sectors. In
+	 * addition, suppose that each of the two queues contains at
+	 * most one request at a time, which implies that each queue
+	 * always remains idle after it is served. Finally, after
+	 * remaining idle, each queue receives very quickly a new
+	 * request. It follows that the two queues are served
+	 * alternatively, preempting each other if needed. This
+	 * implies that, although both queues have the same weight,
+	 * the queue with large requests receives a service that is
+	 * 1024/8 times as high as the service received by the other
+	 * queue.
 	 *
-	 * As for sub-condition (i), actually we check only whether
-	 * bfqq is being weight-raised. In fact, if bfqq is not being
-	 * weight-raised, we have that:
-	 * - if the process associated with bfqq is not I/O-bound, then
-	 *   it is not either latency- or throughput-critical; therefore
-	 *   idling is not needed for bfqq;
-	 * - if the process asociated with bfqq is I/O-bound, then
-	 *   idling is already granted with bfqq (see the comments on
-	 *   idling_boosts_thr).
+	 * On the other hand, device idling is performed, and thus
+	 * pure sector-domain guarantees are provided, for the
+	 * following queues, which are likely to need stronger
+	 * throughput guarantees: weight-raised queues, and queues
+	 * with a higher weight than other queues. When such queues
+	 * are active, sub-condition (i) is false, which triggers
+	 * device idling.
 	 *
-	 * We do not check sub-condition (ii) at all, i.e., the next
-	 * variable is true if and only if bfqq is being
-	 * weight-raised. We do not need to control sub-condition (ii)
-	 * for the following reason:
-	 * - if bfqq is being weight-raised, then idling is already
-	 *   guaranteed to bfqq by sub-condition (i);
-	 * - if bfqq is not being weight-raised, then idling is
-	 *   already guaranteed to bfqq (only) if it matters, i.e., if
-	 *   bfqq is associated to a currently I/O-bound process (see
-	 *   the above comment on sub-condition (i)).
+	 * According to the above considerations, the next variable is
+	 * true (only) if sub-condition (i) holds. To compute the
+	 * value of this variable, we not only use the return value of
+	 * the function bfq_symmetric_scenario(), but also check
+	 * whether bfqq is being weight-raised, because
+	 * bfq_symmetric_scenario() does not take into account also
+	 * weight-raised queues (see comments on
+	 * bfq_weights_tree_add()).
 	 *
 	 * As a side note, it is worth considering that the above
 	 * device-idling countermeasures may however fail in the
 	 * following unlucky scenario: if idling is (correctly)
-	 * disabled in a time period during which the symmetry
-	 * sub-condition holds, and hence the device is allowed to
+	 * disabled in a time period during which all symmetry
+	 * sub-conditions hold, and hence the device is allowed to
 	 * enqueue many requests, but at some later point in time some
 	 * sub-condition stops to hold, then it may become impossible
 	 * to let requests be served in the desired order until all
 	 * the requests already queued in the device have been served.
 	 */
-	asymmetric_scenario = bfqq->wr_coeff > 1;
+	asymmetric_scenario = bfqq->wr_coeff > 1 ||
+		!bfq_symmetric_scenario(bfqd);
 
 	/*
 	 * We have now all the components we need to compute the return
-- 
2.10.0

^ permalink raw reply related

* [PATCH V2 11/16] block, bfq: reduce idling only in symmetric scenarios
From: Paolo Valente @ 2017-03-31 12:47 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo
  Cc: Fabio Checconi, Arianna Avanzini, linux-block, linux-kernel,
	ulf.hansson, linus.walleij, broonie, Riccardo Pizzetti,
	Samuele Zecchini, Paolo Valente
In-Reply-To: <20170331124743.3530-1-paolo.valente@linaro.org>

From: Arianna Avanzini <avanzini.arianna@gmail.com>

A seeky queue (i..e, a queue containing random requests) is assigned a
very small device-idling slice, for throughput issues. Unfortunately,
given the process associated with a seeky queue, this behavior causes
the following problem: if the process, say P, performs sync I/O and
has a higher weight than some other processes doing I/O and associated
with non-seeky queues, then BFQ may fail to guarantee to P its
reserved share of the throughput. The reason is that idling is key
for providing service guarantees to processes doing sync I/O [1].

This commit addresses this issue by allowing the device-idling slice
to be reduced for a seeky queue only if the scenario happens to be
symmetric, i.e., if all the queues are to receive the same share of
the throughput.

[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
    Scheduler", Proceedings of the First Workshop on Mobile System
    Technologies (MST-2015), May 2015.
    http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf

Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Riccardo Pizzetti <riccardo.pizzetti@gmail.com>
Signed-off-by: Samuele Zecchini <samuele.zecchini92@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
---
 block/bfq-iosched.c | 271 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 264 insertions(+), 7 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index df74c0d..ec87cff 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -183,6 +183,20 @@ struct bfq_sched_data {
 };
 
 /**
+ * struct bfq_weight_counter - counter of the number of all active entities
+ *                             with a given weight.
+ */
+struct bfq_weight_counter {
+	unsigned int weight; /* weight of the entities this counter refers to */
+	unsigned int num_active; /* nr of active entities with this weight */
+	/*
+	 * Weights tree member (see bfq_data's @queue_weights_tree and
+	 * @group_weights_tree)
+	 */
+	struct rb_node weights_node;
+};
+
+/**
  * struct bfq_entity - schedulable entity.
  *
  * A bfq_entity is used to represent either a bfq_queue (leaf node in the
@@ -212,6 +226,8 @@ struct bfq_sched_data {
 struct bfq_entity {
 	/* service_tree member */
 	struct rb_node rb_node;
+	/* pointer to the weight counter associated with this entity */
+	struct bfq_weight_counter *weight_counter;
 
 	/*
 	 * Flag, true if the entity is on a tree (either the active or
@@ -456,6 +472,25 @@ struct bfq_data {
 	struct bfq_group *root_group;
 
 	/*
+	 * rbtree of weight counters of @bfq_queues, sorted by
+	 * weight. Used to keep track of whether all @bfq_queues have
+	 * the same weight. The tree contains one counter for each
+	 * distinct weight associated to some active and not
+	 * weight-raised @bfq_queue (see the comments to the functions
+	 * bfq_weights_tree_[add|remove] for further details).
+	 */
+	struct rb_root queue_weights_tree;
+	/*
+	 * rbtree of non-queue @bfq_entity weight counters, sorted by
+	 * weight. Used to keep track of whether all @bfq_groups have
+	 * the same weight. The tree contains one counter for each
+	 * distinct weight associated to some active @bfq_group (see
+	 * the comments to the functions bfq_weights_tree_[add|remove]
+	 * for further details).
+	 */
+	struct rb_root group_weights_tree;
+
+	/*
 	 * Number of bfq_queues containing requests (including the
 	 * queue in service, even if it is idling).
 	 */
@@ -791,6 +826,11 @@ struct bfq_group_data {
  *             to avoid too many special cases during group creation/
  *             migration.
  * @stats: stats for this bfqg.
+ * @active_entities: number of active entities belonging to the group;
+ *                   unused for the root group. Used to know whether there
+ *                   are groups with more than one active @bfq_entity
+ *                   (see the comments to the function
+ *                   bfq_bfqq_may_idle()).
  * @rq_pos_tree: rbtree sorted by next_request position, used when
  *               determining if two or more queues have interleaving
  *               requests (see bfq_find_close_cooperator()).
@@ -818,6 +858,8 @@ struct bfq_group {
 
 	struct bfq_entity *my_entity;
 
+	int active_entities;
+
 	struct rb_root rq_pos_tree;
 
 	struct bfqg_stats stats;
@@ -1254,12 +1296,27 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
  * a candidate for next service (i.e, a candidate entity to serve
  * after the in-service entity is expired). The function then returns
  * true.
+ *
+ * In contrast, the entity could stil be a candidate for next service
+ * if it is not a queue, and has more than one child. In fact, even if
+ * one of its children is about to be set in service, other children
+ * may still be the next to serve. As a consequence, a non-queue
+ * entity is not a candidate for next-service only if it has only one
+ * child. And only if this condition holds, then the function returns
+ * true for a non-queue entity.
  */
 static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
 {
+	struct bfq_group *bfqg;
+
 	if (bfq_entity_to_bfqq(entity))
 		return true;
 
+	bfqg = container_of(entity, struct bfq_group, entity);
+
+	if (bfqg->active_entities == 1)
+		return true;
+
 	return false;
 }
 
@@ -1498,6 +1555,15 @@ static void bfq_update_active_tree(struct rb_node *node)
 	goto up;
 }
 
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
+				 struct bfq_entity *entity,
+				 struct rb_root *root);
+
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
+				    struct bfq_entity *entity,
+				    struct rb_root *root);
+
+
 /**
  * bfq_active_insert - insert an entity in the active tree of its
  *                     group/device.
@@ -1536,6 +1602,13 @@ static void bfq_active_insert(struct bfq_service_tree *st,
 #endif
 	if (bfqq)
 		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else /* bfq_group */
+		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
+
+	if (bfqg != bfqd->root_group)
+		bfqg->active_entities++;
+#endif
 }
 
 /**
@@ -1631,6 +1704,14 @@ static void bfq_active_extract(struct bfq_service_tree *st,
 #endif
 	if (bfqq)
 		list_del(&bfqq->bfqq_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else /* bfq_group */
+		bfq_weights_tree_remove(bfqd, entity,
+					&bfqd->group_weights_tree);
+
+	if (bfqg != bfqd->root_group)
+		bfqg->active_entities--;
+#endif
 }
 
 /**
@@ -1731,6 +1812,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 		unsigned int prev_weight, new_weight;
 		struct bfq_data *bfqd = NULL;
+		struct rb_root *root;
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 		struct bfq_sched_data *sd;
 		struct bfq_group *bfqg;
@@ -1780,7 +1862,26 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 		prev_weight = entity->weight;
 		new_weight = entity->orig_weight *
 			     (bfqq ? bfqq->wr_coeff : 1);
+		/*
+		 * If the weight of the entity changes, remove the entity
+		 * from its old weight counter (if there is a counter
+		 * associated with the entity), and add it to the counter
+		 * associated with its new weight.
+		 */
+		if (prev_weight != new_weight) {
+			root = bfqq ? &bfqd->queue_weights_tree :
+				      &bfqd->group_weights_tree;
+			bfq_weights_tree_remove(bfqd, entity, root);
+		}
 		entity->weight = new_weight;
+		/*
+		 * Add the entity to its weights tree only if it is
+		 * not associated with a weight-raised queue.
+		 */
+		if (prev_weight != new_weight &&
+		    (bfqq ? bfqq->wr_coeff == 1 : 1))
+			/* If we get here, root has been initialized. */
+			bfq_weights_tree_add(bfqd, entity, root);
 
 		new_st->wsum += entity->weight;
 
@@ -2606,6 +2707,10 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	bfqd->busy_queues--;
 
+	if (!bfqq->dispatched)
+		bfq_weights_tree_remove(bfqd, &bfqq->entity,
+					&bfqd->queue_weights_tree);
+
 	if (bfqq->wr_coeff > 1)
 		bfqd->wr_busy_queues--;
 
@@ -2626,6 +2731,11 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	bfq_mark_bfqq_busy(bfqq);
 	bfqd->busy_queues++;
 
+	if (!bfqq->dispatched)
+		if (bfqq->wr_coeff == 1)
+			bfq_weights_tree_add(bfqd, &bfqq->entity,
+					     &bfqd->queue_weights_tree);
+
 	if (bfqq->wr_coeff > 1)
 		bfqd->wr_busy_queues++;
 }
@@ -3028,6 +3138,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
 				   * in bfq_init_queue()
 				   */
 	bfqg->bfqd = bfqd;
+	bfqg->active_entities = 0;
 	bfqg->rq_pos_tree = RB_ROOT;
 }
 
@@ -3916,6 +4027,142 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 }
 
 /*
+ * Tell whether there are active queues or groups with differentiated weights.
+ */
+static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+{
+	/*
+	 * For weights to differ, at least one of the trees must contain
+	 * at least two nodes.
+	 */
+	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+		(bfqd->queue_weights_tree.rb_node->rb_left ||
+		 bfqd->queue_weights_tree.rb_node->rb_right)
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	       ) ||
+	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
+		(bfqd->group_weights_tree.rb_node->rb_left ||
+		 bfqd->group_weights_tree.rb_node->rb_right)
+#endif
+	       );
+}
+
+/*
+ * The following function returns true if every queue must receive the
+ * same share of the throughput (this condition is used when deciding
+ * whether idling may be disabled, see the comments in the function
+ * bfq_bfqq_may_idle()).
+ *
+ * Such a scenario occurs when:
+ * 1) all active queues have the same weight,
+ * 2) all active groups at the same level in the groups tree have the same
+ *    weight,
+ * 3) all active groups at the same level in the groups tree have the same
+ *    number of children.
+ *
+ * Unfortunately, keeping the necessary state for evaluating exactly the
+ * above symmetry conditions would be quite complex and time-consuming.
+ * Therefore this function evaluates, instead, the following stronger
+ * sub-conditions, for which it is much easier to maintain the needed
+ * state:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * In particular, the last two conditions are always true if hierarchical
+ * support and the cgroups interface are not enabled, thus no state needs
+ * to be maintained in this case.
+ */
+static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+{
+	return !bfq_differentiated_weights(bfqd);
+}
+
+/*
+ * If the weight-counter tree passed as input contains no counter for
+ * the weight of the input entity, then add that counter; otherwise just
+ * increment the existing counter.
+ *
+ * Note that weight-counter trees contain few nodes in mostly symmetric
+ * scenarios. For example, if all queues have the same weight, then the
+ * weight-counter tree for the queues may contain at most one node.
+ * This holds even if low_latency is on, because weight-raised queues
+ * are not inserted in the tree.
+ * In most scenarios, the rate at which nodes are created/destroyed
+ * should be low too.
+ */
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
+				 struct bfq_entity *entity,
+				 struct rb_root *root)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/*
+	 * Do not insert if the entity is already associated with a
+	 * counter, which happens if:
+	 *   1) the entity is associated with a queue,
+	 *   2) a request arrival has caused the queue to become both
+	 *      non-weight-raised, and hence change its weight, and
+	 *      backlogged; in this respect, each of the two events
+	 *      causes an invocation of this function,
+	 *   3) this is the invocation of this function caused by the
+	 *      second event. This second invocation is actually useless,
+	 *      and we handle this fact by exiting immediately. More
+	 *      efficient or clearer solutions might possibly be adopted.
+	 */
+	if (entity->weight_counter)
+		return;
+
+	while (*new) {
+		struct bfq_weight_counter *__counter = container_of(*new,
+						struct bfq_weight_counter,
+						weights_node);
+		parent = *new;
+
+		if (entity->weight == __counter->weight) {
+			entity->weight_counter = __counter;
+			goto inc_counter;
+		}
+		if (entity->weight < __counter->weight)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+					 GFP_ATOMIC);
+	entity->weight_counter->weight = entity->weight;
+	rb_link_node(&entity->weight_counter->weights_node, parent, new);
+	rb_insert_color(&entity->weight_counter->weights_node, root);
+
+inc_counter:
+	entity->weight_counter->num_active++;
+}
+
+/*
+ * Decrement the weight counter associated with the entity, and, if the
+ * counter reaches 0, remove the counter from the tree.
+ * See the comments to the function bfq_weights_tree_add() for considerations
+ * about overhead.
+ */
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
+				    struct bfq_entity *entity,
+				    struct rb_root *root)
+{
+	if (!entity->weight_counter)
+		return;
+
+	entity->weight_counter->num_active--;
+	if (entity->weight_counter->num_active > 0)
+		goto reset_entity_pointer;
+
+	rb_erase(&entity->weight_counter->weights_node, root);
+	kfree(entity->weight_counter);
+
+reset_entity_pointer:
+	entity->weight_counter = NULL;
+}
+
+/*
  * Return expired entry, or NULL to just start from scratch in rbtree.
  */
 static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
@@ -5293,13 +5540,17 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 	 */
 	sl = bfqd->bfq_slice_idle;
 	/*
-	 * Unless the queue is being weight-raised, grant only minimum
-	 * idle time if the queue is seeky. A long idling is preserved
-	 * for a weight-raised queue, because it is needed for
-	 * guaranteeing to the queue its reserved share of the
-	 * throughput.
-	 */
-	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1)
+	 * Unless the queue is being weight-raised or the scenario is
+	 * asymmetric, grant only minimum idle time if the queue
+	 * is seeky. A long idling is preserved for a weight-raised
+	 * queue, or, more in general, in an asymmetric scenario,
+	 * because a long idling is needed for guaranteeing to a queue
+	 * its reserved share of the throughput (in particular, it is
+	 * needed if the queue has a higher weight than some other
+	 * queue).
+	 */
+	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
+	    bfq_symmetric_scenario(bfqd))
 		sl = min_t(u64, sl, BFQ_MIN_TT);
 
 	bfqd->last_idling_start = ktime_get();
@@ -7197,6 +7448,9 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 		 * mechanism).
 		 */
 		bfqq->budget_timeout = jiffies;
+
+		bfq_weights_tree_remove(bfqd, &bfqq->entity,
+					&bfqd->queue_weights_tree);
 	}
 
 	now_ns = ktime_get_ns();
@@ -7627,6 +7881,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 		     HRTIMER_MODE_REL);
 	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
 
+	bfqd->queue_weights_tree = RB_ROOT;
+	bfqd->group_weights_tree = RB_ROOT;
+
 	INIT_LIST_HEAD(&bfqd->active_list);
 	INIT_LIST_HEAD(&bfqd->idle_list);
 
-- 
2.10.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox