linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: Vivek Goyal <vgoyal@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>, lkml <linux-kernel@vger.kernel.org>,
	Li Zefan <lizefan@huawei.com>,
	containers@lists.linux-foundation.org,
	Cgroups <cgroups@vger.kernel.org>
Subject: Re: [PATCHSET] blk-throttle: implement proper hierarchy support
Date: Fri, 3 May 2013 16:54:55 -0700	[thread overview]
Message-ID: <20130503235455.GE22860@mtj.dyndns.org> (raw)
In-Reply-To: <20130503210513.GE6062@redhat.com>

Hey,

On Fri, May 03, 2013 at 05:05:13PM -0400, Vivek Goyal wrote:
> Following inline patch implements transferring child's start time to 
> parent, if parent slice had expired at the time of bio migration.
> 
> It does seem to help a lot on my machine. Can you please give it a try.

Cool, will give it a try.  Can you please make it a proper patch with
SOB?  Please feel free to base it on top of the series.  It can
probably go right below the final patch but the rebase there should be
trivial.

> I think even this approach is flawed. If there are multiple children,
> it might happen that one child pushes up the bio early (as it is smaller
> size bio or group rate is high). And later other child pushes up the bio
> (bio was big or group limit was slow). We still lost time and effective
> rate will still be lower than anticipated.

Hmmm... I see.  Well, as long as it behaves acceptably on three level
nesting, I think it's good for now.

Here's RR dispatch patch I'm working on now.  I think it's almost
ready now.  I'll go over it one more time, split and merge it into the
patch series.

Thanks.

Index: work/block/blk-throttle.c
===================================================================
--- work.orig/block/blk-throttle.c
+++ work/block/blk-throttle.c
@@ -26,6 +26,29 @@ static struct blkcg_policy blkcg_policy_
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 
+/*
+ * To implement hierarchical throttling, throtl_grps form a tree and bios
+ * are dispatched upwards level by level until they reach the top and get
+ * issued.  When dispatching bios from the children and local group at each
+ * level, if the bios are dispatched into a single bio_list, there's a risk
+ * of a local or child group which can queue many bios at once filling up
+ * the list starving others.
+ *
+ * To avoid such starvation, dispatched bios are queued separately
+ * according to where they came from.  When they are again dispatched to
+ * the parent, they're popped in round-robin order so that no single source
+ * hogs the dispatch window.
+ *
+ * throtl_qnode is used to keep the queued bios separated by their sources.
+ * Bios are queued to throtl_qnode which in turn is queued to
+ * throtl_service_queue and then dispatched in round-robin order.
+ */
+struct throtl_qnode {
+	struct list_head	node;		/* service_queue->queued[] */
+	struct bio_list		bios;		/* queued bios */
+	struct throtl_grp	*tg;		/* tg this qnode belongs to */
+};
+
 struct throtl_service_queue {
 	struct throtl_service_queue *parent_sq;	/* the parent service_queue */
 
@@ -33,7 +56,7 @@ struct throtl_service_queue {
 	 * Bios queued directly to this service_queue or dispatched from
 	 * children throtl_grp's.
 	 */
-	struct bio_list		bio_lists[2];	/* queued bios [READ/WRITE] */
+	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
 	unsigned int		nr_queued[2];	/* number of queued bios */
 
 	/*
@@ -76,6 +99,17 @@ struct throtl_grp {
 	struct throtl_service_queue service_queue;
 
 	/*
+	 * qnode_on_self is used when bios are directly queued to this
+	 * throtl_grp so that local bios compete fairly with bios
+	 * dispatched from children.  qnode_on_parent is used when bios are
+	 * dispatched from this throtl_grp into its parent and will
+	 * compete with sibling qnode_on_parents and the parent's
+	 * qnode_on_self.
+	 */
+	struct throtl_qnode qnode_on_self;
+	struct throtl_qnode qnode_on_parent;
+
+	/*
 	 * Dispatch time in jiffies. This is the estimated time when group
 	 * will unthrottle and is ready to dispatch more bio. It is used as
 	 * key to sort active groups in service tree.
@@ -250,12 +284,81 @@ alloc_stats:
 		goto alloc_stats;
 }
 
+static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
+{
+	INIT_LIST_HEAD(&qn->node);
+	bio_list_init(&qn->bios);
+	qn->tg = tg;
+}
+
+/**
+ * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
+ * @bio: bio being added
+ * @qn: qnode to add bio to
+ * @queued: the service_queue->queued[] list @qn belongs to
+ */
+static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
+				 struct list_head *queued)
+{
+	bio_list_add(&qn->bios, bio);
+	if (list_empty(&qn->node)) {
+		list_add_tail(&qn->node, queued);
+		blkg_get(tg_to_blkg(qn->tg));
+	}
+}
+
+/**
+ * throtl_peek_queued - peek the first bio on a qnode list
+ * @queued: the qnode list to peek
+ */
+static struct bio *throtl_peek_queued(struct list_head *queued)
+{
+	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+	struct bio *bio;
+
+	if (list_empty(queued))
+		return NULL;
+
+	bio = bio_list_peek(&qn->bios);
+	WARN_ON_ONCE(!bio);
+	return bio;
+}
+
+/**
+ * throtl_pop_queued - pop the first bio from a qnode list
+ * @queued: the qnode list to pop a bio from
+ *
+ * Pop the first bio from the qnode list @queued.  After popping, the first
+ * qnode is removed from @queued if empty or moved to the end of @queued so
+ * that the popping order is round-robin.
+ */
+static struct bio *throtl_pop_queued(struct list_head *queued)
+{
+	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+	struct bio *bio;
+
+	if (list_empty(queued))
+		return NULL;
+
+	bio = bio_list_pop(&qn->bios);
+	WARN_ON_ONCE(!bio);
+
+	if (bio_list_empty(&qn->bios)) {
+		list_del_init(&qn->node);
+		blkg_put(tg_to_blkg(qn->tg));
+	} else {
+		list_move_tail(&qn->node, queued);
+	}
+
+	return bio;
+}
+
 /* init a service_queue, assumes the caller zeroed it */
 static void throtl_service_queue_init(struct throtl_service_queue *sq,
 				      struct throtl_service_queue *parent_sq)
 {
-	bio_list_init(&sq->bio_lists[0]);
-	bio_list_init(&sq->bio_lists[1]);
+	INIT_LIST_HEAD(&sq->queued[0]);
+	INIT_LIST_HEAD(&sq->queued[1]);
 	sq->pending_tree = RB_ROOT;
 	sq->parent_sq = parent_sq;
 	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
@@ -293,6 +396,9 @@ static void throtl_pd_init(struct blkcg_
 		parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
 
 	throtl_service_queue_init(&tg->service_queue, parent_sq);
+	throtl_qnode_init(&tg->qnode_on_self, tg);
+	throtl_qnode_init(&tg->qnode_on_parent, tg);
+
 	RB_CLEAR_NODE(&tg->rb_node);
 	tg->td = td;
 
@@ -752,7 +858,7 @@ static bool tg_may_dispatch(struct throt
 	 * queued.
 	 */
 	BUG_ON(tg->service_queue.nr_queued[rw] &&
-	       bio != bio_list_peek(&tg->service_queue.bio_lists[rw]));
+	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
 
 	/* If tg->bps = -1, then BW is unlimited */
 	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
@@ -843,11 +949,24 @@ static void throtl_charge_bio(struct thr
 	}
 }
 
-static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg)
+/**
+ * throtl_add_bio_tg - add a bio to the specified throtl_grp
+ * @bio: bio to add
+ * @qn: qnode to use
+ * @tg: the target throtl_grp
+ *
+ * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
+ * tg->qnode_on_self is used.
+ */
+static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
+			      struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	bool rw = bio_data_dir(bio);
 
+	if (!qn)
+		qn = &tg->qnode_on_self;
+
 	/*
 	 * If @tg doesn't currently have any bios queued in the same
 	 * direction, queueing @bio can change when @tg should be
@@ -857,9 +976,8 @@ static void throtl_add_bio_tg(struct bio
 	if (!sq->nr_queued[rw])
 		tg->flags |= THROTL_TG_WAS_EMPTY;
 
-	bio_list_add(&sq->bio_lists[rw], bio);
-	/* Take a bio reference on tg */
-	blkg_get(tg_to_blkg(tg));
+	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+
 	sq->nr_queued[rw]++;
 	throtl_enqueue_tg(tg);
 }
@@ -870,10 +988,10 @@ static void tg_update_disptime(struct th
 	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
 	struct bio *bio;
 
-	if ((bio = bio_list_peek(&sq->bio_lists[READ])))
+	if ((bio = throtl_peek_queued(&sq->queued[READ])))
 		tg_may_dispatch(tg, bio, &read_wait);
 
-	if ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
+	if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
 		tg_may_dispatch(tg, bio, &write_wait);
 
 	min_wait = min(read_wait, write_wait);
@@ -895,7 +1013,7 @@ static void tg_dispatch_one_bio(struct t
 	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
 	struct bio *bio;
 
-	bio = bio_list_pop(&sq->bio_lists[rw]);
+	bio = throtl_pop_queued(&sq->queued[rw]);
 	sq->nr_queued[rw]--;
 
 	throtl_charge_bio(tg, bio);
@@ -908,17 +1026,15 @@ static void tg_dispatch_one_bio(struct t
 	 * responsible for issuing these bios.
 	 */
 	if (parent_tg) {
-		throtl_add_bio_tg(bio, parent_tg);
+		throtl_add_bio_tg(bio, &tg->qnode_on_parent, parent_tg);
 	} else {
-		bio_list_add(&parent_sq->bio_lists[rw], bio);
+		throtl_qnode_add_bio(bio, &tg->qnode_on_parent,
+				     &parent_sq->queued[rw]);
 		BUG_ON(tg->td->nr_queued[rw] <= 0);
 		tg->td->nr_queued[rw]--;
 	}
 
 	throtl_trim_slice(tg, rw);
-
-	/* @bio is transferred to parent, drop its blkg reference */
-	blkg_put(tg_to_blkg(tg));
 }
 
 static int throtl_dispatch_tg(struct throtl_grp *tg)
@@ -931,7 +1047,7 @@ static int throtl_dispatch_tg(struct thr
 
 	/* Try to dispatch 75% READS and 25% WRITES */
 
-	while ((bio = bio_list_peek(&sq->bio_lists[READ])) &&
+	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
 		tg_dispatch_one_bio(tg, bio_data_dir(bio));
@@ -941,7 +1057,7 @@ static int throtl_dispatch_tg(struct thr
 			break;
 	}
 
-	while ((bio = bio_list_peek(&sq->bio_lists[WRITE])) &&
+	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
 		tg_dispatch_one_bio(tg, bio_data_dir(bio));
@@ -1076,10 +1192,9 @@ void blk_throtl_dispatch_work_fn(struct
 	bio_list_init(&bio_list_on_stack);
 
 	spin_lock_irq(q->queue_lock);
-	for (rw = READ; rw <= WRITE; rw++) {
-		bio_list_merge(&bio_list_on_stack, &td_sq->bio_lists[rw]);
-		bio_list_init(&td_sq->bio_lists[rw]);
-	}
+	for (rw = READ; rw <= WRITE; rw++)
+		while ((bio = throtl_pop_queued(&td_sq->queued[rw])))
+			bio_list_add(&bio_list_on_stack, bio);
 	spin_unlock_irq(q->queue_lock);
 
 	if (!bio_list_empty(&bio_list_on_stack)) {
@@ -1295,6 +1410,7 @@ static struct blkcg_policy blkcg_policy_
 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
 	struct throtl_data *td = q->td;
+	struct throtl_qnode *qn = NULL;
 	struct throtl_grp *tg;
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
@@ -1362,6 +1478,7 @@ bool blk_throtl_bio(struct request_queue
 		 * Climb up the ladder.  If we're already at the top, it
 		 * can be executed directly.
 		 */
+		qn = &tg->qnode_on_parent;
 		sq = sq->parent_sq;
 		tg = sq_to_tg(sq);
 		if (!tg)
@@ -1377,7 +1494,7 @@ bool blk_throtl_bio(struct request_queue
 
 	bio_associate_current(bio);
 	tg->td->nr_queued[rw]++;
-	throtl_add_bio_tg(bio, tg);
+	throtl_add_bio_tg(bio, qn, tg);
 	throttled = true;
 
 	/*
@@ -1421,9 +1538,9 @@ static void tg_drain_bios(struct throtl_
 
 		throtl_dequeue_tg(tg);
 
-		while ((bio = bio_list_peek(&sq->bio_lists[READ])))
+		while ((bio = throtl_peek_queued(&sq->queued[READ])))
 			tg_dispatch_one_bio(tg, bio_data_dir(bio));
-		while ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
+		while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
 			tg_dispatch_one_bio(tg, bio_data_dir(bio));
 	}
 }
@@ -1465,7 +1582,7 @@ void blk_throtl_drain(struct request_que
 
 	/* all bios now should be in td->service_queue, issue them */
 	for (rw = READ; rw <= WRITE; rw++)
-		while ((bio = bio_list_pop(&td->service_queue.bio_lists[rw])))
+		while ((bio = throtl_pop_queued(&td->service_queue.queued[rw])))
 			generic_make_request(bio);
 
 	spin_lock_irq(q->queue_lock);

  reply	other threads:[~2013-05-03 23:55 UTC|newest]

Thread overview: 68+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-05-02  0:39 [PATCHSET] blk-throttle: implement proper hierarchy support Tejun Heo
2013-05-02  0:39 ` [PATCH 01/31] blkcg: fix error return path in blkg_create() Tejun Heo
2013-05-02  0:39 ` [PATCH 02/31] blkcg: move blkg_for_each_descendant_pre() to block/blk-cgroup.h Tejun Heo
2013-05-02  0:39 ` [PATCH 03/31] blkcg: implement blkg_for_each_descendant_post() Tejun Heo
2013-05-02  0:39 ` [PATCH 04/31] blkcg: invoke blkcg_policy->pd_init() after parent is linked Tejun Heo
2013-05-02  0:39 ` [PATCH 05/31] blkcg: move bulk of blkcg_gq release operations to the RCU callback Tejun Heo
2013-05-02  0:39 ` [PATCH 06/31] blk-throttle: remove spurious throtl_enqueue_tg() call from throtl_select_dispatch() Tejun Heo
2013-05-02  0:39 ` [PATCH 07/31] blk-throttle: removed deferred config application mechanism Tejun Heo
2013-05-02 14:49   ` Vivek Goyal
2013-05-02 17:27     ` Tejun Heo
2013-05-02  0:39 ` [PATCH 08/31] blk-throttle: collapse throtl_dispatch() into the work function Tejun Heo
2013-05-02  0:39 ` [PATCH 09/31] blk-throttle: relocate throtl_schedule_delayed_work() Tejun Heo
2013-05-02  0:39 ` [PATCH 10/31] blk-throttle: remove pointless throtl_nr_queued() optimizations Tejun Heo
2013-05-02  0:39 ` [PATCH 11/31] blk-throttle: rename throtl_rb_root to throtl_service_queue Tejun Heo
2013-05-02  0:39 ` [PATCH 12/31] blk-throttle: simplify throtl_grp flag handling Tejun Heo
2013-05-02  0:39 ` [PATCH 13/31] blk-throttle: add backlink pointer from throtl_grp to throtl_data Tejun Heo
2013-05-02  0:39 ` [PATCH 14/31] blk-throttle: pass around throtl_service_queue instead of throtl_data Tejun Heo
2013-05-02  0:39 ` [PATCH 15/31] blk-throttle: reorganize throtl_service_queue passed around as argument Tejun Heo
2013-05-02 15:21   ` Vivek Goyal
2013-05-02 17:29     ` Tejun Heo
2013-05-02  0:39 ` [PATCH 16/31] blk-throttle: add throtl_grp->service_queue Tejun Heo
2013-05-02  0:39 ` [PATCH 17/31] blk-throttle: move bio_lists[] and friends to throtl_service_queue Tejun Heo
2013-05-02  0:39 ` [PATCH 18/31] blk-throttle: dispatch to throtl_data->service_queue.bio_lists[] Tejun Heo
2013-05-02  0:39 ` [PATCH 19/31] blk-throttle: generalize update_disptime optimization in blk_throtl_bio() Tejun Heo
2013-05-02  0:39 ` [PATCH 20/31] blk-throttle: add throtl_service_queue->parent_sq Tejun Heo
2013-05-02  0:39 ` [PATCH 21/31] blk-throttle: implement sq_to_tg(), sq_to_td() and throtl_log() Tejun Heo
2013-05-06 17:36   ` Vivek Goyal
2013-05-06 18:38     ` Tejun Heo
2013-05-06 20:38     ` Tejun Heo
2013-05-06 20:39       ` Tejun Heo
2013-05-06 20:41       ` Vivek Goyal
2013-05-06 20:43         ` Tejun Heo
2013-05-02  0:39 ` [PATCH 22/31] blk-throttle: set REQ_THROTTLED from throtl_charge_bio() and gate stats update with it Tejun Heo
2013-05-02  0:39 ` [PATCH 23/31] blk-throttle: separate out throtl_service_queue->pending_timer from throtl_data->dispatch_work Tejun Heo
2013-05-02  0:39 ` [PATCH 24/31] blk-throttle: implement dispatch looping Tejun Heo
2013-05-02  0:39 ` [PATCH 25/31] blk-throttle: dispatch from throtl_pending_timer_fn() Tejun Heo
2013-05-02  0:39 ` [PATCH 26/31] blk-throttle: make blk_throtl_drain() ready for hierarchy Tejun Heo
2013-05-02  0:39 ` [PATCH 27/31] blk-throttle: make blk_throtl_bio() " Tejun Heo
2013-05-02  0:39 ` [PATCH 28/31] blk-throttle: make tg_dispatch_one_bio() " Tejun Heo
2013-05-02  0:39 ` [PATCH 29/31] blk-throttle: make throtl_pending_timer_fn() " Tejun Heo
2013-05-02  0:39 ` [PATCH 30/31] blk-throttle: implement throtl_grp->has_rules[] Tejun Heo
2013-05-02  0:39 ` [PATCH 31/31] blk-throttle: implement proper hierarchy support Tejun Heo
2013-05-02 17:34 ` [PATCHSET] " Vivek Goyal
2013-05-02 17:57   ` Tejun Heo
2013-05-02 18:17     ` Vivek Goyal
2013-05-02 18:29       ` Tejun Heo
2013-05-02 18:45         ` Vivek Goyal
2013-05-02 18:49           ` Tejun Heo
2013-05-02 19:07             ` Vivek Goyal
2013-05-02 19:11               ` Tejun Heo
2013-05-02 19:31                 ` Vivek Goyal
2013-05-02 23:13                   ` Tejun Heo
2013-05-03 17:56                     ` Vivek Goyal
2013-05-03 18:57                       ` Tejun Heo
2013-05-03 18:58                         ` Tejun Heo
2013-05-03 19:08                         ` Vivek Goyal
2013-05-03 19:14                           ` Tejun Heo
2013-05-03 19:26                             ` Vivek Goyal
2013-05-03 21:05                             ` Vivek Goyal
2013-05-03 23:54                               ` Tejun Heo [this message]
2013-05-06 17:33                                 ` Vivek Goyal
2013-05-02 18:08   ` Vivek Goyal
2013-05-02 18:44     ` Tejun Heo
2013-05-02 18:59       ` Vivek Goyal
2013-05-04  0:50 ` [PATCH 29.5/32] blk-throttle: add throtl_qnode for dispatch fairness Tejun Heo
2013-05-04  0:53   ` Tejun Heo
2013-05-06 16:00   ` Vivek Goyal
2013-05-06 18:35     ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130503235455.GE22860@mtj.dyndns.org \
    --to=tj@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=cgroups@vger.kernel.org \
    --cc=containers@lists.linux-foundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lizefan@huawei.com \
    --cc=vgoyal@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).