All of lore.kernel.org
 help / color / mirror / Atom feed
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
To: Vivek Goyal <vgoyal@redhat.com>, Jens Axboe <axboe@kernel.dk>
Cc: Corrado Zoccolo <czoccolo@gmail.com>,
	Chad Talbott <ctalbott@google.com>,
	Nauman Rafique <nauman@google.com>,
	Divyesh Shah <dpshah@google.com>,
	linux kernel mailing list <linux-kernel@vger.kernel.org>,
	Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Subject: [RFC] [PATCH 3/8] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
Date: Mon, 15 Nov 2010 08:53:54 +0800	[thread overview]
Message-ID: <4CE084A2.7010701@cn.fujitsu.com> (raw)

Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
maps to a range [100,1000]. It also gets rid of cfq_slice_offset() logic and makes
use the same scheduling algorithm as CFQ group does. This helps for CFQ queue and
group scheduling on the same service tree.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/cfq-iosched.c |  194 ++++++++++++++++++++++++++++++++++----------------
 1 files changed, 132 insertions(+), 62 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5cce1e8..ef88931 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -102,10 +102,7 @@ struct io_sched_entity {
 	struct cfq_rb_root *service_tree;
 	/* service_tree member */
 	struct rb_node rb_node;
-	/* service_tree key, represent the position on the tree */
-	unsigned long rb_key;
-
-	/* group service_tree key */
+	/* service_tree key */
 	u64 vdisktime;
 	bool on_st;
 	bool is_group_entity;
@@ -118,6 +115,8 @@ struct io_sched_entity {
 struct cfq_queue {
 	/* The schedule entity */
 	struct io_sched_entity queue_entity;
+	/* Reposition time */
+	unsigned long reposition_time;
 	/* reference count */
 	atomic_t ref;
 	/* various state flags, see below */
@@ -306,6 +305,22 @@ struct cfq_data {
 	struct rcu_head rcu;
 };
 
+/*
+ * Map io priority(7 ~ 0) to io weight(100 ~ 1000)
+ */
+static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
+{
+	unsigned int step;
+
+	BUG_ON(ioprio >= IOPRIO_BE_NR);
+
+	step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
+	if (ioprio == 0)
+		return BLKIO_WEIGHT_MAX;
+
+	return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
+}
+
 static inline struct cfq_queue *
 cfqq_of_entity(struct io_sched_entity *io_entity)
 {
@@ -551,12 +566,13 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
 
-static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
+static inline u64
+cfq_scale_slice(unsigned long delta, struct io_sched_entity *entity)
 {
 	u64 d = delta << CFQ_SERVICE_SHIFT;
 
 	d = d * BLKIO_WEIGHT_DEFAULT;
-	do_div(d, cfqg->group_entity.weight);
+	do_div(d, entity->weight);
 	return d;
 }
 
@@ -581,16 +597,16 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
 static void update_min_vdisktime(struct cfq_rb_root *st)
 {
 	u64 vdisktime = st->min_vdisktime;
-	struct io_sched_entity *group_entity;
+	struct io_sched_entity *entity;
 
 	if (st->active) {
-		group_entity = rb_entry_entity(st->active);
-		vdisktime = group_entity->vdisktime;
+		entity = rb_entry_entity(st->active);
+		vdisktime = entity->vdisktime;
 	}
 
 	if (st->left) {
-		group_entity = rb_entry_entity(st->left);
-		vdisktime = min_vdisktime(vdisktime, group_entity->vdisktime);
+		entity = rb_entry_entity(st->left);
+		vdisktime = min_vdisktime(vdisktime, entity->vdisktime);
 	}
 
 	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
@@ -838,16 +854,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
 }
 
-static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
-				      struct cfq_queue *cfqq)
-{
-	/*
-	 * just an approximation, should be ok.
-	 */
-	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
-		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
-}
-
 static inline s64
 entity_key(struct cfq_rb_root *st, struct io_sched_entity *entity)
 {
@@ -983,7 +989,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 
 	/* Can't update vdisktime while group is on service tree */
 	cfq_rb_erase(&group_entity->rb_node, st);
-	group_entity->vdisktime += cfq_scale_slice(charge, cfqg);
+	group_entity->vdisktime += cfq_scale_slice(charge, group_entity);
 	__cfq_group_service_tree_add(st, cfqg);
 
 	/* This group is being expired. Save the context */
@@ -1214,13 +1220,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	struct io_sched_entity *queue_entity;
 	struct rb_node **p, *parent;
 	struct io_sched_entity *__queue_entity;
-	unsigned long rb_key;
-	struct cfq_rb_root *service_tree;
+	struct cfq_rb_root *service_tree, *orig_st;
 	int left;
 	int new_cfqq = 1;
 	int group_changed = 0;
+	s64 key;
 
 	queue_entity = &cfqq->queue_entity;
+	orig_st = queue_entity->service_tree;
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	if (!cfqd->cfq_group_isolation
@@ -1228,8 +1235,16 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	    && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
 		/* Move this cfq to root group */
 		cfq_log_cfqq(cfqd, cfqq, "moving to root group");
-		if (!RB_EMPTY_NODE(&queue_entity->rb_node))
+		if (!RB_EMPTY_NODE(&queue_entity->rb_node)) {
 			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+			/*
+			 * Group changed, dequeue this CFQ queue from the
+			 * original service tree.
+			 */
+			cfq_rb_erase(&queue_entity->rb_node,
+				     queue_entity->service_tree);
+			orig_st->total_weight -= queue_entity->weight;
+		}
 		cfqq->orig_cfqg = cfqq->cfqg;
 		cfqq->cfqg = &cfqd->root_group;
 		atomic_inc(&cfqd->root_group.ref);
@@ -1238,8 +1253,16 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
 		/* cfqq is sequential now needs to go to its original group */
 		BUG_ON(cfqq->cfqg != &cfqd->root_group);
-		if (!RB_EMPTY_NODE(&queue_entity->rb_node))
+		if (!RB_EMPTY_NODE(&queue_entity->rb_node)) {
 			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+			/*
+			 * Group changed, dequeue this CFQ queue from the
+			 * original service tree.
+			 */
+			cfq_rb_erase(&queue_entity->rb_node,
+				     queue_entity->service_tree);
+			orig_st->total_weight -= queue_entity->weight;
+		}
 		cfq_put_cfqg(cfqq->cfqg);
 		cfqq->cfqg = cfqq->orig_cfqg;
 		cfqq->orig_cfqg = NULL;
@@ -1250,50 +1273,67 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
 						cfqq_type(cfqq));
+
+	/*
+	 * For the time being, put the newly added CFQ queue at the end of the
+	 * service tree.
+	 */
+	if (RB_EMPTY_NODE(&queue_entity->rb_node)) {
+		/*
+		 * If this CFQ queue moves to another group, the original
+		 * vdisktime makes no sense any more, reset the vdisktime
+		 * here.
+		 */
+		parent = rb_last(&service_tree->rb);
+		if (parent) {
+			__queue_entity = rb_entry_entity(parent);
+			queue_entity->vdisktime = __queue_entity->vdisktime +
+						  CFQ_IDLE_DELAY;
+		} else
+			queue_entity->vdisktime = service_tree->min_vdisktime;
+
+		goto insert;
+	}
+
+	/*
+	 * Ok, we get here, this CFQ queue is on the service tree, dequeue it
+	 * firstly.
+	 */
+	cfq_rb_erase(&queue_entity->rb_node,
+		     queue_entity->service_tree);
+	orig_st->total_weight -= queue_entity->weight;
+
+	new_cfqq = 0;
 	if (cfq_class_idle(cfqq)) {
-		rb_key = CFQ_IDLE_DELAY;
 		parent = rb_last(&service_tree->rb);
 		if (parent && parent != &queue_entity->rb_node) {
 			__queue_entity = rb_entry(parent,
 						  struct io_sched_entity,
 						  rb_node);
-			rb_key += __queue_entity->rb_key;
+			queue_entity->vdisktime = __queue_entity->vdisktime +
+						  CFQ_IDLE_DELAY;
 		} else
-			rb_key += jiffies;
+			queue_entity->vdisktime = service_tree->min_vdisktime;
 	} else if (!add_front) {
 		/*
-		 * Get our rb key offset. Subtract any residual slice
-		 * value carried from last service. A negative resid
-		 * count indicates slice overrun, and this should position
-		 * the next service time further away in the tree.
-		 */
-		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
-		rb_key -= cfqq->slice_resid;
-		cfqq->slice_resid = 0;
-	} else {
-		rb_key = -HZ;
-		__queue_entity = cfq_rb_first(service_tree);
-		rb_key += __queue_entity ? __queue_entity->rb_key : jiffies;
-	}
-
-	if (!RB_EMPTY_NODE(&queue_entity->rb_node)) {
-		new_cfqq = 0;
-		/*
-		 * same position, nothing more to do
+		 * We charge the CFQ queue by the time this queue runs, and
+		 * repsition it on the service tree.
 		 */
-		if (rb_key == queue_entity->rb_key &&
-		    queue_entity->service_tree == service_tree)
-			return;
+		unsigned int used_sl;
 
-		cfq_rb_erase(&queue_entity->rb_node,
-			     queue_entity->service_tree);
-		queue_entity->service_tree = NULL;
+		used_sl = cfq_cfqq_slice_usage(cfqq);
+		queue_entity->vdisktime += cfq_scale_slice(used_sl,
+							   queue_entity);
+	} else {
+		queue_entity->vdisktime = service_tree->min_vdisktime;
 	}
 
+insert:
 	left = 1;
 	parent = NULL;
 	queue_entity->service_tree = service_tree;
 	p = &service_tree->rb.rb_node;
+	key = entity_key(service_tree, queue_entity);
 	while (*p) {
 		struct rb_node **n;
 
@@ -1304,7 +1344,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		/*
 		 * sort by key, that represents service time.
 		 */
-		if (time_before(rb_key, __queue_entity->rb_key))
+		if (key < entity_key(service_tree, __queue_entity))
 			n = &(*p)->rb_left;
 		else {
 			n = &(*p)->rb_right;
@@ -1317,10 +1357,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (left)
 		service_tree->left = &queue_entity->rb_node;
 
-	queue_entity->rb_key = rb_key;
 	rb_link_node(&queue_entity->rb_node, parent, p);
 	rb_insert_color(&queue_entity->rb_node, &service_tree->rb);
+	update_min_vdisktime(service_tree);
 	service_tree->count++;
+	service_tree->total_weight += queue_entity->weight;
+	cfqq->reposition_time = jiffies;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
 	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1422,15 +1464,19 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	struct io_sched_entity *queue_entity;
+	struct cfq_rb_root *service_tree;
+
 	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 	cfq_clear_cfqq_on_rr(cfqq);
 
 	queue_entity = &cfqq->queue_entity;
+	service_tree = queue_entity->service_tree;
 
 	if (!RB_EMPTY_NODE(&queue_entity->rb_node)) {
 		cfq_rb_erase(&queue_entity->rb_node,
 			     queue_entity->service_tree);
+		service_tree->total_weight -= queue_entity->weight;
 		queue_entity->service_tree = NULL;
 	}
 	if (cfqq->p_root) {
@@ -2132,24 +2178,35 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
 	}
 }
 
+/*
+ * The time when a CFQ queue is put onto a service tree is recoreded in
+ * cfqq->reposition_time. Currently, we check the first priority CFQ queues
+ * on each service tree, and select the workload type that contain the lowest
+ * reposition_time CFQ queue among them.
+ */
 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 				struct cfq_group *cfqg, enum wl_prio_t prio)
 {
 	struct io_sched_entity *queue_entity;
+	struct cfq_queue *cfqq;
+	unsigned long lowest_start_time;
 	int i;
-	bool key_valid = false;
-	unsigned long lowest_key = 0;
+	bool time_valid = false;
 	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
 
+	/*
+	 * TODO: We may take io priority into account when choosing a workload
+	 * type. But for the time being just make use of reposition_time only.
+	 */
 	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
-		/* select the one with lowest rb_key */
 		queue_entity = cfq_rb_first(service_tree_for(cfqg, prio, i));
+		cfqq = cfqq_of_entity(queue_entity);
 		if (queue_entity &&
-		    (!key_valid ||
-		     time_before(queue_entity->rb_key, lowest_key))) {
-			lowest_key = queue_entity->rb_key;
+		    (!time_valid ||
+		     cfqq->reposition_time < lowest_start_time)) {
+			lowest_start_time = cfqq->reposition_time;
 			cur_best = i;
-			key_valid = true;
+			time_valid = true;
 		}
 	}
 
@@ -2811,10 +2868,13 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 {
 	struct task_struct *tsk = current;
 	int ioprio_class;
+	struct io_sched_entity *queue_entity;
 
 	if (!cfq_cfqq_prio_changed(cfqq))
 		return;
 
+	queue_entity = &cfqq->queue_entity;
+
 	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
 	switch (ioprio_class) {
 	default:
@@ -2841,6 +2901,8 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 		break;
 	}
 
+	queue_entity->weight = cfq_prio_to_weight(cfqq->ioprio);
+
 	/*
 	 * keep track of original prio settings in case we have to temporarily
 	 * elevate the priority of this queue
@@ -3571,6 +3633,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
  */
 static void cfq_prio_boost(struct cfq_queue *cfqq)
 {
+	struct io_sched_entity *queue_entity;
+
+	queue_entity = &cfqq->queue_entity;
 	if (has_fs_excl()) {
 		/*
 		 * boost idle prio on transactions that would lock out other
@@ -3587,6 +3652,11 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
 		cfqq->ioprio_class = cfqq->org_ioprio_class;
 		cfqq->ioprio = cfqq->org_ioprio;
 	}
+
+	/*
+	 * update the io weight if io priority gets changed.
+	 */
+	queue_entity->weight = cfq_prio_to_weight(cfqq->ioprio);
 }
 
 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
-- 
1.6.5.2






-- 
Regards
Gui Jianfeng

             reply	other threads:[~2010-11-15  0:54 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-11-15  0:53 Gui Jianfeng [this message]
     [not found] <4CDF7BC5.9080803@cn.fujitsu.com>
     [not found] ` <4CDF9CD8.8010207@cn.fujitsu.com>
     [not found]   ` <20101115193352.GB3396@redhat.com>
2010-11-29  2:32     ` [RFC] [PATCH 3/8] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4CE084A2.7010701@cn.fujitsu.com \
    --to=guijianfeng@cn.fujitsu.com \
    --cc=axboe@kernel.dk \
    --cc=ctalbott@google.com \
    --cc=czoccolo@gmail.com \
    --cc=dpshah@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=nauman@google.com \
    --cc=vgoyal@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.