[PATCH 5/6 v3] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
To: Vivek Goyal <vgoyal@redhat.com>, Jens Axboe <axboe@kernel.dk>
Cc: Gui Jianfeng <guijianfeng@cn.fujitsu.com>,
	linux kernel mailing list <linux-kernel@vger.kernel.org>,
	Corrado Zoccolo <czoccolo@gmail.com>,
	Chad Talbott <ctalbott@google.com>,
	Nauman Rafique <nauman@google.com>,
	Divyesh Shah <dpshah@google.com>,
	jmoyer@redhat.com, Shaohua Li <shaohua.li@intel.com>
Subject: [PATCH 5/6 v3] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface
Date: Mon, 27 Dec 2010 16:51:14 +0800	[thread overview]
Message-ID: <4D185382.8080601@cn.fujitsu.com> (raw)
In-Reply-To: <4D180ECE.4000305@cn.fujitsu.com>

CFQ group hierarchical scheduling and use_hierarchy interface.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
 block/blk-cgroup.c  |   61 ++++++-
 block/blk-cgroup.h  |    3 +
 block/cfq-iosched.c |  570 ++++++++++++++++++++++++++++++++++++---------------
 3 files changed, 469 insertions(+), 165 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a..c55fecd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -25,7 +25,10 @@
 static DEFINE_SPINLOCK(blkio_list_lock);
 static LIST_HEAD(blkio_list);
 
-struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
+struct blkio_cgroup blkio_root_cgroup = {
+	.weight = 2*BLKIO_WEIGHT_DEFAULT,
+	.use_hierarchy = 0
+};
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
@@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
 	blkg->blkcg_id = 0;
 }
 
+
 /*
  * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
  * indicating that blk_group was unhashed by the time we got to it.
@@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 }
 EXPORT_SYMBOL_GPL(blkcg_get_weight);
 
+unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
+{
+	return blkcg->use_hierarchy;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
+
 uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
 {
 	struct blkio_policy_node *pn;
@@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
 		switch(name) {
 		case BLKIO_PROP_weight:
 			return (u64)blkcg->weight;
+		case BLKIO_PROP_use_hierarchy:
+			return (u64)blkcg->use_hierarchy;
 		}
 		break;
 	default:
@@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
 	return 0;
 }
 
+static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
+{
+	struct cgroup *parent = cgrp->parent;
+	struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
+	int ret = 0;
+
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	blkcg = cgroup_to_blkio_cgroup(cgrp);
+	if (parent)
+		parent_blkcg = cgroup_to_blkio_cgroup(parent);
+
+	cgroup_lock();
+	/*
+	 * If parent's use_hierarchy is set, we can't make any modifications
+	 * in the child subtrees. If it is unset, then the change can occur,
+	 * provided the current cgroup has no children.
+	 */
+	if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
+		if (list_empty(&cgrp->children))
+			blkcg->use_hierarchy = val;
+		else
+			ret = -EBUSY;
+	} else
+		ret = -EINVAL;
+	cgroup_unlock();
+	return ret;
+}
+
 static int
 blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
@@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 		switch(name) {
 		case BLKIO_PROP_weight:
 			return blkio_weight_write(blkcg, val);
+		case BLKIO_PROP_use_hierarchy:
+			return blkio_use_hierarchy_write(cgrp, val);
 		}
 		break;
 	default:
@@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
 		.name = "reset_stats",
 		.write_u64 = blkiocg_reset_stats,
 	},
+	{
+		.name = "use_hierarchy",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+					     BLKIO_PROP_use_hierarchy),
+		.read_u64 = blkiocg_file_read_u64,
+		.write_u64 = blkiocg_file_write_u64,
+	},
 #ifdef CONFIG_BLK_DEV_THROTTLING
 	{
 		.name = "throttle.read_bps_device",
@@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 static struct cgroup_subsys_state *
 blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 {
-	struct blkio_cgroup *blkcg;
+	struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
 	struct cgroup *parent = cgroup->parent;
 
 	if (!parent) {
@@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		goto done;
 	}
 
+	parent_blkcg = cgroup_to_blkio_cgroup(parent);
 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 	if (!blkcg)
 		return ERR_PTR(-ENOMEM);
@@ -1462,6 +1514,11 @@ done:
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 
 	INIT_LIST_HEAD(&blkcg->policy_list);
+	if (parent)
+		blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
+	else
+		blkcg->use_hierarchy = 0;
+
 	return &blkcg->css;
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861b..5b4b351 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
 	BLKIO_PROP_idle_time,
 	BLKIO_PROP_empty_time,
 	BLKIO_PROP_dequeue,
+	BLKIO_PROP_use_hierarchy,
 };
 
 /* cgroup files owned by throttle policy */
@@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
 	unsigned int weight;
+	bool use_hierarchy;
 	spinlock_t lock;
 	struct hlist_head blkg_list;
 	struct list_head policy_list; /* list of blkio_policy_node */
@@ -179,6 +181,7 @@ struct blkio_policy_node {
 
 extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 				     dev_t dev);
+extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
 extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
 				     dev_t dev);
 extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1f21418..727d6e6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -104,6 +104,9 @@ struct cfq_entity {
 	u64 vdisktime;
 	bool is_group_entity;
 	unsigned int weight;
+	struct cfq_entity *parent;
+	/* Reposition time */
+	unsigned long reposition_time;
 };
 
 /*
@@ -112,8 +115,6 @@ struct cfq_entity {
 struct cfq_queue {
 	/* The schedule entity */
 	struct cfq_entity cfqe;
-	/* Reposition time */
-	unsigned long reposition_time;
 	/* reference count */
 	atomic_t ref;
 	/* various state flags, see below */
@@ -193,6 +194,9 @@ struct cfq_group {
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
 
+	/* number of sub cfq groups */
+	int nr_subgp;
+
 	/*
 	 * Per group busy queus average. Useful for workload slice calc. We
 	 * create the array for each prio class but at run time it is used
@@ -228,10 +232,11 @@ struct cfq_group {
  */
 struct cfq_data {
 	struct request_queue *queue;
-	/* Root service tree for cfq_groups */
-	struct cfq_rb_root grp_service_tree;
 	struct cfq_group root_group;
 
+	/* cfq group schedule in flat or hierarchy manner. */
+	bool use_hierarchy;
+
 	/*
 	 * The priority currently being served
 	 */
@@ -240,6 +245,9 @@ struct cfq_data {
 	unsigned long workload_expires;
 	struct cfq_group *serving_group;
 
+	/* Service tree for cfq group flat scheduling mode. */
+	struct cfq_rb_root grp_service_tree;
+
 	/*
 	 * Each priority tree is sorted by next_request position.  These
 	 * trees are used when determining if two or more queues are
@@ -349,8 +357,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
 }
 
 
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
-
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
 					    enum wl_prio_t prio,
 					    enum wl_type_t type)
@@ -640,10 +646,19 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
 static inline unsigned
 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
+	struct cfq_rb_root *st = cfqe->service_tree;
+	int group_slice = cfq_target_latency;
+
+	/* Calculate group slice in a hierarchical way */
+	do {
+		group_slice = group_slice * cfqe->weight / st->total_weight;
+		cfqe = cfqe->parent;
+		if (cfqe)
+			st = cfqe->service_tree;
+	} while (cfqe);
 
-	return cfq_target_latency * cfqe->weight / st->total_weight;
+	return group_slice;
 }
 
 static inline void
@@ -666,7 +681,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 			/* scale low_slice according to IO priority
 			 * and sync vs async */
 			unsigned low_slice =
-				min(slice, base_low_slice * slice / sync_slice);
+				min(slice, base_low_slice * slice /
+				    sync_slice);
 			/* the adapted slice value is scaled to fit all iqs
 			 * into the target latency */
 			slice = max(slice * group_slice / expect_latency,
@@ -806,17 +822,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
 	return NULL;
 }
 
-static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
-{
-	if (!root->left)
-		root->left = rb_first(&root->rb);
-
-	if (root->left)
-		return rb_entry_entity(root->left);
-
-	return NULL;
-}
-
 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 {
 	rb_erase(n, root);
@@ -890,12 +895,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 
 	rb_link_node(&cfqe->rb_node, parent, node);
 	rb_insert_color(&cfqe->rb_node, &st->rb);
+
+	update_min_vdisktime(st);
 }
 
 static void
 cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 {
 	__cfq_entity_service_tree_add(st, cfqe);
+	cfqe->reposition_time = jiffies;
 	st->count++;
 	st->total_weight += cfqe->weight;
 }
@@ -903,34 +911,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 static void
 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
-	struct cfq_entity *__cfqe;
 	struct rb_node *n;
+	struct cfq_entity *entity;
+	struct cfq_rb_root *st;
+	struct cfq_group *__cfqg;
 
 	cfqg->nr_cfqq++;
+
 	if (!RB_EMPTY_NODE(&cfqe->rb_node))
 		return;
 
 	/*
-	 * Currently put the group at the end. Later implement something
-	 * so that groups get lesser vtime based on their weights, so that
-	 * if group does not loose all if it was not continously backlogged.
+	 * Enqueue this group and its ancestors onto their service tree.
 	 */
-	n = rb_last(&st->rb);
-	if (n) {
-		__cfqe = rb_entry_entity(n);
-		cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
-	} else
-		cfqe->vdisktime = st->min_vdisktime;
+	while (cfqe) {
+		if (!RB_EMPTY_NODE(&cfqe->rb_node))
+			return;
+
+		/*
+		 * Currently put the group at the end. Later implement
+		 * something so that groups get lesser vtime based on
+		 * their weights, so that if group does not loose all
+		 * if it was not continously backlogged.
+		 */
+		st = cfqe->service_tree;
+		n = rb_last(&st->rb);
+		if (n) {
+			entity = rb_entry_entity(n);
+			cfqe->vdisktime = entity->vdisktime +
+				CFQ_IDLE_DELAY;
+		} else
+			cfqe->vdisktime = st->min_vdisktime;
 
-	cfq_entity_service_tree_add(st, cfqe);
+		cfq_entity_service_tree_add(st, cfqe);
+		cfqe = cfqe->parent;
+		__cfqg = cfqg_of_entity(cfqe);
+		if (__cfqg)
+			__cfqg->nr_subgp++;
+	}
 }
 
 static void
 __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 {
 	cfq_rb_erase(&cfqe->rb_node, st);
+	update_min_vdisktime(st);
 }
 
 static void
@@ -939,27 +965,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
 	if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
 		__cfq_entity_service_tree_del(st, cfqe);
 		st->total_weight -= cfqe->weight;
-		cfqe->service_tree = NULL;
 	}
 }
 
 static void
 cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_entity *cfqe = &cfqg->cfqe;
+	struct cfq_group *__cfqg, *p_cfqg;
 
 	BUG_ON(cfqg->nr_cfqq < 1);
 	cfqg->nr_cfqq--;
 
-	/* If there are other cfq queues under this group, don't delete it */
-	if (cfqg->nr_cfqq)
+	/*
+	 * If there are other cfq queues under this group, or there are other
+	 * cfq groups under this group, don't delete it.
+	 */
+	if (cfqg->nr_cfqq || cfqg->nr_subgp)
 		return;
 
-	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	cfq_entity_service_tree_del(st, cfqe);
-	cfqg->saved_workload_slice = 0;
-	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+	/*
+	 * Dequeue this group and its ancestors from their service
+	 * tree.
+	 */
+	while (cfqe) {
+		__cfqg = cfqg_of_entity(cfqe);
+		p_cfqg = cfqg_of_entity(cfqe->parent);
+		cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
+		cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
+		cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
+		__cfqg->saved_workload_slice = 0;
+		cfqe = cfqe->parent;
+		if (p_cfqg) {
+			p_cfqg->nr_subgp--;
+			if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+				return;
+		}
+	}
 }
 
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -991,7 +1033,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 				struct cfq_queue *cfqq)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	unsigned int used_sl, charge;
 	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
 			- cfqg->service_tree_idle.count;
@@ -1005,10 +1046,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
 		charge = cfqq->allocated_slice;
 
-	/* Can't update vdisktime while group is on service tree */
-	__cfq_entity_service_tree_del(st, cfqe);
-	cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
-	__cfq_entity_service_tree_add(st, cfqe);
+	/*
+	 * Update the vdisktime on the whole chain.
+	 */
+	while (cfqe) {
+		struct cfq_rb_root *st = cfqe->service_tree;
+
+		/*
+		 * Can't update vdisktime while group is on service
+		 * tree.
+		 */
+		__cfq_entity_service_tree_del(st, cfqe);
+		cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
+		__cfq_entity_service_tree_add(st, cfqe);
+		st->count++;
+		cfqe->reposition_time = jiffies;
+		cfqe = cfqe->parent;
+	}
 
 	/* This group is being expired. Save the context */
 	if (time_after(cfqd->workload_expires, jiffies)) {
@@ -1020,7 +1074,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		cfqg->saved_workload_slice = 0;
 
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
-		     cfqe->vdisktime, st->min_vdisktime);
+		     cfqg->cfqe.vdisktime,
+		     cfqg->cfqe.service_tree->min_vdisktime);
 	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
 			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
 			iops_mode(cfqd), cfqq->nr_sectors);
@@ -1042,35 +1097,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
 	cfqg_of_blkg(blkg)->cfqe.weight = weight;
 }
 
-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void init_cfqe(struct blkio_cgroup *blkcg,
+				    struct cfq_group *cfqg)
+{
+	struct cfq_entity *cfqe = &cfqg->cfqe;
+
+	cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	RB_CLEAR_NODE(&cfqe->rb_node);
+	cfqe->is_group_entity = true;
+	cfqe->parent = NULL;
+}
+
+static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
+		      struct cfq_group *cfqg)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
-	struct cfq_group *cfqg = NULL;
-	void *key = cfqd;
 	int i, j;
 	struct cfq_rb_root *st;
-	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 	unsigned int major, minor;
-
-	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
-	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfqg->blkg.dev = MKDEV(major, minor);
-		goto done;
-	}
-	if (cfqg || !create)
-		goto done;
-
-	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
-	if (!cfqg)
-		goto done;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
 
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
-
-	cfqg->cfqe.is_group_entity = true;
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -1080,24 +1127,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 	 */
 	atomic_set(&cfqg->ref, 1);
 
+	/* Add group onto cgroup list */
+	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+	cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+				    MKDEV(major, minor));
+	/* Initiate group entity */
+	init_cfqe(blkcg, cfqg);
+	/* Add group on cfqd list */
+	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
+
+static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+	if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
+		cfq_destroy_cfqg(cfqd, cfqg);
+}
+
+static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
+			    struct cfq_group *p_cfqg)
+{
+	struct cfq_entity *cfqe, *p_cfqe;
+
+	cfqe = &cfqg->cfqe;
+
 	/*
-	 * Add group onto cgroup list. It might happen that bdi->dev is
-	 * not initiliazed yet. Initialize this new group without major
-	 * and minor info and this info will be filled in once a new thread
-	 * comes for IO. See code above.
+	 * 1. If use_hierarchy of the CGroup where cfqg's parent stays is not
+	 *    set, we put this cfqg onto global service tree.
+	 * 2. If cfqg is root cfqg, put it onto global service tree.
 	 */
-	if (bdi->dev) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-					MKDEV(major, minor));
-	} else
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-					0);
+	if (!p_cfqg) {
+		cfqe->service_tree = &cfqd->grp_service_tree;
+		cfqe->parent = NULL;
+		return;
+	}
 
-	cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+	p_cfqe = &p_cfqg->cfqe;
 
-	/* Add group on cfqd list */
-	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+	cfqe->parent = p_cfqe;
+
+	/*
+	 * Currently, just put cfq group entity on "BE:SYNC" workload
+	 * service tree.
+	 */
+	cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
+						      SYNC_WORKLOAD);
+	/* child reference */
+	atomic_inc(&p_cfqg->ref);
+}
+
+static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
+{
+	struct cfq_entity *cfqe, *p_cfqe;
+
+	if (!cfqg)
+		return NULL;
+
+	cfqe = &cfqg->cfqe;
+	p_cfqe = cfqe->parent;
+	if (!p_cfqe)
+		return NULL;
+
+	return cfqg_of_entity(p_cfqe);
+}
+
+static struct cfq_group *
+cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	unsigned int major, minor;
+	struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
+	void *key = cfqd;
+
+	/*
+	 * If CGroup's use_hierarchy is unset, we just need to allocate only
+	 * one CFQ group, and this group will put onto the "grp_service_tree".
+	 * We don't need to check whether the cfqg exists, the caller has
+	 * already checked it.
+	 */
+	blkcg = cgroup_to_blkio_cgroup(cgroup);
+	if (!blkcg_get_use_hierarchy(blkcg)) {
+		cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+				    cfqd->queue->node);
+		if (!cfqg)
+			return NULL;
+
+		init_cfqg(cfqd, blkcg, cfqg);
+		cfqg_set_parent(cfqd, cfqg, NULL);
+		return cfqg;
+	}
+
+	/*
+	 * Allocate the CFQ group chain until we meet the group we'v already
+	 * allocated before, or to the CGroup whose use_hierarchy is not set.
+	 */
+	leaf_cfqg = NULL;
+	child_cfqg = NULL;
+	for (; cgroup != NULL; cgroup = cgroup->parent) {
+		blkcg = cgroup_to_blkio_cgroup(cgroup);
+		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+		if (cfqg) {
+			if (!cfqg->blkg.dev && bdi->dev &&
+			    dev_name(bdi->dev)) {
+				sscanf(dev_name(bdi->dev), "%u:%u",
+				       &major, &minor);
+				cfqg->blkg.dev = MKDEV(major, minor);
+			}
+
+			/*
+			 * Initialization of parent doesn't finish yet, get
+			 * it done.
+			 */
+			if (child_cfqg) {
+				if (blkcg_get_use_hierarchy(blkcg))
+					cfqg_set_parent(cfqd, child_cfqg,
+							cfqg);
+				else
+					cfqg_set_parent(cfqd, child_cfqg,
+							NULL);
+			}
+
+			/* chain has already been built */
+			break;
+		}
+
+		/*
+		 * We only allocate a cfqg that the corresponding cgroup's
+		 * use_hierarchy is set.
+		 */
+		if (blkcg_get_use_hierarchy(blkcg)) {
+			cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
+					    cfqd->queue->node);
+			if (!cfqg)
+				goto clean_up;
+
+			if (!leaf_cfqg)
+				leaf_cfqg = cfqg;
+
+			init_cfqg(cfqd, blkcg, cfqg);
+		} else {
+			cfqg = NULL;
+		}
+
+		if (child_cfqg)
+			cfqg_set_parent(cfqd, child_cfqg, cfqg);
+
+		/*
+		 * This CGroup's use_hierarchy isn't set, this means the CFQ
+		 * group chain has been built.
+		 */
+		if (!blkcg_get_use_hierarchy(blkcg))
+			break;
+
+		child_cfqg = cfqg;
+	}
+
+	return leaf_cfqg;
+
+clean_up:
+	/* clean up the allocated cfq groups. */
+	while (leaf_cfqg) {
+		tmp_cfqg = leaf_cfqg;
+		leaf_cfqg = cfqg_get_parent(leaf_cfqg);
+		uninit_cfqg(cfqd, tmp_cfqg);
+	}
+
+	return NULL;
+}
+
+static struct cfq_group *
+cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct cfq_group *cfqg = NULL;
+	void *key = cfqd;
+	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+	unsigned int major, minor;
+
+	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+		cfqg->blkg.dev = MKDEV(major, minor);
+		goto done;
+	}
+	if (cfqg || !create)
+		goto done;
+
+	/*
+	 * Allocate CFQ group chain to the root group or we meet the CGroup
+	 * with use_hierarchy disabled.
+	 */
+	cfqg = cfqg_chain_alloc(cfqd, cgroup);
 
 done:
 	return cfqg;
@@ -1142,13 +1364,26 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st;
 	int i, j;
+	struct cfq_group *p_cfqg;
 
 	BUG_ON(atomic_read(&cfqg->ref) <= 0);
 	if (!atomic_dec_and_test(&cfqg->ref))
 		return;
+
 	for_each_cfqg_st(cfqg, i, j, st)
 		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
-	kfree(cfqg);
+
+	do {
+		p_cfqg = cfqg_get_parent(cfqg);
+		kfree(cfqg);
+		cfqg = NULL;
+		/*
+		 * Drop the reference taken by children, if nobody references
+		 * parent group, we need delete the parent also.
+		 */
+		if (p_cfqg && atomic_dec_and_test(&p_cfqg->ref))
+			cfqg = p_cfqg;
+	} while (cfqg);
 }
 
 static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
@@ -1356,9 +1591,8 @@ insert:
 	cfqe->service_tree = service_tree;
 
 	/* Add cfqq onto service tree. */
+
 	cfq_entity_service_tree_add(service_tree, cfqe);
-	update_min_vdisktime(service_tree);
-	cfqq->reposition_time = jiffies;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
 	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1801,28 +2035,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 	return cfqq_of_entity(cfq_rb_first(service_tree));
 }
 
-static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
+struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
 {
-	struct cfq_group *cfqg;
-	struct cfq_entity *cfqe;
 	int i, j;
 	struct cfq_rb_root *st;
 
-	if (!cfqd->rq_queued)
-		return NULL;
+	for_each_cfqg_st(cfqg, i, j, st) {
+		if (st->count != 0)
+			return st;
+	}
 
-	cfqg = cfq_get_next_cfqg(cfqd);
-	if (!cfqg)
+	return NULL;
+}
+
+static struct cfq_entity *
+cfq_get_next_entity_forced(struct cfq_data *cfqd)
+{
+	struct cfq_entity *cfqe;
+	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct cfq_group *cfqg;
+
+	if (!cfqd->rq_queued)
 		return NULL;
 
-	for_each_cfqg_st(cfqg, i, j, st) {
+	do {
 		cfqe = cfq_rb_first(st);
-		if (cfqe != NULL)
-			return cfqq_of_entity(cfqe);
-	}
+		if (cfqe && !cfqe->is_group_entity)
+			return cfqe;
+		else if (cfqe && cfqe->is_group_entity)
+			cfqg = cfqg_of_entity(cfqe);
+
+		st = choose_service_tree_forced(cfqg);
+	} while (st);
+
 	return NULL;
 }
 
+
 /*
  * Get and set a new active queue for service.
  */
@@ -2178,7 +2427,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 				struct cfq_group *cfqg, enum wl_prio_t prio)
 {
 	struct cfq_entity *cfqe;
-	struct cfq_queue *cfqq;
 	unsigned long lowest_start_time;
 	int i;
 	bool time_valid = false;
@@ -2190,11 +2438,10 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 	 */
 	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
 		cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
-		cfqq = cfqq_of_entity(cfqe);
 		if (cfqe && (!time_valid ||
-			     time_before(cfqq->reposition_time,
+			     time_before(cfqe->reposition_time,
 					 lowest_start_time))) {
-			lowest_start_time = cfqq->reposition_time;
+			lowest_start_time = cfqe->reposition_time;
 			cur_best = i;
 			time_valid = true;
 		}
@@ -2203,46 +2450,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 	return cur_best;
 }
 
-static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
+static void set_workload_expire(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	unsigned slice;
 	unsigned count;
 	struct cfq_rb_root *st;
 	unsigned group_slice;
-	enum wl_prio_t original_prio = cfqd->serving_prio;
-
-	/* Choose next priority. RT > BE > IDLE */
-	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
-		cfqd->serving_prio = RT_WORKLOAD;
-	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
-		cfqd->serving_prio = BE_WORKLOAD;
-	else {
-		cfqd->serving_prio = IDLE_WORKLOAD;
-		cfqd->workload_expires = jiffies + 1;
-		return;
-	}
-
-	if (original_prio != cfqd->serving_prio)
-		goto new_workload;
-
-	/*
-	 * For RT and BE, we have to choose also the type
-	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
-	 * expiration time
-	 */
-	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
-	count = st->count;
-
-	/*
-	 * check workload expiration, and that we still have other queues ready
-	 */
-	if (count && !time_after(jiffies, cfqd->workload_expires))
-		return;
 
-new_workload:
-	/* otherwise select new workload type */
-	cfqd->serving_type =
-		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
 	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
 	count = st->count;
 
@@ -2281,38 +2495,63 @@ new_workload:
 	slice = max_t(unsigned, slice, CFQ_MIN_TT);
 	cfq_log(cfqd, "workload slice:%d", slice);
 	cfqd->workload_expires = jiffies + slice;
+	/* Restore the previous saved slice. */
+	if (cfqg->saved_workload_slice)
+		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
 }
 
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
+static struct cfq_rb_root *choose_service_tree(struct cfq_data *cfqd,
+					       struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-	struct cfq_group *cfqg;
-	struct cfq_entity *cfqe;
-
-	if (RB_EMPTY_ROOT(&st->rb))
+	if (!cfqg) {
+		cfqd->serving_prio = IDLE_WORKLOAD;
+		cfqd->workload_expires = jiffies + 1;
 		return NULL;
-	cfqe = cfq_rb_first_entity(st);
-	cfqg = cfqg_of_entity(cfqe);
-	BUG_ON(!cfqg);
-	update_min_vdisktime(st);
-	return cfqg;
+	}
+
+	/* Choose next priority. RT > BE > IDLE */
+	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
+		cfqd->serving_prio = RT_WORKLOAD;
+	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
+		cfqd->serving_prio = BE_WORKLOAD;
+	else {
+		cfqd->serving_prio = IDLE_WORKLOAD;
+		cfqd->workload_expires = jiffies + 1;
+		return service_tree_for(cfqg, cfqd->serving_prio,
+					cfqd->serving_type);
+	}
+
+	/* otherwise select new workload type */
+	cfqd->serving_type =
+		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
+
+	return service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
 }
 
-static void cfq_choose_cfqg(struct cfq_data *cfqd)
+static struct cfq_entity *cfq_select_entity(struct cfq_data *cfqd)
 {
-	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
+	struct cfq_rb_root *st = &cfqd->grp_service_tree;
+	struct cfq_entity *cfqe;
+	struct cfq_group *cfqg = NULL;
 
-	cfqd->serving_group = cfqg;
+	if (!cfqd->rq_queued)
+		return NULL;
 
-	/* Restore the workload type data */
-	if (cfqg->saved_workload_slice) {
-		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
-		cfqd->serving_type = cfqg->saved_workload;
-		cfqd->serving_prio = cfqg->saved_serving_prio;
-	} else
-		cfqd->workload_expires = jiffies - 1;
+	do {
+		cfqe = cfq_rb_first(st);
+		if (!cfqe->is_group_entity) {
+			set_workload_expire(cfqd, cfqg);
+			cfqd->serving_group = cfqg;
+			return cfqe;
+		} else {
+			cfqg = cfqg_of_entity(cfqe);
+			st = choose_service_tree(cfqd, cfqg);
+			if (!st || RB_EMPTY_ROOT(&st->rb))
+				return NULL;
+		}
+	} while (st);
 
-	choose_service_tree(cfqd, cfqg);
+	return NULL;
 }
 
 /*
@@ -2322,6 +2561,7 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
+	struct cfq_entity *entity;
 
 	cfqq = cfqd->active_queue;
 	if (!cfqq)
@@ -2421,9 +2661,9 @@ new_queue:
 	 * Current queue expired. Check if we have to switch to a new
 	 * service tree
 	 */
-	if (!new_cfqq)
-		cfq_choose_cfqg(cfqd);
-
+	entity = cfq_select_entity(cfqd);
+	BUG_ON(entity->is_group_entity);
+	new_cfqq = cfqq_of_entity(entity);
 	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
 keep_queue:
 	return cfqq;
@@ -2453,10 +2693,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq;
 	int dispatched = 0;
+	struct cfq_entity *cfqe;
 
 	/* Expire the timeslice of the current active queue first */
 	cfq_slice_expired(cfqd, 0);
-	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
+
+	while ((cfqe = cfq_get_next_entity_forced(cfqd)) != NULL) {
+		BUG_ON(cfqe->is_group_entity);
+		cfqq = cfqq_of_entity(cfqe);
 		__cfq_set_active_queue(cfqd, cfqq);
 		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
 	}
@@ -2464,6 +2708,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 	BUG_ON(cfqd->busy_queues);
 
 	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
+
 	return dispatched;
 }
 
@@ -4008,9 +4253,6 @@ static void *cfq_init_queue(struct request_queue *q)
 
 	cfqd->cic_index = i;
 
-	/* Init root service tree */
-	cfqd->grp_service_tree = CFQ_RB_ROOT;
-
 	/* Init root group */
 	cfqg = &cfqd->root_group;
 	for_each_cfqg_st(cfqg, i, j, st)
@@ -4020,6 +4262,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	/* Give preference to root group over other groups */
 	cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
 	cfqg->cfqe.is_group_entity = true;
+	cfqg_set_parent(cfqd, cfqg, NULL);
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	/*
@@ -4072,6 +4315,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->cfq_latency = 1;
 	cfqd->cfq_group_isolation = 0;
 	cfqd->hw_tag = -1;
+
 	/*
 	 * we optimistically start assuming sync ops weren't delayed in last
 	 * second, in order to have larger depth for async operations.
-- 
1.6.5.2

next prev parent reply	other threads:[~2010-12-27  8:51 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <4D180ECE.4000305@cn.fujitsu.com>
2010-12-27  8:50 ` [PATCH 1/6 v3] cfq-iosched: Introduce cfq_entity for CFQ queue Gui Jianfeng
2010-12-27  8:50 ` [PATCH 2/6 v3] cfq-iosched: Introduce cfq_entity for CFQ group Gui Jianfeng
2011-01-18 21:20   ` Vivek Goyal
2011-01-19  1:25     ` Gui Jianfeng
2011-01-23  2:15     ` Gui Jianfeng
2010-12-27  8:51 ` [PATCH 3/6 v3] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
2010-12-28  2:50   ` Shaohua Li
2010-12-28  3:59     ` Gui Jianfeng
2010-12-28  6:03       ` Shaohua Li
2010-12-28  6:59         ` Gui Jianfeng
2011-01-19 22:58   ` Vivek Goyal
2011-01-20  3:58     ` Gui Jianfeng
2011-01-20 11:09       ` Vivek Goyal
2011-01-24  4:45         ` Gui Jianfeng
2011-01-24 18:51           ` Vivek Goyal
2010-12-27  8:51 ` [PATCH 4/6 v3] cfq-iosched: Extract some common code of service tree handling for CFQ queue and CFQ group Gui Jianfeng
2010-12-27  8:51 ` Gui Jianfeng [this message]
2011-01-24 22:52   ` [PATCH 5/6 v3] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface Vivek Goyal
2010-12-27  8:51 ` [PATCH 6/6 v3] blkio-cgroup: Document for blkio.use_hierarchy interface Gui Jianfeng
2011-01-18 20:27   ` Vivek Goyal
2011-01-19  1:20     ` Gui Jianfeng

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:455768a dfblob:c55fecd dfblob:ea4861b dfblob:5b4b351
dfblob:1f21418 dfblob:727d6e6 )
 OR (
bs:"[PATCH 5/6 v3] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4D185382.8080601@cn.fujitsu.com \
    --to=guijianfeng@cn.fujitsu.com \
    --cc=axboe@kernel.dk \
    --cc=ctalbott@google.com \
    --cc=czoccolo@gmail.com \
    --cc=dpshah@google.com \
    --cc=jmoyer@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=nauman@google.com \
    --cc=shaohua.li@intel.com \
    --cc=vgoyal@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.