* [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support
@ 2010-08-30 6:50 Gui Jianfeng
2010-08-30 18:20 ` Chad Talbott
2010-08-30 20:36 ` Vivek Goyal
0 siblings, 2 replies; 19+ messages in thread
From: Gui Jianfeng @ 2010-08-30 6:50 UTC (permalink / raw)
To: Vivek Goyal, Jens Axboe
Cc: Jeff Moyer, Divyesh Shah, Corrado Zoccolo, Nauman Rafique,
Gui Jianfeng, linux kernel mailing list
Hi All,
This patch enables cfq group hierarchical scheduling.
With this patch, you can create a cgroup directory deeper than level 1.
Now, I/O bandwidth is distributed in a hierarchical way. For example:
We create cgroup directories as follows (the numbers represent weights):
Root grp
/ \
grp_1(100) grp_2(400)
/ \
grp_3(200) grp_4(300)
If grp_2, grp_3 and grp_4 are contending for I/O bandwidth,
grp_2 will share 80% of total bandwidth.
For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%)
Design:
o Each cfq group has its own group service tree.
o Each cfq group contains a "group schedule entity" (gse) that
schedules on parent cfq group's service tree.
o Each cfq group contains a "queue schedule entity" (qse), which
represents all cfqqs located on this cfq group. It schedules
on this group's service tree. For the time being, root group
qse's weight is 1000, and subgroup qse's weight is 500.
o All gses and the qse which belong to the same cfq group are scheduled
on the same group service tree.
o cfq groups are allocated in a recursive manner; that means when a cfq
group needs to be allocated, the upper-level cfq groups are also
allocated.
o When a cfq group is served, not only is that cfq group charged but
also its ancestors.
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
block/blk-cgroup.c | 4 -
block/cfq-iosched.c | 477 +++++++++++++++++++++++++++++++++++++-------------
2 files changed, 353 insertions(+), 128 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2fef1ef..aa06cb3 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -964,10 +964,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
goto done;
}
- /* Currently we do not support hierarchy deeper than two level (0,1) */
- if (parent != cgroup->top_cgroup)
- return ERR_PTR(-EPERM);
-
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
if (!blkcg)
return ERR_PTR(-ENOMEM);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f65c6f0..4660d89 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida);
#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
#define sample_valid(samples) ((samples) > 80)
-#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
+#define rb_entry_se(node) \
+ rb_entry((node), struct io_sched_entity, rb_node)
/*
* Most of our rbtree usage is for sorting with min extraction, so
@@ -171,19 +172,40 @@ enum wl_type_t {
SYNC_WORKLOAD = 2
};
-/* This is per cgroup per device grouping structure */
-struct cfq_group {
+/*
+ * This's the schedule entity which is scheduled on group service tree.
+ * It represents shedule structure of a cfq group, or a bundle of cfq
+ * queues that locate in a cfq group.
+ */
+struct io_sched_entity {
+ struct cfq_rb_root *st;
+
/* group service_tree member */
struct rb_node rb_node;
-
- /* group service_tree key */
u64 vdisktime;
unsigned int weight;
bool on_st;
+ bool is_group_se;
+ struct io_sched_entity *parent;
+};
+
+/* This is per cgroup per device grouping structure */
+struct cfq_group {
+ /* cfq group sched entity */
+ struct io_sched_entity group_se;
+
+ /* sched entity for cfqqs */
+ struct io_sched_entity queue_se;
+
+ /* Service tree for cfq_groups and cfqqs set*/
+ struct cfq_rb_root grp_service_tree;
/* number of cfqq currently on this group */
int nr_cfqq;
+ /* number of sub cfq groups */
+ int nr_subgp;
+
/* Per group busy queus average. Useful for workload slice calc. */
unsigned int busy_queues_avg[2];
/*
@@ -210,8 +232,6 @@ struct cfq_group {
*/
struct cfq_data {
struct request_queue *queue;
- /* Root service tree for cfq_groups */
- struct cfq_rb_root grp_service_tree;
struct cfq_group root_group;
/*
@@ -399,6 +419,22 @@ static inline bool iops_mode(struct cfq_data *cfqd)
return false;
}
+static inline struct cfq_group *cfqg_of_gse(struct io_sched_entity *se)
+{
+ if (se->is_group_se)
+ return container_of(se, struct cfq_group, group_se);
+ else
+ return NULL;
+}
+
+static inline struct cfq_group *cfqg_of_qse(struct io_sched_entity *se)
+{
+ if (!se->is_group_se)
+ return container_of(se, struct cfq_group, queue_se);
+ else
+ return NULL;
+}
+
static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
{
if (cfq_class_idle(cfqq))
@@ -522,12 +558,13 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}
-static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
+static inline u64 cfq_scale_slice(unsigned long delta,
+ struct io_sched_entity *se)
{
u64 d = delta << CFQ_SERVICE_SHIFT;
d = d * BLKIO_WEIGHT_DEFAULT;
- do_div(d, cfqg->weight);
+ do_div(d, se->weight);
return d;
}
@@ -552,16 +589,16 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
static void update_min_vdisktime(struct cfq_rb_root *st)
{
u64 vdisktime = st->min_vdisktime;
- struct cfq_group *cfqg;
+ struct io_sched_entity *se;
if (st->active) {
- cfqg = rb_entry_cfqg(st->active);
- vdisktime = cfqg->vdisktime;
+ se = rb_entry_se(st->active);
+ vdisktime = se->vdisktime;
}
if (st->left) {
- cfqg = rb_entry_cfqg(st->left);
- vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
+ se = rb_entry_se(st->left);
+ vdisktime = min_vdisktime(vdisktime, se->vdisktime);
}
st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
@@ -591,9 +628,10 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
static inline unsigned
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
+ struct io_sched_entity *gse = &cfqg->queue_se;
+ struct cfq_rb_root *st = gse->st;
- return cfq_target_latency * cfqg->weight / st->total_weight;
+ return cfq_target_latency * gse->weight / st->total_weight;
}
static inline void
@@ -756,13 +794,13 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
return NULL;
}
-static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
+static struct io_sched_entity *cfq_rb_first_se(struct cfq_rb_root *root)
{
if (!root->left)
root->left = rb_first(&root->rb);
if (root->left)
- return rb_entry_cfqg(root->left);
+ return rb_entry_se(root->left);
return NULL;
}
@@ -819,25 +857,25 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
}
static inline s64
-cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
+se_key(struct cfq_rb_root *st, struct io_sched_entity *se)
{
- return cfqg->vdisktime - st->min_vdisktime;
+ return se->vdisktime - st->min_vdisktime;
}
static void
-__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
+__io_sched_entity_add(struct cfq_rb_root *st, struct io_sched_entity *se)
{
struct rb_node **node = &st->rb.rb_node;
struct rb_node *parent = NULL;
- struct cfq_group *__cfqg;
- s64 key = cfqg_key(st, cfqg);
+ struct io_sched_entity *__se;
+ s64 key = se_key(st, se);
int left = 1;
while (*node != NULL) {
parent = *node;
- __cfqg = rb_entry_cfqg(parent);
+ __se = rb_entry_se(parent);
- if (key < cfqg_key(st, __cfqg))
+ if (key < se_key(st, __se))
node = &parent->rb_left;
else {
node = &parent->rb_right;
@@ -846,47 +884,80 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
}
if (left)
- st->left = &cfqg->rb_node;
+ st->left = &se->rb_node;
- rb_link_node(&cfqg->rb_node, parent, node);
- rb_insert_color(&cfqg->rb_node, &st->rb);
+ rb_link_node(&se->rb_node, parent, node);
+ rb_insert_color(&se->rb_node, &st->rb);
}
-static void
-cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
+static void io_sched_entity_add(struct cfq_rb_root *st,
+ struct io_sched_entity *se)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
- struct cfq_group *__cfqg;
struct rb_node *n;
+ struct io_sched_entity *__se;
- cfqg->nr_cfqq++;
- if (cfqg->on_st)
- return;
+ if (se->on_st)
+ return;
/*
* Currently put the group at the end. Later implement something
* so that groups get lesser vtime based on their weights, so that
* if group does not loose all if it was not continously backlogged.
*/
- n = rb_last(&st->rb);
+ n = rb_last(&se->st->rb);
if (n) {
- __cfqg = rb_entry_cfqg(n);
- cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
+ __se = rb_entry_se(n);
+ se->vdisktime = __se->vdisktime + CFQ_IDLE_DELAY;
} else
- cfqg->vdisktime = st->min_vdisktime;
+ se->vdisktime = st->min_vdisktime;
- __cfq_group_service_tree_add(st, cfqg);
- cfqg->on_st = true;
- st->total_weight += cfqg->weight;
+ __io_sched_entity_add(se->st, se);
+ se->on_st = true;
+ st->total_weight += se->weight;
+}
+
+static void
+cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+ struct io_sched_entity *gse = &cfqg->group_se;
+ struct io_sched_entity *qse = &cfqg->queue_se;
+ struct cfq_group *__cfqg;
+
+ cfqg->nr_cfqq++;
+
+ io_sched_entity_add(qse->st, qse);
+
+ while (gse && gse->parent) {
+ if (gse->on_st)
+ return;
+ io_sched_entity_add(gse->st, gse);
+ gse = gse->parent;
+ __cfqg = cfqg_of_gse(gse);
+ __cfqg->nr_subgp++;
+ }
+}
+
+static void io_sched_entity_del(struct io_sched_entity *se)
+{
+ if (!RB_EMPTY_NODE(&se->rb_node))
+ cfq_rb_erase(&se->rb_node, se->st);
+
+ se->on_st = false;
+ se->st->total_weight -= se->weight;
}
static void
cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
+ struct io_sched_entity *gse = &cfqg->group_se;
+ struct io_sched_entity *qse = &cfqg->queue_se;
+ struct cfq_group *__cfqg, *p_cfqg;
+
+ if (gse->st && gse->st->active == &gse->rb_node)
+ gse->st->active = NULL;
- if (st->active == &cfqg->rb_node)
- st->active = NULL;
+ if (qse->st && qse->st->active == &qse->rb_node)
+ qse->st->active = NULL;
BUG_ON(cfqg->nr_cfqq < 1);
cfqg->nr_cfqq--;
@@ -895,13 +966,25 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
if (cfqg->nr_cfqq)
return;
- cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
- cfqg->on_st = false;
- st->total_weight -= cfqg->weight;
- if (!RB_EMPTY_NODE(&cfqg->rb_node))
- cfq_rb_erase(&cfqg->rb_node, st);
- cfqg->saved_workload_slice = 0;
- cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+ /* dequeue queue se from group */
+ io_sched_entity_del(qse);
+
+ if (cfqg->nr_subgp)
+ return;
+
+ /* prevent from dequeuing root group */
+ while (gse && gse->parent) {
+ __cfqg = cfqg_of_gse(gse);
+ p_cfqg = cfqg_of_gse(gse->parent);
+ io_sched_entity_del(gse);
+ cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
+ cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
+ __cfqg->saved_workload_slice = 0;
+ gse = gse->parent;
+ p_cfqg->nr_subgp--;
+ if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+ return;
+ }
}
static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -933,7 +1016,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
struct cfq_queue *cfqq)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
+ struct io_sched_entity *gse = &cfqg->group_se;
+ struct io_sched_entity *qse = &cfqg->queue_se;
unsigned int used_sl, charge;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count;
@@ -946,10 +1030,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
charge = cfqq->allocated_slice;
- /* Can't update vdisktime while group is on service tree */
- cfq_rb_erase(&cfqg->rb_node, st);
- cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
- __cfq_group_service_tree_add(st, cfqg);
+ /*
+ * update queue se's vdisktime.
+ * Can't update vdisktime while group is on service tree.
+ */
+
+ cfq_rb_erase(&qse->rb_node, qse->st);
+ qse->vdisktime += cfq_scale_slice(charge, qse);
+ __io_sched_entity_add(qse->st, qse);
+ if (&qse->rb_node == qse->st->active)
+ qse->st->active = NULL;
+
+ while (gse && gse->parent) {
+ cfq_rb_erase(&gse->rb_node, gse->st);
+ gse->vdisktime += cfq_scale_slice(charge, gse);
+ __io_sched_entity_add(gse->st, gse);
+ if (&gse->rb_node == gse->st->active)
+ gse->st->active = NULL;
+ gse = gse->parent;
+ }
/* This group is being expired. Save the context */
if (time_after(cfqd->workload_expires, jiffies)) {
@@ -960,8 +1059,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
} else
cfqg->saved_workload_slice = 0;
- cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
- st->min_vdisktime);
+ cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", gse->vdisktime,
+ gse->st->min_vdisktime);
+
cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
" sect=%u", used_sl, cfqq->slice_dispatch, charge,
iops_mode(cfqd), cfqq->nr_sectors);
@@ -977,39 +1077,84 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
return NULL;
}
+static void cfq_put_cfqg(struct cfq_group *cfqg)
+{
+ struct cfq_rb_root *st;
+ int i, j;
+ struct io_sched_entity *gse;
+ struct cfq_group *p_cfqg;
+
+ BUG_ON(atomic_read(&cfqg->ref) <= 0);
+ if (!atomic_dec_and_test(&cfqg->ref))
+ return;
+ for_each_cfqg_st(cfqg, i, j, st)
+ BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
+
+ gse = &cfqg->group_se;
+ if (gse->parent) {
+ p_cfqg = cfqg_of_gse(gse->parent);
+ /* Drop the reference taken by children */
+ atomic_dec(&p_cfqg->ref);
+ }
+
+ kfree(cfqg);
+}
+
+static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+ /* Something wrong if we are trying to remove same group twice */
+ BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
+
+ hlist_del_init(&cfqg->cfqd_node);
+
+ /*
+ * Put the reference taken at the time of creation so that when all
+ * queues are gone, group can be destroyed.
+ */
+ cfq_put_cfqg(cfqg);
+}
+
void
cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
{
- cfqg_of_blkg(blkg)->weight = weight;
+ struct cfq_group *cfqg;
+ struct io_sched_entity *gse;
+
+ cfqg = cfqg_of_blkg(blkg);
+ gse = &cfqg->group_se;
+ gse->weight = weight;
}
-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void init_group_queue_entity(struct blkio_cgroup *blkcg,
+ struct cfq_group *cfqg)
+{
+ struct io_sched_entity *gse = &cfqg->group_se;
+ struct io_sched_entity *qse = &cfqg->queue_se;
+
+ gse->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+ RB_CLEAR_NODE(&gse->rb_node);
+ gse->is_group_se = true;
+ gse->on_st = false;
+
+ /* Set to 500 for the time being */
+ qse->weight = 500;
+ RB_CLEAR_NODE(&qse->rb_node);
+ qse->is_group_se = false;
+ qse->on_st = false;
+}
+
+static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
+ struct cfq_group *cfqg)
{
- struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
- struct cfq_group *cfqg = NULL;
- void *key = cfqd;
int i, j;
struct cfq_rb_root *st;
- struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
unsigned int major, minor;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
- cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
- if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfqg->blkg.dev = MKDEV(major, minor);
- goto done;
- }
- if (cfqg || !create)
- goto done;
-
- cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
- if (!cfqg)
- goto done;
+ cfqg->grp_service_tree = CFQ_RB_ROOT;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
- RB_CLEAR_NODE(&cfqg->rb_node);
/*
* Take the initial reference that will be released on destroy
@@ -1023,11 +1168,102 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
MKDEV(major, minor));
- cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
-
+ init_group_queue_entity(blkcg, cfqg);
/* Add group on cfqd list */
hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+ if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
+ cfq_destroy_cfqg(cfqd, cfqg);
+}
+
+static void cfqg_set_parent(struct cfq_group *cfqg, struct cfq_group *p_cfqg)
+{
+ struct io_sched_entity *gse = &cfqg->group_se;
+ struct io_sched_entity *qse = &cfqg->queue_se;
+ struct io_sched_entity *p_gse = &p_cfqg->group_se;
+
+ gse->parent = p_gse;
+ gse->st = &p_cfqg->grp_service_tree;
+
+ qse->parent = gse;
+ qse->st = &cfqg->grp_service_tree;
+
+ /* child reference */
+ atomic_inc(&p_cfqg->ref);
+}
+
+int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
+{
+ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+ struct blkio_cgroup *p_blkcg;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+ struct cfq_group *cfqg, *p_cfqg;
+ void *key = cfqd;
+ int ret;
+
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg) {
+ if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ }
+ /* chain has already been built */
+ return 0;
+ }
+
+ cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
+ if (!cfqg)
+ return -1;
+
+ init_cfqg(cfqd, blkcg, cfqg);
+
+ /* Already to the top group */
+ if (!cgroup->parent)
+ return 0;
+
+ ret = cfqg_chain_alloc(cfqd, cgroup->parent);
+ if (ret == -1) {
+ uninit_cfqg(cfqd, cfqg);
+ return -1;
+ }
+
+ p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
+ p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
+ BUG_ON(p_cfqg == NULL);
+
+ cfqg_set_parent(cfqg, p_cfqg);
+ return 0;
+}
+
+static struct cfq_group *
+cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+{
+ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+ struct cfq_group *cfqg = NULL;
+ void *key = cfqd;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+ int ret;
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ goto done;
+ }
+ if (cfqg || !create)
+ goto done;
+
+ ret = cfqg_chain_alloc(cfqd, cgroup);
+ if (!ret) {
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ BUG_ON(cfqg == NULL);
+ goto done;
+ }
done:
return cfqg;
}
@@ -1067,33 +1303,6 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
atomic_inc(&cfqq->cfqg->ref);
}
-static void cfq_put_cfqg(struct cfq_group *cfqg)
-{
- struct cfq_rb_root *st;
- int i, j;
-
- BUG_ON(atomic_read(&cfqg->ref) <= 0);
- if (!atomic_dec_and_test(&cfqg->ref))
- return;
- for_each_cfqg_st(cfqg, i, j, st)
- BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
- kfree(cfqg);
-}
-
-static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
-{
- /* Something wrong if we are trying to remove same group twice */
- BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
-
- hlist_del_init(&cfqg->cfqd_node);
-
- /*
- * Put the reference taken at the time of creation so that when all
- * queues are gone, group can be destroyed.
- */
- cfq_put_cfqg(cfqg);
-}
-
static void cfq_release_cfq_groups(struct cfq_data *cfqd)
{
struct hlist_node *pos, *n;
@@ -1668,9 +1877,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
if (cfqq == cfqd->active_queue)
cfqd->active_queue = NULL;
- if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
- cfqd->grp_service_tree.active = NULL;
-
if (cfqd->active_cic) {
put_io_context(cfqd->active_cic->ioc);
cfqd->active_cic = NULL;
@@ -2173,17 +2379,26 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
cfqd->noidle_tree_requires_idle = false;
}
+
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
+ struct cfq_group *root_group = &cfqd->root_group;
+ struct cfq_rb_root *st = &root_group->grp_service_tree;
struct cfq_group *cfqg;
+ struct io_sched_entity *se;
- if (RB_EMPTY_ROOT(&st->rb))
- return NULL;
- cfqg = cfq_rb_first_group(st);
- st->active = &cfqg->rb_node;
- update_min_vdisktime(st);
- return cfqg;
+ do {
+ se = cfq_rb_first_se(st);
+ if (!se)
+ return NULL;
+ st->active = &se->rb_node;
+ update_min_vdisktime(st);
+ cfqg = cfqg_of_qse(se);
+ if (cfqg)
+ return cfqg;
+ cfqg = cfqg_of_gse(se);
+ st = &cfqg->grp_service_tree;
+ } while (1);
}
static void cfq_choose_cfqg(struct cfq_data *cfqd)
@@ -2215,15 +2430,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
if (!cfqq)
goto new_queue;
+
if (!cfqd->rq_queued)
return NULL;
+
/*
* We were waiting for group to get backlogged. Expire the queue
*/
if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
goto expire;
+
/*
* The active queue has run out of time, expire it and select new.
*/
@@ -2245,6 +2463,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
goto check_group_idle;
}
+
/*
* The active queue has requests and isn't expired, allow it to
* dispatch.
@@ -2252,6 +2471,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
if (!RB_EMPTY_ROOT(&cfqq->sort_list))
goto keep_queue;
+
/*
* If another queue has a request waiting within our mean seek
* distance, let it run. The expire code will check for close
@@ -2505,6 +2725,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
}
cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
+
return 1;
}
@@ -3854,17 +4075,25 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->cic_index = i;
- /* Init root service tree */
- cfqd->grp_service_tree = CFQ_RB_ROOT;
-
/* Init root group */
cfqg = &cfqd->root_group;
+ cfqg->grp_service_tree = CFQ_RB_ROOT;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
- RB_CLEAR_NODE(&cfqg->rb_node);
-
+ cfqg->group_se.is_group_se = true;
+ RB_CLEAR_NODE(&cfqg->group_se.rb_node);
+ cfqg->group_se.on_st = false;
/* Give preference to root group over other groups */
- cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
+ cfqg->group_se.weight = 2*BLKIO_WEIGHT_DEFAULT;
+ cfqg->group_se.parent = NULL;
+ cfqg->group_se.st = NULL;
+
+ cfqg->queue_se.is_group_se = false;
+ RB_CLEAR_NODE(&cfqg->queue_se.rb_node);
+ cfqg->queue_se.on_st = false;
+ cfqg->queue_se.weight = 2*BLKIO_WEIGHT_DEFAULT;
+ cfqg->queue_se.parent = &cfqg->group_se;
+ cfqg->queue_se.st = &cfqg->grp_service_tree;
#ifdef CONFIG_CFQ_GROUP_IOSCHED
/*
--
1.6.5.2
^ permalink raw reply related [flat|nested] 19+ messages in thread* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-08-30 6:50 [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support Gui Jianfeng @ 2010-08-30 18:20 ` Chad Talbott 2010-08-31 0:35 ` Gui Jianfeng 2010-08-30 20:36 ` Vivek Goyal 1 sibling, 1 reply; 19+ messages in thread From: Chad Talbott @ 2010-08-30 18:20 UTC (permalink / raw) To: Gui Jianfeng Cc: Vivek Goyal, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, Nauman Rafique, linux kernel mailing list On Sun, Aug 29, 2010 at 11:50 PM, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote: > o Each cfq group has its own group service tree. > o Each cfq group contains a "group schedule entity" (gse) that > schedules on parent cfq group's service tree. > o Each cfq group contains a "queue schedule entity"(qse), it > represents all cfqqs located on this cfq group. It schedules > on this group's service tree. For the time being, root group > qse's weight is 1000, and subgroup qse's weight is 500. > o All gses and qse which belones to a same cfq group schedules > on the same group service tree. Overall, I like this approach. It's similar to what we are using with earlier hierarchical patches. Can you improve the naming a little bit? "qse" and "gse" look almost identical and breaks my brain. Maybe "queue_entity" and "group_entity"? If entity is to much to type, then "item"? Chad ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-08-30 18:20 ` Chad Talbott @ 2010-08-31 0:35 ` Gui Jianfeng 0 siblings, 0 replies; 19+ messages in thread From: Gui Jianfeng @ 2010-08-31 0:35 UTC (permalink / raw) To: Chad Talbott Cc: Vivek Goyal, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, Nauman Rafique, linux kernel mailing list Chad Talbott wrote: > On Sun, Aug 29, 2010 at 11:50 PM, Gui Jianfeng > <guijianfeng@cn.fujitsu.com> wrote: >> o Each cfq group has its own group service tree. >> o Each cfq group contains a "group schedule entity" (gse) that >> schedules on parent cfq group's service tree. >> o Each cfq group contains a "queue schedule entity"(qse), it >> represents all cfqqs located on this cfq group. It schedules >> on this group's service tree. For the time being, root group >> qse's weight is 1000, and subgroup qse's weight is 500. >> o All gses and qse which belones to a same cfq group schedules >> on the same group service tree. > > Overall, I like this approach. It's similar to what we are using with > earlier hierarchical patches. Can you improve the naming a little > bit? "qse" and "gse" look almost identical and breaks my brain. > Maybe "queue_entity" and "group_entity"? If entity is to much to > type, then "item"? Thanks for reviewing Chad. I will modify the name. Thanks, Gui > > Chad > > ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-08-30 6:50 [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support Gui Jianfeng 2010-08-30 18:20 ` Chad Talbott @ 2010-08-30 20:36 ` Vivek Goyal 2010-08-31 0:29 ` Gui Jianfeng 1 sibling, 1 reply; 19+ messages in thread From: Vivek Goyal @ 2010-08-30 20:36 UTC (permalink / raw) To: Gui Jianfeng Cc: Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, Nauman Rafique, linux kernel mailing list On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: > Hi All, > > This patch enables cfq group hierarchical scheduling. > > With this patch, you can create a cgroup directory deeper than level 1. > Now, I/O Bandwidth is distributed in a hierarchy way. For example: > We create cgroup directories as following(the number represents weight): > > Root grp > / \ > grp_1(100) grp_2(400) > / \ > grp_3(200) grp_4(300) > > If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, > grp_2 will share 80% of total bandwidth. > For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) > > Design: > o Each cfq group has its own group service tree. > o Each cfq group contains a "group schedule entity" (gse) that > schedules on parent cfq group's service tree. > o Each cfq group contains a "queue schedule entity"(qse), it > represents all cfqqs located on this cfq group. It schedules > on this group's service tree. For the time being, root group > qse's weight is 1000, and subgroup qse's weight is 500. > o All gses and qse which belones to a same cfq group schedules > on the same group service tree. Hi Gui, Thanks for the patch. I have few questions. - So how does the hierarchy look like, w.r.t root group. Something as follows? root / | \ q1 q2 G1 Assume there are two processes doin IO in root group and q1 and q2 are cfqq queues for those processes and G1 is the cgroup created by user. If yes, then what algorithm do you use to do scheduling between q1, q2 and G1? 
IOW, currently we have two algorithms operating in CFQ. One for cfqq and other for groups. Group algorithm does not use the logic of cfq_slice_offset(). The reason being they are different because CFQ wanted to provide ioprio service differentiation on SSD where slice idle is effectively zero. But that slice_offset() logic is approximate at best and is not suitable for group scheduling. I have yet to go through the patches in detail, so thought will ask you that how have you handeled above. Vivek > o cfq group allocates in a recursive manner, that means when a cfq > group needs to be allocated, the upper level cfq groups are also > allocated. > o When a cfq group served, not only charge this cfq group but also > charge its ancestors. > > Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com> > --- > block/blk-cgroup.c | 4 - > block/cfq-iosched.c | 477 +++++++++++++++++++++++++++++++++++++------------- > 2 files changed, 353 insertions(+), 128 deletions(-) > > diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c > index 2fef1ef..aa06cb3 100644 > --- a/block/blk-cgroup.c > +++ b/block/blk-cgroup.c > @@ -964,10 +964,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) > goto done; > } > > - /* Currently we do not support hierarchy deeper than two level (0,1) */ > - if (parent != cgroup->top_cgroup) > - return ERR_PTR(-EPERM); > - > blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); > if (!blkcg) > return ERR_PTR(-ENOMEM); > diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c > index f65c6f0..4660d89 100644 > --- a/block/cfq-iosched.c > +++ b/block/cfq-iosched.c > @@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida); > #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) > > #define sample_valid(samples) ((samples) > 80) > -#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) > +#define rb_entry_se(node) \ > + rb_entry((node), struct io_sched_entity, rb_node) > > /* > * Most of our rbtree usage is for sorting with min 
extraction, so > @@ -171,19 +172,40 @@ enum wl_type_t { > SYNC_WORKLOAD = 2 > }; > > -/* This is per cgroup per device grouping structure */ > -struct cfq_group { > +/* > + * This's the schedule entity which is scheduled on group service tree. > + * It represents shedule structure of a cfq group, or a bundle of cfq > + * queues that locate in a cfq group. > + */ > +struct io_sched_entity { > + struct cfq_rb_root *st; > + > /* group service_tree member */ > struct rb_node rb_node; > - > - /* group service_tree key */ > u64 vdisktime; > unsigned int weight; > bool on_st; > + bool is_group_se; > + struct io_sched_entity *parent; > +}; > + > +/* This is per cgroup per device grouping structure */ > +struct cfq_group { > + /* cfq group sched entity */ > + struct io_sched_entity group_se; > + > + /* sched entity for cfqqs */ > + struct io_sched_entity queue_se; > + > + /* Service tree for cfq_groups and cfqqs set*/ > + struct cfq_rb_root grp_service_tree; > > /* number of cfqq currently on this group */ > int nr_cfqq; > > + /* number of sub cfq groups */ > + int nr_subgp; > + > /* Per group busy queus average. Useful for workload slice calc. 
*/ > unsigned int busy_queues_avg[2]; > /* > @@ -210,8 +232,6 @@ struct cfq_group { > */ > struct cfq_data { > struct request_queue *queue; > - /* Root service tree for cfq_groups */ > - struct cfq_rb_root grp_service_tree; > struct cfq_group root_group; > > /* > @@ -399,6 +419,22 @@ static inline bool iops_mode(struct cfq_data *cfqd) > return false; > } > > +static inline struct cfq_group *cfqg_of_gse(struct io_sched_entity *se) > +{ > + if (se->is_group_se) > + return container_of(se, struct cfq_group, group_se); > + else > + return NULL; > +} > + > +static inline struct cfq_group *cfqg_of_qse(struct io_sched_entity *se) > +{ > + if (!se->is_group_se) > + return container_of(se, struct cfq_group, queue_se); > + else > + return NULL; > +} > + > static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) > { > if (cfq_class_idle(cfqq)) > @@ -522,12 +558,13 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) > return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); > } > > -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) > +static inline u64 cfq_scale_slice(unsigned long delta, > + struct io_sched_entity *se) > { > u64 d = delta << CFQ_SERVICE_SHIFT; > > d = d * BLKIO_WEIGHT_DEFAULT; > - do_div(d, cfqg->weight); > + do_div(d, se->weight); > return d; > } > > @@ -552,16 +589,16 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) > static void update_min_vdisktime(struct cfq_rb_root *st) > { > u64 vdisktime = st->min_vdisktime; > - struct cfq_group *cfqg; > + struct io_sched_entity *se; > > if (st->active) { > - cfqg = rb_entry_cfqg(st->active); > - vdisktime = cfqg->vdisktime; > + se = rb_entry_se(st->active); > + vdisktime = se->vdisktime; > } > > if (st->left) { > - cfqg = rb_entry_cfqg(st->left); > - vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); > + se = rb_entry_se(st->left); > + vdisktime = min_vdisktime(vdisktime, se->vdisktime); > } > > st->min_vdisktime = 
max_vdisktime(st->min_vdisktime, vdisktime); > @@ -591,9 +628,10 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, > static inline unsigned > cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) > { > - struct cfq_rb_root *st = &cfqd->grp_service_tree; > + struct io_sched_entity *gse = &cfqg->queue_se; > + struct cfq_rb_root *st = gse->st; > > - return cfq_target_latency * cfqg->weight / st->total_weight; > + return cfq_target_latency * gse->weight / st->total_weight; > } > > static inline void > @@ -756,13 +794,13 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) > return NULL; > } > > -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) > +static struct io_sched_entity *cfq_rb_first_se(struct cfq_rb_root *root) > { > if (!root->left) > root->left = rb_first(&root->rb); > > if (root->left) > - return rb_entry_cfqg(root->left); > + return rb_entry_se(root->left); > > return NULL; > } > @@ -819,25 +857,25 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, > } > > static inline s64 > -cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) > +se_key(struct cfq_rb_root *st, struct io_sched_entity *se) > { > - return cfqg->vdisktime - st->min_vdisktime; > + return se->vdisktime - st->min_vdisktime; > } > > static void > -__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) > +__io_sched_entity_add(struct cfq_rb_root *st, struct io_sched_entity *se) > { > struct rb_node **node = &st->rb.rb_node; > struct rb_node *parent = NULL; > - struct cfq_group *__cfqg; > - s64 key = cfqg_key(st, cfqg); > + struct io_sched_entity *__se; > + s64 key = se_key(st, se); > int left = 1; > > while (*node != NULL) { > parent = *node; > - __cfqg = rb_entry_cfqg(parent); > + __se = rb_entry_se(parent); > > - if (key < cfqg_key(st, __cfqg)) > + if (key < se_key(st, __se)) > node = &parent->rb_left; > else { > node = &parent->rb_right; > @@ -846,47 +884,80 @@ 
__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) > } > > if (left) > - st->left = &cfqg->rb_node; > + st->left = &se->rb_node; > > - rb_link_node(&cfqg->rb_node, parent, node); > - rb_insert_color(&cfqg->rb_node, &st->rb); > + rb_link_node(&se->rb_node, parent, node); > + rb_insert_color(&se->rb_node, &st->rb); > } > > -static void > -cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) > +static void io_sched_entity_add(struct cfq_rb_root *st, > + struct io_sched_entity *se) > { > - struct cfq_rb_root *st = &cfqd->grp_service_tree; > - struct cfq_group *__cfqg; > struct rb_node *n; > + struct io_sched_entity *__se; > > - cfqg->nr_cfqq++; > - if (cfqg->on_st) > - return; > > + if (se->on_st) > + return; > /* > * Currently put the group at the end. Later implement something > * so that groups get lesser vtime based on their weights, so that > * if group does not loose all if it was not continously backlogged. > */ > - n = rb_last(&st->rb); > + n = rb_last(&se->st->rb); > if (n) { > - __cfqg = rb_entry_cfqg(n); > - cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; > + __se = rb_entry_se(n); > + se->vdisktime = __se->vdisktime + CFQ_IDLE_DELAY; > } else > - cfqg->vdisktime = st->min_vdisktime; > + se->vdisktime = st->min_vdisktime; > > - __cfq_group_service_tree_add(st, cfqg); > - cfqg->on_st = true; > - st->total_weight += cfqg->weight; > + __io_sched_entity_add(se->st, se); > + se->on_st = true; > + st->total_weight += se->weight; > +} > + > +static void > +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) > +{ > + struct io_sched_entity *gse = &cfqg->group_se; > + struct io_sched_entity *qse = &cfqg->queue_se; > + struct cfq_group *__cfqg; > + > + cfqg->nr_cfqq++; > + > + io_sched_entity_add(qse->st, qse); > + > + while (gse && gse->parent) { > + if (gse->on_st) > + return; > + io_sched_entity_add(gse->st, gse); > + gse = gse->parent; > + __cfqg = cfqg_of_gse(gse); > + __cfqg->nr_subgp++; 
> + } > +} > + > +static void io_sched_entity_del(struct io_sched_entity *se) > +{ > + if (!RB_EMPTY_NODE(&se->rb_node)) > + cfq_rb_erase(&se->rb_node, se->st); > + > + se->on_st = false; > + se->st->total_weight -= se->weight; > } > > static void > cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) > { > - struct cfq_rb_root *st = &cfqd->grp_service_tree; > + struct io_sched_entity *gse = &cfqg->group_se; > + struct io_sched_entity *qse = &cfqg->queue_se; > + struct cfq_group *__cfqg, *p_cfqg; > + > + if (gse->st && gse->st->active == &gse->rb_node) > + gse->st->active = NULL; > > - if (st->active == &cfqg->rb_node) > - st->active = NULL; > + if (qse->st && qse->st->active == &qse->rb_node) > + qse->st->active = NULL; > > BUG_ON(cfqg->nr_cfqq < 1); > cfqg->nr_cfqq--; > @@ -895,13 +966,25 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) > if (cfqg->nr_cfqq) > return; > > - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); > - cfqg->on_st = false; > - st->total_weight -= cfqg->weight; > - if (!RB_EMPTY_NODE(&cfqg->rb_node)) > - cfq_rb_erase(&cfqg->rb_node, st); > - cfqg->saved_workload_slice = 0; > - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); > + /* dequeue queue se from group */ > + io_sched_entity_del(qse); > + > + if (cfqg->nr_subgp) > + return; > + > + /* prevent from dequeuing root group */ > + while (gse && gse->parent) { > + __cfqg = cfqg_of_gse(gse); > + p_cfqg = cfqg_of_gse(gse->parent); > + io_sched_entity_del(gse); > + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1); > + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group"); > + __cfqg->saved_workload_slice = 0; > + gse = gse->parent; > + p_cfqg->nr_subgp--; > + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp) > + return; > + } > } > > static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) > @@ -933,7 +1016,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) > static void cfq_group_served(struct cfq_data *cfqd, struct 
cfq_group *cfqg, > struct cfq_queue *cfqq) > { > - struct cfq_rb_root *st = &cfqd->grp_service_tree; > + struct io_sched_entity *gse = &cfqg->group_se; > + struct io_sched_entity *qse = &cfqg->queue_se; > unsigned int used_sl, charge; > int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) > - cfqg->service_tree_idle.count; > @@ -946,10 +1030,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, > else if (!cfq_cfqq_sync(cfqq) && !nr_sync) > charge = cfqq->allocated_slice; > > - /* Can't update vdisktime while group is on service tree */ > - cfq_rb_erase(&cfqg->rb_node, st); > - cfqg->vdisktime += cfq_scale_slice(charge, cfqg); > - __cfq_group_service_tree_add(st, cfqg); > + /* > + * update queue se's vdisktime. > + * Can't update vdisktime while group is on service tree. > + */ > + > + cfq_rb_erase(&qse->rb_node, qse->st); > + qse->vdisktime += cfq_scale_slice(charge, qse); > + __io_sched_entity_add(qse->st, qse); > + if (&qse->rb_node == qse->st->active) > + qse->st->active = NULL; > + > + while (gse && gse->parent) { > + cfq_rb_erase(&gse->rb_node, gse->st); > + gse->vdisktime += cfq_scale_slice(charge, gse); > + __io_sched_entity_add(gse->st, gse); > + if (&gse->rb_node == gse->st->active) > + gse->st->active = NULL; > + gse = gse->parent; > + } > > /* This group is being expired. 
Save the context */ > if (time_after(cfqd->workload_expires, jiffies)) { > @@ -960,8 +1059,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, > } else > cfqg->saved_workload_slice = 0; > > - cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, > - st->min_vdisktime); > + cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", gse->vdisktime, > + gse->st->min_vdisktime); > + > cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" > " sect=%u", used_sl, cfqq->slice_dispatch, charge, > iops_mode(cfqd), cfqq->nr_sectors); > @@ -977,39 +1077,84 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) > return NULL; > } > > +static void cfq_put_cfqg(struct cfq_group *cfqg) > +{ > + struct cfq_rb_root *st; > + int i, j; > + struct io_sched_entity *gse; > + struct cfq_group *p_cfqg; > + > + BUG_ON(atomic_read(&cfqg->ref) <= 0); > + if (!atomic_dec_and_test(&cfqg->ref)) > + return; > + for_each_cfqg_st(cfqg, i, j, st) > + BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); > + > + gse = &cfqg->group_se; > + if (gse->parent) { > + p_cfqg = cfqg_of_gse(gse->parent); > + /* Drop the reference taken by children */ > + atomic_dec(&p_cfqg->ref); > + } > + > + kfree(cfqg); > +} > + > +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) > +{ > + /* Something wrong if we are trying to remove same group twice */ > + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); > + > + hlist_del_init(&cfqg->cfqd_node); > + > + /* > + * Put the reference taken at the time of creation so that when all > + * queues are gone, group can be destroyed. 
> + */ > + cfq_put_cfqg(cfqg); > +} > + > void > cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) > { > - cfqg_of_blkg(blkg)->weight = weight; > + struct cfq_group *cfqg; > + struct io_sched_entity *gse; > + > + cfqg = cfqg_of_blkg(blkg); > + gse = &cfqg->group_se; > + gse->weight = weight; > } > > -static struct cfq_group * > -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) > +static void init_group_queue_entity(struct blkio_cgroup *blkcg, > + struct cfq_group *cfqg) > +{ > + struct io_sched_entity *gse = &cfqg->group_se; > + struct io_sched_entity *qse = &cfqg->queue_se; > + > + gse->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); > + RB_CLEAR_NODE(&gse->rb_node); > + gse->is_group_se = true; > + gse->on_st = false; > + > + /* Set to 500 for the time being */ > + qse->weight = 500; > + RB_CLEAR_NODE(&qse->rb_node); > + qse->is_group_se = false; > + qse->on_st = false; > +} > + > +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg, > + struct cfq_group *cfqg) > { > - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); > - struct cfq_group *cfqg = NULL; > - void *key = cfqd; > int i, j; > struct cfq_rb_root *st; > - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; > unsigned int major, minor; > + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; > > - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); > - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { > - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); > - cfqg->blkg.dev = MKDEV(major, minor); > - goto done; > - } > - if (cfqg || !create) > - goto done; > - > - cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); > - if (!cfqg) > - goto done; > + cfqg->grp_service_tree = CFQ_RB_ROOT; > > for_each_cfqg_st(cfqg, i, j, st) > *st = CFQ_RB_ROOT; > - RB_CLEAR_NODE(&cfqg->rb_node); > > /* > * Take the initial reference that will be released on destroy > @@ -1023,11 
+1168,102 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) > sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); > cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, > MKDEV(major, minor)); > - cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); > - > + init_group_queue_entity(blkcg, cfqg); > /* Add group on cfqd list */ > hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); > +} > + > +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) > +{ > + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) > + cfq_destroy_cfqg(cfqd, cfqg); > +} > + > +static void cfqg_set_parent(struct cfq_group *cfqg, struct cfq_group *p_cfqg) > +{ > + struct io_sched_entity *gse = &cfqg->group_se; > + struct io_sched_entity *qse = &cfqg->queue_se; > + struct io_sched_entity *p_gse = &p_cfqg->group_se; > + > + gse->parent = p_gse; > + gse->st = &p_cfqg->grp_service_tree; > + > + qse->parent = gse; > + qse->st = &cfqg->grp_service_tree; > + > + /* child reference */ > + atomic_inc(&p_cfqg->ref); > +} > + > +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup) > +{ > + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); > + struct blkio_cgroup *p_blkcg; > + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; > + unsigned int major, minor; > + struct cfq_group *cfqg, *p_cfqg; > + void *key = cfqd; > + int ret; > + > + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); > + if (cfqg) { > + if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { > + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); > + cfqg->blkg.dev = MKDEV(major, minor); > + } > + /* chain has already been built */ > + return 0; > + } > + > + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); > + if (!cfqg) > + return -1; > + > + init_cfqg(cfqd, blkcg, cfqg); > + > + /* Already to the top group */ > + if (!cgroup->parent) > + return 0; > + > + ret = cfqg_chain_alloc(cfqd, cgroup->parent); > + if (ret == -1) { 
> + uninit_cfqg(cfqd, cfqg); > + return -1; > + } > + > + p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); > + p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key)); > + BUG_ON(p_cfqg == NULL); > + > + cfqg_set_parent(cfqg, p_cfqg); > + return 0; > +} > + > +static struct cfq_group * > +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) > +{ > + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); > + struct cfq_group *cfqg = NULL; > + void *key = cfqd; > + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; > + unsigned int major, minor; > + int ret; > > + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); > + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { > + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); > + cfqg->blkg.dev = MKDEV(major, minor); > + goto done; > + } > + if (cfqg || !create) > + goto done; > + > + ret = cfqg_chain_alloc(cfqd, cgroup); > + if (!ret) { > + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); > + BUG_ON(cfqg == NULL); > + goto done; > + } > done: > return cfqg; > } > @@ -1067,33 +1303,6 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) > atomic_inc(&cfqq->cfqg->ref); > } > > -static void cfq_put_cfqg(struct cfq_group *cfqg) > -{ > - struct cfq_rb_root *st; > - int i, j; > - > - BUG_ON(atomic_read(&cfqg->ref) <= 0); > - if (!atomic_dec_and_test(&cfqg->ref)) > - return; > - for_each_cfqg_st(cfqg, i, j, st) > - BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); > - kfree(cfqg); > -} > - > -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) > -{ > - /* Something wrong if we are trying to remove same group twice */ > - BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); > - > - hlist_del_init(&cfqg->cfqd_node); > - > - /* > - * Put the reference taken at the time of creation so that when all > - * queues are gone, group can be destroyed. 
> - */ > - cfq_put_cfqg(cfqg); > -} > - > static void cfq_release_cfq_groups(struct cfq_data *cfqd) > { > struct hlist_node *pos, *n; > @@ -1668,9 +1877,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, > if (cfqq == cfqd->active_queue) > cfqd->active_queue = NULL; > > - if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) > - cfqd->grp_service_tree.active = NULL; > - > if (cfqd->active_cic) { > put_io_context(cfqd->active_cic->ioc); > cfqd->active_cic = NULL; > @@ -2173,17 +2379,26 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) > cfqd->noidle_tree_requires_idle = false; > } > > + > static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) > { > - struct cfq_rb_root *st = &cfqd->grp_service_tree; > + struct cfq_group *root_group = &cfqd->root_group; > + struct cfq_rb_root *st = &root_group->grp_service_tree; > struct cfq_group *cfqg; > + struct io_sched_entity *se; > > - if (RB_EMPTY_ROOT(&st->rb)) > - return NULL; > - cfqg = cfq_rb_first_group(st); > - st->active = &cfqg->rb_node; > - update_min_vdisktime(st); > - return cfqg; > + do { > + se = cfq_rb_first_se(st); > + if (!se) > + return NULL; > + st->active = &se->rb_node; > + update_min_vdisktime(st); > + cfqg = cfqg_of_qse(se); > + if (cfqg) > + return cfqg; > + cfqg = cfqg_of_gse(se); > + st = &cfqg->grp_service_tree; > + } while (1); > } > > static void cfq_choose_cfqg(struct cfq_data *cfqd) > @@ -2215,15 +2430,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) > if (!cfqq) > goto new_queue; > > + > if (!cfqd->rq_queued) > return NULL; > > + > /* > * We were waiting for group to get backlogged. Expire the queue > */ > if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) > goto expire; > > + > /* > * The active queue has run out of time, expire it and select new. 
> */ > @@ -2245,6 +2463,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) > goto check_group_idle; > } > > + > /* > * The active queue has requests and isn't expired, allow it to > * dispatch. > @@ -2252,6 +2471,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) > if (!RB_EMPTY_ROOT(&cfqq->sort_list)) > goto keep_queue; > > + > /* > * If another queue has a request waiting within our mean seek > * distance, let it run. The expire code will check for close > @@ -2505,6 +2725,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) > } > > cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); > + > return 1; > } > > @@ -3854,17 +4075,25 @@ static void *cfq_init_queue(struct request_queue *q) > > cfqd->cic_index = i; > > - /* Init root service tree */ > - cfqd->grp_service_tree = CFQ_RB_ROOT; > - > /* Init root group */ > cfqg = &cfqd->root_group; > + cfqg->grp_service_tree = CFQ_RB_ROOT; > for_each_cfqg_st(cfqg, i, j, st) > *st = CFQ_RB_ROOT; > - RB_CLEAR_NODE(&cfqg->rb_node); > - > + cfqg->group_se.is_group_se = true; > + RB_CLEAR_NODE(&cfqg->group_se.rb_node); > + cfqg->group_se.on_st = false; > /* Give preference to root group over other groups */ > - cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; > + cfqg->group_se.weight = 2*BLKIO_WEIGHT_DEFAULT; > + cfqg->group_se.parent = NULL; > + cfqg->group_se.st = NULL; > + > + cfqg->queue_se.is_group_se = false; > + RB_CLEAR_NODE(&cfqg->queue_se.rb_node); > + cfqg->queue_se.on_st = false; > + cfqg->queue_se.weight = 2*BLKIO_WEIGHT_DEFAULT; > + cfqg->queue_se.parent = &cfqg->group_se; > + cfqg->queue_se.st = &cfqg->grp_service_tree; > > #ifdef CONFIG_CFQ_GROUP_IOSCHED > /* > -- > 1.6.5.2 > > ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-08-30 20:36 ` Vivek Goyal @ 2010-08-31 0:29 ` Gui Jianfeng 2010-08-31 12:57 ` Vivek Goyal 0 siblings, 1 reply; 19+ messages in thread From: Gui Jianfeng @ 2010-08-31 0:29 UTC (permalink / raw) To: Vivek Goyal Cc: Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, Nauman Rafique, linux kernel mailing list Vivek Goyal wrote: > On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: >> Hi All, >> >> This patch enables cfq group hierarchical scheduling. >> >> With this patch, you can create a cgroup directory deeper than level 1. >> Now, I/O Bandwidth is distributed in a hierarchy way. For example: >> We create cgroup directories as following(the number represents weight): >> >> Root grp >> / \ >> grp_1(100) grp_2(400) >> / \ >> grp_3(200) grp_4(300) >> >> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, >> grp_2 will share 80% of total bandwidth. >> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) >> >> Design: >> o Each cfq group has its own group service tree. >> o Each cfq group contains a "group schedule entity" (gse) that >> schedules on parent cfq group's service tree. >> o Each cfq group contains a "queue schedule entity"(qse), it >> represents all cfqqs located on this cfq group. It schedules >> on this group's service tree. For the time being, root group >> qse's weight is 1000, and subgroup qse's weight is 500. >> o All gses and qse which belones to a same cfq group schedules >> on the same group service tree. > > Hi Gui, > > Thanks for the patch. I have few questions. > > - So how does the hierarchy look like, w.r.t root group. Something as > follows? > > > root > / | \ > q1 q2 G1 > > Assume there are two processes doin IO in root group and q1 and q2 are > cfqq queues for those processes and G1 is the cgroup created by user. > > If yes, then what algorithm do you use to do scheduling between q1, q2 > and G1? 
IOW, currently we have two algorithms operating in CFQ. One for > cfqq and other for groups. Group algorithm does not use the logic of > cfq_slice_offset(). Hi Vivek, This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq. If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and it will schedule on root group service with G1, as following: root group / \ qse(q1,q2) gse(G1) Thanks, Gui > > The reason being they are different because CFQ wanted to provide ioprio > service differentiation on SSD where slice idle is effectively zero. But > that slice_offset() logic is approximate at best and is not suitable > for group scheduling. > > I have yet to go through the patches in detail, so thought will ask you > that how have you handeled above. > > Vivek > >> o cfq group allocates in a recursive manner, that means when a cfq >> group needs to be allocated, the upper level cfq groups are also >> allocated. >> o When a cfq group served, not only charge this cfq group but also >> charge its ancestors. 
>> >> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com> >> --- >> block/blk-cgroup.c | 4 - >> block/cfq-iosched.c | 477 +++++++++++++++++++++++++++++++++++++------------- >> 2 files changed, 353 insertions(+), 128 deletions(-) >> >> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c >> index 2fef1ef..aa06cb3 100644 >> --- a/block/blk-cgroup.c >> +++ b/block/blk-cgroup.c >> @@ -964,10 +964,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) >> goto done; >> } >> >> - /* Currently we do not support hierarchy deeper than two level (0,1) */ >> - if (parent != cgroup->top_cgroup) >> - return ERR_PTR(-EPERM); >> - >> blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); >> if (!blkcg) >> return ERR_PTR(-ENOMEM); >> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c >> index f65c6f0..4660d89 100644 >> --- a/block/cfq-iosched.c >> +++ b/block/cfq-iosched.c >> @@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida); >> #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) >> >> #define sample_valid(samples) ((samples) > 80) >> -#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) >> +#define rb_entry_se(node) \ >> + rb_entry((node), struct io_sched_entity, rb_node) >> >> /* >> * Most of our rbtree usage is for sorting with min extraction, so >> @@ -171,19 +172,40 @@ enum wl_type_t { >> SYNC_WORKLOAD = 2 >> }; >> >> -/* This is per cgroup per device grouping structure */ >> -struct cfq_group { >> +/* >> + * This's the schedule entity which is scheduled on group service tree. >> + * It represents shedule structure of a cfq group, or a bundle of cfq >> + * queues that locate in a cfq group. 
>> + */ >> +struct io_sched_entity { >> + struct cfq_rb_root *st; >> + >> /* group service_tree member */ >> struct rb_node rb_node; >> - >> - /* group service_tree key */ >> u64 vdisktime; >> unsigned int weight; >> bool on_st; >> + bool is_group_se; >> + struct io_sched_entity *parent; >> +}; >> + >> +/* This is per cgroup per device grouping structure */ >> +struct cfq_group { >> + /* cfq group sched entity */ >> + struct io_sched_entity group_se; >> + >> + /* sched entity for cfqqs */ >> + struct io_sched_entity queue_se; >> + >> + /* Service tree for cfq_groups and cfqqs set*/ >> + struct cfq_rb_root grp_service_tree; >> >> /* number of cfqq currently on this group */ >> int nr_cfqq; >> >> + /* number of sub cfq groups */ >> + int nr_subgp; >> + >> /* Per group busy queus average. Useful for workload slice calc. */ >> unsigned int busy_queues_avg[2]; >> /* >> @@ -210,8 +232,6 @@ struct cfq_group { >> */ >> struct cfq_data { >> struct request_queue *queue; >> - /* Root service tree for cfq_groups */ >> - struct cfq_rb_root grp_service_tree; >> struct cfq_group root_group; >> >> /* >> @@ -399,6 +419,22 @@ static inline bool iops_mode(struct cfq_data *cfqd) >> return false; >> } >> >> +static inline struct cfq_group *cfqg_of_gse(struct io_sched_entity *se) >> +{ >> + if (se->is_group_se) >> + return container_of(se, struct cfq_group, group_se); >> + else >> + return NULL; >> +} >> + >> +static inline struct cfq_group *cfqg_of_qse(struct io_sched_entity *se) >> +{ >> + if (!se->is_group_se) >> + return container_of(se, struct cfq_group, queue_se); >> + else >> + return NULL; >> +} >> + >> static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) >> { >> if (cfq_class_idle(cfqq)) >> @@ -522,12 +558,13 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) >> return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); >> } >> >> -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) >> +static inline u64 
cfq_scale_slice(unsigned long delta, >> + struct io_sched_entity *se) >> { >> u64 d = delta << CFQ_SERVICE_SHIFT; >> >> d = d * BLKIO_WEIGHT_DEFAULT; >> - do_div(d, cfqg->weight); >> + do_div(d, se->weight); >> return d; >> } >> >> @@ -552,16 +589,16 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) >> static void update_min_vdisktime(struct cfq_rb_root *st) >> { >> u64 vdisktime = st->min_vdisktime; >> - struct cfq_group *cfqg; >> + struct io_sched_entity *se; >> >> if (st->active) { >> - cfqg = rb_entry_cfqg(st->active); >> - vdisktime = cfqg->vdisktime; >> + se = rb_entry_se(st->active); >> + vdisktime = se->vdisktime; >> } >> >> if (st->left) { >> - cfqg = rb_entry_cfqg(st->left); >> - vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); >> + se = rb_entry_se(st->left); >> + vdisktime = min_vdisktime(vdisktime, se->vdisktime); >> } >> >> st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); >> @@ -591,9 +628,10 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, >> static inline unsigned >> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) >> { >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >> + struct io_sched_entity *gse = &cfqg->queue_se; >> + struct cfq_rb_root *st = gse->st; >> >> - return cfq_target_latency * cfqg->weight / st->total_weight; >> + return cfq_target_latency * gse->weight / st->total_weight; >> } >> >> static inline void >> @@ -756,13 +794,13 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) >> return NULL; >> } >> >> -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) >> +static struct io_sched_entity *cfq_rb_first_se(struct cfq_rb_root *root) >> { >> if (!root->left) >> root->left = rb_first(&root->rb); >> >> if (root->left) >> - return rb_entry_cfqg(root->left); >> + return rb_entry_se(root->left); >> >> return NULL; >> } >> @@ -819,25 +857,25 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, >> } >> >> static 
inline s64 >> -cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) >> +se_key(struct cfq_rb_root *st, struct io_sched_entity *se) >> { >> - return cfqg->vdisktime - st->min_vdisktime; >> + return se->vdisktime - st->min_vdisktime; >> } >> >> static void >> -__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) >> +__io_sched_entity_add(struct cfq_rb_root *st, struct io_sched_entity *se) >> { >> struct rb_node **node = &st->rb.rb_node; >> struct rb_node *parent = NULL; >> - struct cfq_group *__cfqg; >> - s64 key = cfqg_key(st, cfqg); >> + struct io_sched_entity *__se; >> + s64 key = se_key(st, se); >> int left = 1; >> >> while (*node != NULL) { >> parent = *node; >> - __cfqg = rb_entry_cfqg(parent); >> + __se = rb_entry_se(parent); >> >> - if (key < cfqg_key(st, __cfqg)) >> + if (key < se_key(st, __se)) >> node = &parent->rb_left; >> else { >> node = &parent->rb_right; >> @@ -846,47 +884,80 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) >> } >> >> if (left) >> - st->left = &cfqg->rb_node; >> + st->left = &se->rb_node; >> >> - rb_link_node(&cfqg->rb_node, parent, node); >> - rb_insert_color(&cfqg->rb_node, &st->rb); >> + rb_link_node(&se->rb_node, parent, node); >> + rb_insert_color(&se->rb_node, &st->rb); >> } >> >> -static void >> -cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) >> +static void io_sched_entity_add(struct cfq_rb_root *st, >> + struct io_sched_entity *se) >> { >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >> - struct cfq_group *__cfqg; >> struct rb_node *n; >> + struct io_sched_entity *__se; >> >> - cfqg->nr_cfqq++; >> - if (cfqg->on_st) >> - return; >> >> + if (se->on_st) >> + return; >> /* >> * Currently put the group at the end. Later implement something >> * so that groups get lesser vtime based on their weights, so that >> * if group does not loose all if it was not continously backlogged. 
>> */ >> - n = rb_last(&st->rb); >> + n = rb_last(&se->st->rb); >> if (n) { >> - __cfqg = rb_entry_cfqg(n); >> - cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; >> + __se = rb_entry_se(n); >> + se->vdisktime = __se->vdisktime + CFQ_IDLE_DELAY; >> } else >> - cfqg->vdisktime = st->min_vdisktime; >> + se->vdisktime = st->min_vdisktime; >> >> - __cfq_group_service_tree_add(st, cfqg); >> - cfqg->on_st = true; >> - st->total_weight += cfqg->weight; >> + __io_sched_entity_add(se->st, se); >> + se->on_st = true; >> + st->total_weight += se->weight; >> +} >> + >> +static void >> +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) >> +{ >> + struct io_sched_entity *gse = &cfqg->group_se; >> + struct io_sched_entity *qse = &cfqg->queue_se; >> + struct cfq_group *__cfqg; >> + >> + cfqg->nr_cfqq++; >> + >> + io_sched_entity_add(qse->st, qse); >> + >> + while (gse && gse->parent) { >> + if (gse->on_st) >> + return; >> + io_sched_entity_add(gse->st, gse); >> + gse = gse->parent; >> + __cfqg = cfqg_of_gse(gse); >> + __cfqg->nr_subgp++; >> + } >> +} >> + >> +static void io_sched_entity_del(struct io_sched_entity *se) >> +{ >> + if (!RB_EMPTY_NODE(&se->rb_node)) >> + cfq_rb_erase(&se->rb_node, se->st); >> + >> + se->on_st = false; >> + se->st->total_weight -= se->weight; >> } >> >> static void >> cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) >> { >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >> + struct io_sched_entity *gse = &cfqg->group_se; >> + struct io_sched_entity *qse = &cfqg->queue_se; >> + struct cfq_group *__cfqg, *p_cfqg; >> + >> + if (gse->st && gse->st->active == &gse->rb_node) >> + gse->st->active = NULL; >> >> - if (st->active == &cfqg->rb_node) >> - st->active = NULL; >> + if (qse->st && qse->st->active == &qse->rb_node) >> + qse->st->active = NULL; >> >> BUG_ON(cfqg->nr_cfqq < 1); >> cfqg->nr_cfqq--; >> @@ -895,13 +966,25 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) >> 
if (cfqg->nr_cfqq) >> return; >> >> - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); >> - cfqg->on_st = false; >> - st->total_weight -= cfqg->weight; >> - if (!RB_EMPTY_NODE(&cfqg->rb_node)) >> - cfq_rb_erase(&cfqg->rb_node, st); >> - cfqg->saved_workload_slice = 0; >> - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); >> + /* dequeue queue se from group */ >> + io_sched_entity_del(qse); >> + >> + if (cfqg->nr_subgp) >> + return; >> + >> + /* prevent from dequeuing root group */ >> + while (gse && gse->parent) { >> + __cfqg = cfqg_of_gse(gse); >> + p_cfqg = cfqg_of_gse(gse->parent); >> + io_sched_entity_del(gse); >> + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1); >> + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group"); >> + __cfqg->saved_workload_slice = 0; >> + gse = gse->parent; >> + p_cfqg->nr_subgp--; >> + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp) >> + return; >> + } >> } >> >> static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) >> @@ -933,7 +1016,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) >> static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, >> struct cfq_queue *cfqq) >> { >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >> + struct io_sched_entity *gse = &cfqg->group_se; >> + struct io_sched_entity *qse = &cfqg->queue_se; >> unsigned int used_sl, charge; >> int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) >> - cfqg->service_tree_idle.count; >> @@ -946,10 +1030,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, >> else if (!cfq_cfqq_sync(cfqq) && !nr_sync) >> charge = cfqq->allocated_slice; >> >> - /* Can't update vdisktime while group is on service tree */ >> - cfq_rb_erase(&cfqg->rb_node, st); >> - cfqg->vdisktime += cfq_scale_slice(charge, cfqg); >> - __cfq_group_service_tree_add(st, cfqg); >> + /* >> + * update queue se's vdisktime. >> + * Can't update vdisktime while group is on service tree. 
>> + */ >> + >> + cfq_rb_erase(&qse->rb_node, qse->st); >> + qse->vdisktime += cfq_scale_slice(charge, qse); >> + __io_sched_entity_add(qse->st, qse); >> + if (&qse->rb_node == qse->st->active) >> + qse->st->active = NULL; >> + >> + while (gse && gse->parent) { >> + cfq_rb_erase(&gse->rb_node, gse->st); >> + gse->vdisktime += cfq_scale_slice(charge, gse); >> + __io_sched_entity_add(gse->st, gse); >> + if (&gse->rb_node == gse->st->active) >> + gse->st->active = NULL; >> + gse = gse->parent; >> + } >> >> /* This group is being expired. Save the context */ >> if (time_after(cfqd->workload_expires, jiffies)) { >> @@ -960,8 +1059,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, >> } else >> cfqg->saved_workload_slice = 0; >> >> - cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, >> - st->min_vdisktime); >> + cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", gse->vdisktime, >> + gse->st->min_vdisktime); >> + >> cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" >> " sect=%u", used_sl, cfqq->slice_dispatch, charge, >> iops_mode(cfqd), cfqq->nr_sectors); >> @@ -977,39 +1077,84 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) >> return NULL; >> } >> >> +static void cfq_put_cfqg(struct cfq_group *cfqg) >> +{ >> + struct cfq_rb_root *st; >> + int i, j; >> + struct io_sched_entity *gse; >> + struct cfq_group *p_cfqg; >> + >> + BUG_ON(atomic_read(&cfqg->ref) <= 0); >> + if (!atomic_dec_and_test(&cfqg->ref)) >> + return; >> + for_each_cfqg_st(cfqg, i, j, st) >> + BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); >> + >> + gse = &cfqg->group_se; >> + if (gse->parent) { >> + p_cfqg = cfqg_of_gse(gse->parent); >> + /* Drop the reference taken by children */ >> + atomic_dec(&p_cfqg->ref); >> + } >> + >> + kfree(cfqg); >> +} >> + >> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) >> +{ >> + /* Something wrong if we are trying to remove same group 
twice */ >> + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); >> + >> + hlist_del_init(&cfqg->cfqd_node); >> + >> + /* >> + * Put the reference taken at the time of creation so that when all >> + * queues are gone, group can be destroyed. >> + */ >> + cfq_put_cfqg(cfqg); >> +} >> + >> void >> cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) >> { >> - cfqg_of_blkg(blkg)->weight = weight; >> + struct cfq_group *cfqg; >> + struct io_sched_entity *gse; >> + >> + cfqg = cfqg_of_blkg(blkg); >> + gse = &cfqg->group_se; >> + gse->weight = weight; >> } >> >> -static struct cfq_group * >> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) >> +static void init_group_queue_entity(struct blkio_cgroup *blkcg, >> + struct cfq_group *cfqg) >> +{ >> + struct io_sched_entity *gse = &cfqg->group_se; >> + struct io_sched_entity *qse = &cfqg->queue_se; >> + >> + gse->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); >> + RB_CLEAR_NODE(&gse->rb_node); >> + gse->is_group_se = true; >> + gse->on_st = false; >> + >> + /* Set to 500 for the time being */ >> + qse->weight = 500; >> + RB_CLEAR_NODE(&qse->rb_node); >> + qse->is_group_se = false; >> + qse->on_st = false; >> +} >> + >> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg, >> + struct cfq_group *cfqg) >> { >> - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); >> - struct cfq_group *cfqg = NULL; >> - void *key = cfqd; >> int i, j; >> struct cfq_rb_root *st; >> - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; >> unsigned int major, minor; >> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; >> >> - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); >> - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { >> - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); >> - cfqg->blkg.dev = MKDEV(major, minor); >> - goto done; >> - } >> - if (cfqg || !create) >> - goto done; >> - >> - cfqg = 
kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); >> - if (!cfqg) >> - goto done; >> + cfqg->grp_service_tree = CFQ_RB_ROOT; >> >> for_each_cfqg_st(cfqg, i, j, st) >> *st = CFQ_RB_ROOT; >> - RB_CLEAR_NODE(&cfqg->rb_node); >> >> /* >> * Take the initial reference that will be released on destroy >> @@ -1023,11 +1168,102 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) >> sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); >> cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, >> MKDEV(major, minor)); >> - cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); >> - >> + init_group_queue_entity(blkcg, cfqg); >> /* Add group on cfqd list */ >> hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); >> +} >> + >> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) >> +{ >> + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) >> + cfq_destroy_cfqg(cfqd, cfqg); >> +} >> + >> +static void cfqg_set_parent(struct cfq_group *cfqg, struct cfq_group *p_cfqg) >> +{ >> + struct io_sched_entity *gse = &cfqg->group_se; >> + struct io_sched_entity *qse = &cfqg->queue_se; >> + struct io_sched_entity *p_gse = &p_cfqg->group_se; >> + >> + gse->parent = p_gse; >> + gse->st = &p_cfqg->grp_service_tree; >> + >> + qse->parent = gse; >> + qse->st = &cfqg->grp_service_tree; >> + >> + /* child reference */ >> + atomic_inc(&p_cfqg->ref); >> +} >> + >> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup) >> +{ >> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); >> + struct blkio_cgroup *p_blkcg; >> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; >> + unsigned int major, minor; >> + struct cfq_group *cfqg, *p_cfqg; >> + void *key = cfqd; >> + int ret; >> + >> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); >> + if (cfqg) { >> + if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { >> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); >> + cfqg->blkg.dev = MKDEV(major, 
minor); >> + } >> + /* chain has already been built */ >> + return 0; >> + } >> + >> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); >> + if (!cfqg) >> + return -1; >> + >> + init_cfqg(cfqd, blkcg, cfqg); >> + >> + /* Already to the top group */ >> + if (!cgroup->parent) >> + return 0; >> + >> + ret = cfqg_chain_alloc(cfqd, cgroup->parent); >> + if (ret == -1) { >> + uninit_cfqg(cfqd, cfqg); >> + return -1; >> + } >> + >> + p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); >> + p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key)); >> + BUG_ON(p_cfqg == NULL); >> + >> + cfqg_set_parent(cfqg, p_cfqg); >> + return 0; >> +} >> + >> +static struct cfq_group * >> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) >> +{ >> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); >> + struct cfq_group *cfqg = NULL; >> + void *key = cfqd; >> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; >> + unsigned int major, minor; >> + int ret; >> >> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); >> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { >> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); >> + cfqg->blkg.dev = MKDEV(major, minor); >> + goto done; >> + } >> + if (cfqg || !create) >> + goto done; >> + >> + ret = cfqg_chain_alloc(cfqd, cgroup); >> + if (!ret) { >> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); >> + BUG_ON(cfqg == NULL); >> + goto done; >> + } >> done: >> return cfqg; >> } >> @@ -1067,33 +1303,6 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) >> atomic_inc(&cfqq->cfqg->ref); >> } >> >> -static void cfq_put_cfqg(struct cfq_group *cfqg) >> -{ >> - struct cfq_rb_root *st; >> - int i, j; >> - >> - BUG_ON(atomic_read(&cfqg->ref) <= 0); >> - if (!atomic_dec_and_test(&cfqg->ref)) >> - return; >> - for_each_cfqg_st(cfqg, i, j, st) >> - BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); >> - kfree(cfqg); >> -} >> - 
>> -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) >> -{ >> - /* Something wrong if we are trying to remove same group twice */ >> - BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); >> - >> - hlist_del_init(&cfqg->cfqd_node); >> - >> - /* >> - * Put the reference taken at the time of creation so that when all >> - * queues are gone, group can be destroyed. >> - */ >> - cfq_put_cfqg(cfqg); >> -} >> - >> static void cfq_release_cfq_groups(struct cfq_data *cfqd) >> { >> struct hlist_node *pos, *n; >> @@ -1668,9 +1877,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, >> if (cfqq == cfqd->active_queue) >> cfqd->active_queue = NULL; >> >> - if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) >> - cfqd->grp_service_tree.active = NULL; >> - >> if (cfqd->active_cic) { >> put_io_context(cfqd->active_cic->ioc); >> cfqd->active_cic = NULL; >> @@ -2173,17 +2379,26 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) >> cfqd->noidle_tree_requires_idle = false; >> } >> >> + >> static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) >> { >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >> + struct cfq_group *root_group = &cfqd->root_group; >> + struct cfq_rb_root *st = &root_group->grp_service_tree; >> struct cfq_group *cfqg; >> + struct io_sched_entity *se; >> >> - if (RB_EMPTY_ROOT(&st->rb)) >> - return NULL; >> - cfqg = cfq_rb_first_group(st); >> - st->active = &cfqg->rb_node; >> - update_min_vdisktime(st); >> - return cfqg; >> + do { >> + se = cfq_rb_first_se(st); >> + if (!se) >> + return NULL; >> + st->active = &se->rb_node; >> + update_min_vdisktime(st); >> + cfqg = cfqg_of_qse(se); >> + if (cfqg) >> + return cfqg; >> + cfqg = cfqg_of_gse(se); >> + st = &cfqg->grp_service_tree; >> + } while (1); >> } >> >> static void cfq_choose_cfqg(struct cfq_data *cfqd) >> @@ -2215,15 +2430,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) >> if (!cfqq) >> goto new_queue; 
>> >> + >> if (!cfqd->rq_queued) >> return NULL; >> >> + >> /* >> * We were waiting for group to get backlogged. Expire the queue >> */ >> if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) >> goto expire; >> >> + >> /* >> * The active queue has run out of time, expire it and select new. >> */ >> @@ -2245,6 +2463,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) >> goto check_group_idle; >> } >> >> + >> /* >> * The active queue has requests and isn't expired, allow it to >> * dispatch. >> @@ -2252,6 +2471,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) >> if (!RB_EMPTY_ROOT(&cfqq->sort_list)) >> goto keep_queue; >> >> + >> /* >> * If another queue has a request waiting within our mean seek >> * distance, let it run. The expire code will check for close >> @@ -2505,6 +2725,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) >> } >> >> cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); >> + >> return 1; >> } >> >> @@ -3854,17 +4075,25 @@ static void *cfq_init_queue(struct request_queue *q) >> >> cfqd->cic_index = i; >> >> - /* Init root service tree */ >> - cfqd->grp_service_tree = CFQ_RB_ROOT; >> - >> /* Init root group */ >> cfqg = &cfqd->root_group; >> + cfqg->grp_service_tree = CFQ_RB_ROOT; >> for_each_cfqg_st(cfqg, i, j, st) >> *st = CFQ_RB_ROOT; >> - RB_CLEAR_NODE(&cfqg->rb_node); >> - >> + cfqg->group_se.is_group_se = true; >> + RB_CLEAR_NODE(&cfqg->group_se.rb_node); >> + cfqg->group_se.on_st = false; >> /* Give preference to root group over other groups */ >> - cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; >> + cfqg->group_se.weight = 2*BLKIO_WEIGHT_DEFAULT; >> + cfqg->group_se.parent = NULL; >> + cfqg->group_se.st = NULL; >> + >> + cfqg->queue_se.is_group_se = false; >> + RB_CLEAR_NODE(&cfqg->queue_se.rb_node); >> + cfqg->queue_se.on_st = false; >> + cfqg->queue_se.weight = 2*BLKIO_WEIGHT_DEFAULT; >> + cfqg->queue_se.parent = &cfqg->group_se; >> + cfqg->queue_se.st = 
&cfqg->grp_service_tree; >> >> #ifdef CONFIG_CFQ_GROUP_IOSCHED >> /* >> -- >> 1.6.5.2 >> >> > ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-08-31 0:29 ` Gui Jianfeng @ 2010-08-31 12:57 ` Vivek Goyal 2010-08-31 15:40 ` Nauman Rafique 2010-09-01 8:48 ` Gui Jianfeng 0 siblings, 2 replies; 19+ messages in thread From: Vivek Goyal @ 2010-08-31 12:57 UTC (permalink / raw) To: Gui Jianfeng Cc: Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, Nauman Rafique, linux kernel mailing list On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote: > Vivek Goyal wrote: > > On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: > >> Hi All, > >> > >> This patch enables cfq group hierarchical scheduling. > >> > >> With this patch, you can create a cgroup directory deeper than level 1. > >> Now, I/O Bandwidth is distributed in a hierarchy way. For example: > >> We create cgroup directories as following(the number represents weight): > >> > >> Root grp > >> / \ > >> grp_1(100) grp_2(400) > >> / \ > >> grp_3(200) grp_4(300) > >> > >> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, > >> grp_2 will share 80% of total bandwidth. > >> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) > >> > >> Design: > >> o Each cfq group has its own group service tree. > >> o Each cfq group contains a "group schedule entity" (gse) that > >> schedules on parent cfq group's service tree. > >> o Each cfq group contains a "queue schedule entity"(qse), it > >> represents all cfqqs located on this cfq group. It schedules > >> on this group's service tree. For the time being, root group > >> qse's weight is 1000, and subgroup qse's weight is 500. > >> o All gses and qse which belones to a same cfq group schedules > >> on the same group service tree. > > > > Hi Gui, > > > > Thanks for the patch. I have few questions. > > > > - So how does the hierarchy look like, w.r.t root group. Something as > > follows? 
> > > > > > root > > / | \ > > q1 q2 G1 > > > > Assume there are two processes doin IO in root group and q1 and q2 are > > cfqq queues for those processes and G1 is the cgroup created by user. > > > > If yes, then what algorithm do you use to do scheduling between q1, q2 > > and G1? IOW, currently we have two algorithms operating in CFQ. One for > > cfqq and other for groups. Group algorithm does not use the logic of > > cfq_slice_offset(). > > Hi Vivek, > > This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq. > If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and > it will schedule on root group service with G1, as following: > > root group > / \ > qse(q1,q2) gse(G1) > Ok. That's interesting. That raises another question that how hierarchy should look like. IOW, how queue and groups should be treated in hierarchy. CFS cpu scheduler treats queues and group at the same level. That is as follows. root / | \ q1 q2 G1 In the past I had raised this question and Jens and corrado liked treating queues and group at same level. Logically, q1, q2 and G1 are all children of root, so it makes sense to treat them at same level and not group q1 and q2 in to a single entity and group. One of the possible way forward could be this. - Treat queue and group at same level (like CFS) - Get rid of cfq_slice_offset() logic. That means without idling on, there will be no ioprio difference between cfq queues. I think anyway as of today that logic helps in so little situations that I would not mind getting rid of it. Just that Jens should agree to it. - With this new scheme, it will break the existing semantics of root group being at same level as child groups. To avoid that, we can probably implement two modes (flat and hierarchical), something similar to what memory cgroup controller has done. May be one tunable in root cgroup of blkio "use_hierarchy". 
By default everything will be in flat mode and if user wants hiearchical control, he needs to set user_hierarchy in root group. I think memory controller provides "use_hierarchy" tunable in each cgroup. I am not sure why do we need it in each cgroup and not just in root cgroup. Thanks Vivek > Thanks, > Gui > > > > > The reason being they are different because CFQ wanted to provide ioprio > > service differentiation on SSD where slice idle is effectively zero. But > > that slice_offset() logic is approximate at best and is not suitable > > for group scheduling. > > > > I have yet to go through the patches in detail, so thought will ask you > > that how have you handeled above. > > > > Vivek > > > >> o cfq group allocates in a recursive manner, that means when a cfq > >> group needs to be allocated, the upper level cfq groups are also > >> allocated. > >> o When a cfq group served, not only charge this cfq group but also > >> charge its ancestors. > >> block/cfq-iosched.c | 477 +++++++++++++++++++++++++++++++++++++------------- > >> 2 files changed, 353 insertions(+), 128 deletions(-) > >> > >> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c > >> index 2fef1ef..aa06cb3 100644 > >> --- a/block/blk-cgroup.c > >> +++ b/block/blk-cgroup.c > >> @@ -964,10 +964,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) > >> goto done; > >> } > >> > >> - /* Currently we do not support hierarchy deeper than two level (0,1) */ > >> - if (parent != cgroup->top_cgroup) > >> - return ERR_PTR(-EPERM); > >> - > >> blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); > >> if (!blkcg) > >> return ERR_PTR(-ENOMEM); > >> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c > >> index f65c6f0..4660d89 100644 > >> --- a/block/cfq-iosched.c > >> +++ b/block/cfq-iosched.c > >> @@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida); > >> #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) > >> > >> #define sample_valid(samples) ((samples) > 80) > >> -#define 
rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) > >> +#define rb_entry_se(node) \ > >> + rb_entry((node), struct io_sched_entity, rb_node) > >> > >> /* > >> * Most of our rbtree usage is for sorting with min extraction, so > >> @@ -171,19 +172,40 @@ enum wl_type_t { > >> SYNC_WORKLOAD = 2 > >> }; > >> > >> -/* This is per cgroup per device grouping structure */ > >> -struct cfq_group { > >> +/* > >> + * This's the schedule entity which is scheduled on group service tree. > >> + * It represents shedule structure of a cfq group, or a bundle of cfq > >> + * queues that locate in a cfq group. > >> + */ > >> +struct io_sched_entity { > >> + struct cfq_rb_root *st; > >> + > >> /* group service_tree member */ > >> struct rb_node rb_node; > >> - > >> - /* group service_tree key */ > >> u64 vdisktime; > >> unsigned int weight; > >> bool on_st; > >> + bool is_group_se; > >> + struct io_sched_entity *parent; > >> +}; > >> + > >> +/* This is per cgroup per device grouping structure */ > >> +struct cfq_group { > >> + /* cfq group sched entity */ > >> + struct io_sched_entity group_se; > >> + > >> + /* sched entity for cfqqs */ > >> + struct io_sched_entity queue_se; > >> + > >> + /* Service tree for cfq_groups and cfqqs set*/ > >> + struct cfq_rb_root grp_service_tree; > >> > >> /* number of cfqq currently on this group */ > >> int nr_cfqq; > >> > >> + /* number of sub cfq groups */ > >> + int nr_subgp; > >> + > >> /* Per group busy queus average. Useful for workload slice calc. 
*/ > >> unsigned int busy_queues_avg[2]; > >> /* > >> @@ -210,8 +232,6 @@ struct cfq_group { > >> */ > >> struct cfq_data { > >> struct request_queue *queue; > >> - /* Root service tree for cfq_groups */ > >> - struct cfq_rb_root grp_service_tree; > >> struct cfq_group root_group; > >> > >> /* > >> @@ -399,6 +419,22 @@ static inline bool iops_mode(struct cfq_data *cfqd) > >> return false; > >> } > >> > >> +static inline struct cfq_group *cfqg_of_gse(struct io_sched_entity *se) > >> +{ > >> + if (se->is_group_se) > >> + return container_of(se, struct cfq_group, group_se); > >> + else > >> + return NULL; > >> +} > >> + > >> +static inline struct cfq_group *cfqg_of_qse(struct io_sched_entity *se) > >> +{ > >> + if (!se->is_group_se) > >> + return container_of(se, struct cfq_group, queue_se); > >> + else > >> + return NULL; > >> +} > >> + > >> static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) > >> { > >> if (cfq_class_idle(cfqq)) > >> @@ -522,12 +558,13 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) > >> return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); > >> } > >> > >> -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) > >> +static inline u64 cfq_scale_slice(unsigned long delta, > >> + struct io_sched_entity *se) > >> { > >> u64 d = delta << CFQ_SERVICE_SHIFT; > >> > >> d = d * BLKIO_WEIGHT_DEFAULT; > >> - do_div(d, cfqg->weight); > >> + do_div(d, se->weight); > >> return d; > >> } > >> > >> @@ -552,16 +589,16 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) > >> static void update_min_vdisktime(struct cfq_rb_root *st) > >> { > >> u64 vdisktime = st->min_vdisktime; > >> - struct cfq_group *cfqg; > >> + struct io_sched_entity *se; > >> > >> if (st->active) { > >> - cfqg = rb_entry_cfqg(st->active); > >> - vdisktime = cfqg->vdisktime; > >> + se = rb_entry_se(st->active); > >> + vdisktime = se->vdisktime; > >> } > >> > >> if (st->left) { > >> - cfqg = rb_entry_cfqg(st->left); 
> >> - vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); > >> + se = rb_entry_se(st->left); > >> + vdisktime = min_vdisktime(vdisktime, se->vdisktime); > >> } > >> > >> st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); > >> @@ -591,9 +628,10 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, > >> static inline unsigned > >> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) > >> { > >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; > >> + struct io_sched_entity *gse = &cfqg->queue_se; > >> + struct cfq_rb_root *st = gse->st; > >> > >> - return cfq_target_latency * cfqg->weight / st->total_weight; > >> + return cfq_target_latency * gse->weight / st->total_weight; > >> } > >> > >> static inline void > >> @@ -756,13 +794,13 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) > >> return NULL; > >> } > >> > >> -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) > >> +static struct io_sched_entity *cfq_rb_first_se(struct cfq_rb_root *root) > >> { > >> if (!root->left) > >> root->left = rb_first(&root->rb); > >> > >> if (root->left) > >> - return rb_entry_cfqg(root->left); > >> + return rb_entry_se(root->left); > >> > >> return NULL; > >> } > >> @@ -819,25 +857,25 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, > >> } > >> > >> static inline s64 > >> -cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) > >> +se_key(struct cfq_rb_root *st, struct io_sched_entity *se) > >> { > >> - return cfqg->vdisktime - st->min_vdisktime; > >> + return se->vdisktime - st->min_vdisktime; > >> } > >> > >> static void > >> -__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) > >> +__io_sched_entity_add(struct cfq_rb_root *st, struct io_sched_entity *se) > >> { > >> struct rb_node **node = &st->rb.rb_node; > >> struct rb_node *parent = NULL; > >> - struct cfq_group *__cfqg; > >> - s64 key = cfqg_key(st, cfqg); > >> + struct io_sched_entity *__se; > >> + 
s64 key = se_key(st, se); > >> int left = 1; > >> > >> while (*node != NULL) { > >> parent = *node; > >> - __cfqg = rb_entry_cfqg(parent); > >> + __se = rb_entry_se(parent); > >> > >> - if (key < cfqg_key(st, __cfqg)) > >> + if (key < se_key(st, __se)) > >> node = &parent->rb_left; > >> else { > >> node = &parent->rb_right; > >> @@ -846,47 +884,80 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) > >> } > >> > >> if (left) > >> - st->left = &cfqg->rb_node; > >> + st->left = &se->rb_node; > >> > >> - rb_link_node(&cfqg->rb_node, parent, node); > >> - rb_insert_color(&cfqg->rb_node, &st->rb); > >> + rb_link_node(&se->rb_node, parent, node); > >> + rb_insert_color(&se->rb_node, &st->rb); > >> } > >> > >> -static void > >> -cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) > >> +static void io_sched_entity_add(struct cfq_rb_root *st, > >> + struct io_sched_entity *se) > >> { > >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; > >> - struct cfq_group *__cfqg; > >> struct rb_node *n; > >> + struct io_sched_entity *__se; > >> > >> - cfqg->nr_cfqq++; > >> - if (cfqg->on_st) > >> - return; > >> > >> + if (se->on_st) > >> + return; > >> /* > >> * Currently put the group at the end. Later implement something > >> * so that groups get lesser vtime based on their weights, so that > >> * if group does not loose all if it was not continously backlogged. 
> >> */ > >> - n = rb_last(&st->rb); > >> + n = rb_last(&se->st->rb); > >> if (n) { > >> - __cfqg = rb_entry_cfqg(n); > >> - cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; > >> + __se = rb_entry_se(n); > >> + se->vdisktime = __se->vdisktime + CFQ_IDLE_DELAY; > >> } else > >> - cfqg->vdisktime = st->min_vdisktime; > >> + se->vdisktime = st->min_vdisktime; > >> > >> - __cfq_group_service_tree_add(st, cfqg); > >> - cfqg->on_st = true; > >> - st->total_weight += cfqg->weight; > >> + __io_sched_entity_add(se->st, se); > >> + se->on_st = true; > >> + st->total_weight += se->weight; > >> +} > >> + > >> +static void > >> +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) > >> +{ > >> + struct io_sched_entity *gse = &cfqg->group_se; > >> + struct io_sched_entity *qse = &cfqg->queue_se; > >> + struct cfq_group *__cfqg; > >> + > >> + cfqg->nr_cfqq++; > >> + > >> + io_sched_entity_add(qse->st, qse); > >> + > >> + while (gse && gse->parent) { > >> + if (gse->on_st) > >> + return; > >> + io_sched_entity_add(gse->st, gse); > >> + gse = gse->parent; > >> + __cfqg = cfqg_of_gse(gse); > >> + __cfqg->nr_subgp++; > >> + } > >> +} > >> + > >> +static void io_sched_entity_del(struct io_sched_entity *se) > >> +{ > >> + if (!RB_EMPTY_NODE(&se->rb_node)) > >> + cfq_rb_erase(&se->rb_node, se->st); > >> + > >> + se->on_st = false; > >> + se->st->total_weight -= se->weight; > >> } > >> > >> static void > >> cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) > >> { > >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; > >> + struct io_sched_entity *gse = &cfqg->group_se; > >> + struct io_sched_entity *qse = &cfqg->queue_se; > >> + struct cfq_group *__cfqg, *p_cfqg; > >> + > >> + if (gse->st && gse->st->active == &gse->rb_node) > >> + gse->st->active = NULL; > >> > >> - if (st->active == &cfqg->rb_node) > >> - st->active = NULL; > >> + if (qse->st && qse->st->active == &qse->rb_node) > >> + qse->st->active = NULL; > >> > >> 
BUG_ON(cfqg->nr_cfqq < 1); > >> cfqg->nr_cfqq--; > >> @@ -895,13 +966,25 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) > >> if (cfqg->nr_cfqq) > >> return; > >> > >> - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); > >> - cfqg->on_st = false; > >> - st->total_weight -= cfqg->weight; > >> - if (!RB_EMPTY_NODE(&cfqg->rb_node)) > >> - cfq_rb_erase(&cfqg->rb_node, st); > >> - cfqg->saved_workload_slice = 0; > >> - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); > >> + /* dequeue queue se from group */ > >> + io_sched_entity_del(qse); > >> + > >> + if (cfqg->nr_subgp) > >> + return; > >> + > >> + /* prevent from dequeuing root group */ > >> + while (gse && gse->parent) { > >> + __cfqg = cfqg_of_gse(gse); > >> + p_cfqg = cfqg_of_gse(gse->parent); > >> + io_sched_entity_del(gse); > >> + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1); > >> + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group"); > >> + __cfqg->saved_workload_slice = 0; > >> + gse = gse->parent; > >> + p_cfqg->nr_subgp--; > >> + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp) > >> + return; > >> + } > >> } > >> > >> static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) > >> @@ -933,7 +1016,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) > >> static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, > >> struct cfq_queue *cfqq) > >> { > >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; > >> + struct io_sched_entity *gse = &cfqg->group_se; > >> + struct io_sched_entity *qse = &cfqg->queue_se; > >> unsigned int used_sl, charge; > >> int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) > >> - cfqg->service_tree_idle.count; > >> @@ -946,10 +1030,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, > >> else if (!cfq_cfqq_sync(cfqq) && !nr_sync) > >> charge = cfqq->allocated_slice; > >> > >> - /* Can't update vdisktime while group is on service tree */ > >> - 
cfq_rb_erase(&cfqg->rb_node, st); > >> - cfqg->vdisktime += cfq_scale_slice(charge, cfqg); > >> - __cfq_group_service_tree_add(st, cfqg); > >> + /* > >> + * update queue se's vdisktime. > >> + * Can't update vdisktime while group is on service tree. > >> + */ > >> + > >> + cfq_rb_erase(&qse->rb_node, qse->st); > >> + qse->vdisktime += cfq_scale_slice(charge, qse); > >> + __io_sched_entity_add(qse->st, qse); > >> + if (&qse->rb_node == qse->st->active) > >> + qse->st->active = NULL; > >> + > >> + while (gse && gse->parent) { > >> + cfq_rb_erase(&gse->rb_node, gse->st); > >> + gse->vdisktime += cfq_scale_slice(charge, gse); > >> + __io_sched_entity_add(gse->st, gse); > >> + if (&gse->rb_node == gse->st->active) > >> + gse->st->active = NULL; > >> + gse = gse->parent; > >> + } > >> > >> /* This group is being expired. Save the context */ > >> if (time_after(cfqd->workload_expires, jiffies)) { > >> @@ -960,8 +1059,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, > >> } else > >> cfqg->saved_workload_slice = 0; > >> > >> - cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, > >> - st->min_vdisktime); > >> + cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", gse->vdisktime, > >> + gse->st->min_vdisktime); > >> + > >> cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" > >> " sect=%u", used_sl, cfqq->slice_dispatch, charge, > >> iops_mode(cfqd), cfqq->nr_sectors); > >> @@ -977,39 +1077,84 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) > >> return NULL; > >> } > >> > >> +static void cfq_put_cfqg(struct cfq_group *cfqg) > >> +{ > >> + struct cfq_rb_root *st; > >> + int i, j; > >> + struct io_sched_entity *gse; > >> + struct cfq_group *p_cfqg; > >> + > >> + BUG_ON(atomic_read(&cfqg->ref) <= 0); > >> + if (!atomic_dec_and_test(&cfqg->ref)) > >> + return; > >> + for_each_cfqg_st(cfqg, i, j, st) > >> + BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); > >> + > >> + gse = 
&cfqg->group_se; > >> + if (gse->parent) { > >> + p_cfqg = cfqg_of_gse(gse->parent); > >> + /* Drop the reference taken by children */ > >> + atomic_dec(&p_cfqg->ref); > >> + } > >> + > >> + kfree(cfqg); > >> +} > >> + > >> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) > >> +{ > >> + /* Something wrong if we are trying to remove same group twice */ > >> + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); > >> + > >> + hlist_del_init(&cfqg->cfqd_node); > >> + > >> + /* > >> + * Put the reference taken at the time of creation so that when all > >> + * queues are gone, group can be destroyed. > >> + */ > >> + cfq_put_cfqg(cfqg); > >> +} > >> + > >> void > >> cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) > >> { > >> - cfqg_of_blkg(blkg)->weight = weight; > >> + struct cfq_group *cfqg; > >> + struct io_sched_entity *gse; > >> + > >> + cfqg = cfqg_of_blkg(blkg); > >> + gse = &cfqg->group_se; > >> + gse->weight = weight; > >> } > >> > >> -static struct cfq_group * > >> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) > >> +static void init_group_queue_entity(struct blkio_cgroup *blkcg, > >> + struct cfq_group *cfqg) > >> +{ > >> + struct io_sched_entity *gse = &cfqg->group_se; > >> + struct io_sched_entity *qse = &cfqg->queue_se; > >> + > >> + gse->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); > >> + RB_CLEAR_NODE(&gse->rb_node); > >> + gse->is_group_se = true; > >> + gse->on_st = false; > >> + > >> + /* Set to 500 for the time being */ > >> + qse->weight = 500; > >> + RB_CLEAR_NODE(&qse->rb_node); > >> + qse->is_group_se = false; > >> + qse->on_st = false; > >> +} > >> + > >> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg, > >> + struct cfq_group *cfqg) > >> { > >> - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); > >> - struct cfq_group *cfqg = NULL; > >> - void *key = cfqd; > >> int i, j; > >> struct cfq_rb_root *st; > >> - struct 
backing_dev_info *bdi = &cfqd->queue->backing_dev_info; > >> unsigned int major, minor; > >> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; > >> > >> - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); > >> - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { > >> - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); > >> - cfqg->blkg.dev = MKDEV(major, minor); > >> - goto done; > >> - } > >> - if (cfqg || !create) > >> - goto done; > >> - > >> - cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); > >> - if (!cfqg) > >> - goto done; > >> + cfqg->grp_service_tree = CFQ_RB_ROOT; > >> > >> for_each_cfqg_st(cfqg, i, j, st) > >> *st = CFQ_RB_ROOT; > >> - RB_CLEAR_NODE(&cfqg->rb_node); > >> > >> /* > >> * Take the initial reference that will be released on destroy > >> @@ -1023,11 +1168,102 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) > >> sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); > >> cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, > >> MKDEV(major, minor)); > >> - cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); > >> - > >> + init_group_queue_entity(blkcg, cfqg); > >> /* Add group on cfqd list */ > >> hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); > >> +} > >> + > >> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) > >> +{ > >> + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) > >> + cfq_destroy_cfqg(cfqd, cfqg); > >> +} > >> + > >> +static void cfqg_set_parent(struct cfq_group *cfqg, struct cfq_group *p_cfqg) > >> +{ > >> + struct io_sched_entity *gse = &cfqg->group_se; > >> + struct io_sched_entity *qse = &cfqg->queue_se; > >> + struct io_sched_entity *p_gse = &p_cfqg->group_se; > >> + > >> + gse->parent = p_gse; > >> + gse->st = &p_cfqg->grp_service_tree; > >> + > >> + qse->parent = gse; > >> + qse->st = &cfqg->grp_service_tree; > >> + > >> + /* child reference */ > >> + atomic_inc(&p_cfqg->ref); > >> +} > >> + > >> +int 
cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup) > >> +{ > >> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); > >> + struct blkio_cgroup *p_blkcg; > >> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; > >> + unsigned int major, minor; > >> + struct cfq_group *cfqg, *p_cfqg; > >> + void *key = cfqd; > >> + int ret; > >> + > >> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); > >> + if (cfqg) { > >> + if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { > >> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); > >> + cfqg->blkg.dev = MKDEV(major, minor); > >> + } > >> + /* chain has already been built */ > >> + return 0; > >> + } > >> + > >> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); > >> + if (!cfqg) > >> + return -1; > >> + > >> + init_cfqg(cfqd, blkcg, cfqg); > >> + > >> + /* Already to the top group */ > >> + if (!cgroup->parent) > >> + return 0; > >> + > >> + ret = cfqg_chain_alloc(cfqd, cgroup->parent); > >> + if (ret == -1) { > >> + uninit_cfqg(cfqd, cfqg); > >> + return -1; > >> + } > >> + > >> + p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); > >> + p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key)); > >> + BUG_ON(p_cfqg == NULL); > >> + > >> + cfqg_set_parent(cfqg, p_cfqg); > >> + return 0; > >> +} > >> + > >> +static struct cfq_group * > >> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) > >> +{ > >> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); > >> + struct cfq_group *cfqg = NULL; > >> + void *key = cfqd; > >> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; > >> + unsigned int major, minor; > >> + int ret; > >> > >> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); > >> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { > >> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); > >> + cfqg->blkg.dev = MKDEV(major, minor); > >> + goto done; > >> + } > >> + if (cfqg || 
!create) > >> + goto done; > >> + > >> + ret = cfqg_chain_alloc(cfqd, cgroup); > >> + if (!ret) { > >> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); > >> + BUG_ON(cfqg == NULL); > >> + goto done; > >> + } > >> done: > >> return cfqg; > >> } > >> @@ -1067,33 +1303,6 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) > >> atomic_inc(&cfqq->cfqg->ref); > >> } > >> > >> -static void cfq_put_cfqg(struct cfq_group *cfqg) > >> -{ > >> - struct cfq_rb_root *st; > >> - int i, j; > >> - > >> - BUG_ON(atomic_read(&cfqg->ref) <= 0); > >> - if (!atomic_dec_and_test(&cfqg->ref)) > >> - return; > >> - for_each_cfqg_st(cfqg, i, j, st) > >> - BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); > >> - kfree(cfqg); > >> -} > >> - > >> -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) > >> -{ > >> - /* Something wrong if we are trying to remove same group twice */ > >> - BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); > >> - > >> - hlist_del_init(&cfqg->cfqd_node); > >> - > >> - /* > >> - * Put the reference taken at the time of creation so that when all > >> - * queues are gone, group can be destroyed. 
> >> - */ > >> - cfq_put_cfqg(cfqg); > >> -} > >> - > >> static void cfq_release_cfq_groups(struct cfq_data *cfqd) > >> { > >> struct hlist_node *pos, *n; > >> @@ -1668,9 +1877,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, > >> if (cfqq == cfqd->active_queue) > >> cfqd->active_queue = NULL; > >> > >> - if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) > >> - cfqd->grp_service_tree.active = NULL; > >> - > >> if (cfqd->active_cic) { > >> put_io_context(cfqd->active_cic->ioc); > >> cfqd->active_cic = NULL; > >> @@ -2173,17 +2379,26 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) > >> cfqd->noidle_tree_requires_idle = false; > >> } > >> > >> + > >> static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) > >> { > >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; > >> + struct cfq_group *root_group = &cfqd->root_group; > >> + struct cfq_rb_root *st = &root_group->grp_service_tree; > >> struct cfq_group *cfqg; > >> + struct io_sched_entity *se; > >> > >> - if (RB_EMPTY_ROOT(&st->rb)) > >> - return NULL; > >> - cfqg = cfq_rb_first_group(st); > >> - st->active = &cfqg->rb_node; > >> - update_min_vdisktime(st); > >> - return cfqg; > >> + do { > >> + se = cfq_rb_first_se(st); > >> + if (!se) > >> + return NULL; > >> + st->active = &se->rb_node; > >> + update_min_vdisktime(st); > >> + cfqg = cfqg_of_qse(se); > >> + if (cfqg) > >> + return cfqg; > >> + cfqg = cfqg_of_gse(se); > >> + st = &cfqg->grp_service_tree; > >> + } while (1); > >> } > >> > >> static void cfq_choose_cfqg(struct cfq_data *cfqd) > >> @@ -2215,15 +2430,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) > >> if (!cfqq) > >> goto new_queue; > >> > >> + > >> if (!cfqd->rq_queued) > >> return NULL; > >> > >> + > >> /* > >> * We were waiting for group to get backlogged. 
Expire the queue > >> */ > >> if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) > >> goto expire; > >> > >> + > >> /* > >> * The active queue has run out of time, expire it and select new. > >> */ > >> @@ -2245,6 +2463,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) > >> goto check_group_idle; > >> } > >> > >> + > >> /* > >> * The active queue has requests and isn't expired, allow it to > >> * dispatch. > >> @@ -2252,6 +2471,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) > >> if (!RB_EMPTY_ROOT(&cfqq->sort_list)) > >> goto keep_queue; > >> > >> + > >> /* > >> * If another queue has a request waiting within our mean seek > >> * distance, let it run. The expire code will check for close > >> @@ -2505,6 +2725,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) > >> } > >> > >> cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); > >> + > >> return 1; > >> } > >> > >> @@ -3854,17 +4075,25 @@ static void *cfq_init_queue(struct request_queue *q) > >> > >> cfqd->cic_index = i; > >> > >> - /* Init root service tree */ > >> - cfqd->grp_service_tree = CFQ_RB_ROOT; > >> - > >> /* Init root group */ > >> cfqg = &cfqd->root_group; > >> + cfqg->grp_service_tree = CFQ_RB_ROOT; > >> for_each_cfqg_st(cfqg, i, j, st) > >> *st = CFQ_RB_ROOT; > >> - RB_CLEAR_NODE(&cfqg->rb_node); > >> - > >> + cfqg->group_se.is_group_se = true; > >> + RB_CLEAR_NODE(&cfqg->group_se.rb_node); > >> + cfqg->group_se.on_st = false; > >> /* Give preference to root group over other groups */ > >> - cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; > >> + cfqg->group_se.weight = 2*BLKIO_WEIGHT_DEFAULT; > >> + cfqg->group_se.parent = NULL; > >> + cfqg->group_se.st = NULL; > >> + > >> + cfqg->queue_se.is_group_se = false; > >> + RB_CLEAR_NODE(&cfqg->queue_se.rb_node); > >> + cfqg->queue_se.on_st = false; > >> + cfqg->queue_se.weight = 2*BLKIO_WEIGHT_DEFAULT; > >> + cfqg->queue_se.parent = &cfqg->group_se; > >> + cfqg->queue_se.st = 
&cfqg->grp_service_tree; > >> > >> #ifdef CONFIG_CFQ_GROUP_IOSCHED > >> /* > >> -- > >> 1.6.5.2 > >> > >> > > ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-08-31 12:57 ` Vivek Goyal @ 2010-08-31 15:40 ` Nauman Rafique 2010-08-31 19:25 ` Vivek Goyal 2010-09-01 8:48 ` Gui Jianfeng 1 sibling, 1 reply; 19+ messages in thread From: Nauman Rafique @ 2010-08-31 15:40 UTC (permalink / raw) To: Vivek Goyal Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, linux kernel mailing list On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote: > On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote: >> Vivek Goyal wrote: >> > On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: >> >> Hi All, >> >> >> >> This patch enables cfq group hierarchical scheduling. >> >> >> >> With this patch, you can create a cgroup directory deeper than level 1. >> >> Now, I/O Bandwidth is distributed in a hierarchy way. For example: >> >> We create cgroup directories as following(the number represents weight): >> >> >> >> Root grp >> >> / \ >> >> grp_1(100) grp_2(400) >> >> / \ >> >> grp_3(200) grp_4(300) >> >> >> >> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, >> >> grp_2 will share 80% of total bandwidth. >> >> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) >> >> >> >> Design: >> >> o Each cfq group has its own group service tree. >> >> o Each cfq group contains a "group schedule entity" (gse) that >> >> schedules on parent cfq group's service tree. >> >> o Each cfq group contains a "queue schedule entity"(qse), it >> >> represents all cfqqs located on this cfq group. It schedules >> >> on this group's service tree. For the time being, root group >> >> qse's weight is 1000, and subgroup qse's weight is 500. >> >> o All gses and qse which belones to a same cfq group schedules >> >> on the same group service tree. >> > >> > Hi Gui, >> > >> > Thanks for the patch. I have few questions. >> > >> > - So how does the hierarchy look like, w.r.t root group. Something as >> > follows? 
>> > >> > >> > root >> > / | \ >> > q1 q2 G1 >> > >> > Assume there are two processes doin IO in root group and q1 and q2 are >> > cfqq queues for those processes and G1 is the cgroup created by user. >> > >> > If yes, then what algorithm do you use to do scheduling between q1, q2 >> > and G1? IOW, currently we have two algorithms operating in CFQ. One for >> > cfqq and other for groups. Group algorithm does not use the logic of >> > cfq_slice_offset(). >> >> Hi Vivek, >> >> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq. >> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and >> it will schedule on root group service with G1, as following: >> >> root group >> / \ >> qse(q1,q2) gse(G1) >> > > Ok. That's interesting. That raises another question that how hierarchy > should look like. IOW, how queue and groups should be treated in > hierarchy. > > CFS cpu scheduler treats queues and group at the same level. That is as > follows. > > root > / | \ > q1 q2 G1 > > In the past I had raised this question and Jens and corrado liked treating > queues and group at same level. > > Logically, q1, q2 and G1 are all children of root, so it makes sense to > treat them at same level and not group q1 and q2 in to a single entity and > group. > > One of the possible way forward could be this. > > - Treat queue and group at same level (like CFS) > > - Get rid of cfq_slice_offset() logic. That means without idling on, there > will be no ioprio difference between cfq queues. I think anyway as of > today that logic helps in so little situations that I would not mind > getting rid of it. Just that Jens should agree to it. > > - With this new scheme, it will break the existing semantics of root group > being at same level as child groups. To avoid that, we can probably > implement two modes (flat and hierarchical), something similar to what > memory cgroup controller has done. 
May be one tunable in root cgroup of > blkio "use_hierarchy". By default everything will be in flat mode and > if user wants hiearchical control, he needs to set user_hierarchy in > root group. Vivek, may be I am reading you wrong here. But you are first suggesting to add more complexity to treat queues and group at the same level. Then you are suggesting add even more complexity to fix the problems caused by that approach. Why do we need to treat queues and group at the same level? "CFS does it" is not a good argument. Please suggest cases in which it would be useful. > > I think memory controller provides "use_hierarchy" tunable in each > cgroup. I am not sure why do we need it in each cgroup and not just > in root cgroup. > > Thanks > Vivek > >> Thanks, >> Gui >> >> > >> > The reason being they are different because CFQ wanted to provide ioprio >> > service differentiation on SSD where slice idle is effectively zero. But >> > that slice_offset() logic is approximate at best and is not suitable >> > for group scheduling. >> > >> > I have yet to go through the patches in detail, so thought will ask you >> > that how have you handeled above. >> > >> > Vivek >> > >> >> o cfq group allocates in a recursive manner, that means when a cfq >> >> group needs to be allocated, the upper level cfq groups are also >> >> allocated. >> >> o When a cfq group served, not only charge this cfq group but also >> >> charge its ancestors. 
>> >> block/cfq-iosched.c | 477 +++++++++++++++++++++++++++++++++++++------------- >> >> 2 files changed, 353 insertions(+), 128 deletions(-) >> >> >> >> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c >> >> index 2fef1ef..aa06cb3 100644 >> >> --- a/block/blk-cgroup.c >> >> +++ b/block/blk-cgroup.c >> >> @@ -964,10 +964,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) >> >> goto done; >> >> } >> >> >> >> - /* Currently we do not support hierarchy deeper than two level (0,1) */ >> >> - if (parent != cgroup->top_cgroup) >> >> - return ERR_PTR(-EPERM); >> >> - >> >> blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); >> >> if (!blkcg) >> >> return ERR_PTR(-ENOMEM); >> >> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c >> >> index f65c6f0..4660d89 100644 >> >> --- a/block/cfq-iosched.c >> >> +++ b/block/cfq-iosched.c >> >> @@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida); >> >> #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) >> >> >> >> #define sample_valid(samples) ((samples) > 80) >> >> -#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) >> >> +#define rb_entry_se(node) \ >> >> + rb_entry((node), struct io_sched_entity, rb_node) >> >> >> >> /* >> >> * Most of our rbtree usage is for sorting with min extraction, so >> >> @@ -171,19 +172,40 @@ enum wl_type_t { >> >> SYNC_WORKLOAD = 2 >> >> }; >> >> >> >> -/* This is per cgroup per device grouping structure */ >> >> -struct cfq_group { >> >> +/* >> >> + * This's the schedule entity which is scheduled on group service tree. >> >> + * It represents shedule structure of a cfq group, or a bundle of cfq >> >> + * queues that locate in a cfq group. 
>> >> + */ >> >> +struct io_sched_entity { >> >> + struct cfq_rb_root *st; >> >> + >> >> /* group service_tree member */ >> >> struct rb_node rb_node; >> >> - >> >> - /* group service_tree key */ >> >> u64 vdisktime; >> >> unsigned int weight; >> >> bool on_st; >> >> + bool is_group_se; >> >> + struct io_sched_entity *parent; >> >> +}; >> >> + >> >> +/* This is per cgroup per device grouping structure */ >> >> +struct cfq_group { >> >> + /* cfq group sched entity */ >> >> + struct io_sched_entity group_se; >> >> + >> >> + /* sched entity for cfqqs */ >> >> + struct io_sched_entity queue_se; >> >> + >> >> + /* Service tree for cfq_groups and cfqqs set*/ >> >> + struct cfq_rb_root grp_service_tree; >> >> >> >> /* number of cfqq currently on this group */ >> >> int nr_cfqq; >> >> >> >> + /* number of sub cfq groups */ >> >> + int nr_subgp; >> >> + >> >> /* Per group busy queus average. Useful for workload slice calc. */ >> >> unsigned int busy_queues_avg[2]; >> >> /* >> >> @@ -210,8 +232,6 @@ struct cfq_group { >> >> */ >> >> struct cfq_data { >> >> struct request_queue *queue; >> >> - /* Root service tree for cfq_groups */ >> >> - struct cfq_rb_root grp_service_tree; >> >> struct cfq_group root_group; >> >> >> >> /* >> >> @@ -399,6 +419,22 @@ static inline bool iops_mode(struct cfq_data *cfqd) >> >> return false; >> >> } >> >> >> >> +static inline struct cfq_group *cfqg_of_gse(struct io_sched_entity *se) >> >> +{ >> >> + if (se->is_group_se) >> >> + return container_of(se, struct cfq_group, group_se); >> >> + else >> >> + return NULL; >> >> +} >> >> + >> >> +static inline struct cfq_group *cfqg_of_qse(struct io_sched_entity *se) >> >> +{ >> >> + if (!se->is_group_se) >> >> + return container_of(se, struct cfq_group, queue_se); >> >> + else >> >> + return NULL; >> >> +} >> >> + >> >> static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) >> >> { >> >> if (cfq_class_idle(cfqq)) >> >> @@ -522,12 +558,13 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct 
cfq_queue *cfqq) >> >> return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); >> >> } >> >> >> >> -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) >> >> +static inline u64 cfq_scale_slice(unsigned long delta, >> >> + struct io_sched_entity *se) >> >> { >> >> u64 d = delta << CFQ_SERVICE_SHIFT; >> >> >> >> d = d * BLKIO_WEIGHT_DEFAULT; >> >> - do_div(d, cfqg->weight); >> >> + do_div(d, se->weight); >> >> return d; >> >> } >> >> >> >> @@ -552,16 +589,16 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) >> >> static void update_min_vdisktime(struct cfq_rb_root *st) >> >> { >> >> u64 vdisktime = st->min_vdisktime; >> >> - struct cfq_group *cfqg; >> >> + struct io_sched_entity *se; >> >> >> >> if (st->active) { >> >> - cfqg = rb_entry_cfqg(st->active); >> >> - vdisktime = cfqg->vdisktime; >> >> + se = rb_entry_se(st->active); >> >> + vdisktime = se->vdisktime; >> >> } >> >> >> >> if (st->left) { >> >> - cfqg = rb_entry_cfqg(st->left); >> >> - vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); >> >> + se = rb_entry_se(st->left); >> >> + vdisktime = min_vdisktime(vdisktime, se->vdisktime); >> >> } >> >> >> >> st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); >> >> @@ -591,9 +628,10 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, >> >> static inline unsigned >> >> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) >> >> { >> >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >> >> + struct io_sched_entity *gse = &cfqg->queue_se; >> >> + struct cfq_rb_root *st = gse->st; >> >> >> >> - return cfq_target_latency * cfqg->weight / st->total_weight; >> >> + return cfq_target_latency * gse->weight / st->total_weight; >> >> } >> >> >> >> static inline void >> >> @@ -756,13 +794,13 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) >> >> return NULL; >> >> } >> >> >> >> -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) >> >> 
+static struct io_sched_entity *cfq_rb_first_se(struct cfq_rb_root *root) >> >> { >> >> if (!root->left) >> >> root->left = rb_first(&root->rb); >> >> >> >> if (root->left) >> >> - return rb_entry_cfqg(root->left); >> >> + return rb_entry_se(root->left); >> >> >> >> return NULL; >> >> } >> >> @@ -819,25 +857,25 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, >> >> } >> >> >> >> static inline s64 >> >> -cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) >> >> +se_key(struct cfq_rb_root *st, struct io_sched_entity *se) >> >> { >> >> - return cfqg->vdisktime - st->min_vdisktime; >> >> + return se->vdisktime - st->min_vdisktime; >> >> } >> >> >> >> static void >> >> -__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) >> >> +__io_sched_entity_add(struct cfq_rb_root *st, struct io_sched_entity *se) >> >> { >> >> struct rb_node **node = &st->rb.rb_node; >> >> struct rb_node *parent = NULL; >> >> - struct cfq_group *__cfqg; >> >> - s64 key = cfqg_key(st, cfqg); >> >> + struct io_sched_entity *__se; >> >> + s64 key = se_key(st, se); >> >> int left = 1; >> >> >> >> while (*node != NULL) { >> >> parent = *node; >> >> - __cfqg = rb_entry_cfqg(parent); >> >> + __se = rb_entry_se(parent); >> >> >> >> - if (key < cfqg_key(st, __cfqg)) >> >> + if (key < se_key(st, __se)) >> >> node = &parent->rb_left; >> >> else { >> >> node = &parent->rb_right; >> >> @@ -846,47 +884,80 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) >> >> } >> >> >> >> if (left) >> >> - st->left = &cfqg->rb_node; >> >> + st->left = &se->rb_node; >> >> >> >> - rb_link_node(&cfqg->rb_node, parent, node); >> >> - rb_insert_color(&cfqg->rb_node, &st->rb); >> >> + rb_link_node(&se->rb_node, parent, node); >> >> + rb_insert_color(&se->rb_node, &st->rb); >> >> } >> >> >> >> -static void >> >> -cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) >> >> +static void io_sched_entity_add(struct cfq_rb_root *st, >> >> + 
struct io_sched_entity *se) >> >> { >> >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >> >> - struct cfq_group *__cfqg; >> >> struct rb_node *n; >> >> + struct io_sched_entity *__se; >> >> >> >> - cfqg->nr_cfqq++; >> >> - if (cfqg->on_st) >> >> - return; >> >> >> >> + if (se->on_st) >> >> + return; >> >> /* >> >> * Currently put the group at the end. Later implement something >> >> * so that groups get lesser vtime based on their weights, so that >> >> * if group does not loose all if it was not continously backlogged. >> >> */ >> >> - n = rb_last(&st->rb); >> >> + n = rb_last(&se->st->rb); >> >> if (n) { >> >> - __cfqg = rb_entry_cfqg(n); >> >> - cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; >> >> + __se = rb_entry_se(n); >> >> + se->vdisktime = __se->vdisktime + CFQ_IDLE_DELAY; >> >> } else >> >> - cfqg->vdisktime = st->min_vdisktime; >> >> + se->vdisktime = st->min_vdisktime; >> >> >> >> - __cfq_group_service_tree_add(st, cfqg); >> >> - cfqg->on_st = true; >> >> - st->total_weight += cfqg->weight; >> >> + __io_sched_entity_add(se->st, se); >> >> + se->on_st = true; >> >> + st->total_weight += se->weight; >> >> +} >> >> + >> >> +static void >> >> +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) >> >> +{ >> >> + struct io_sched_entity *gse = &cfqg->group_se; >> >> + struct io_sched_entity *qse = &cfqg->queue_se; >> >> + struct cfq_group *__cfqg; >> >> + >> >> + cfqg->nr_cfqq++; >> >> + >> >> + io_sched_entity_add(qse->st, qse); >> >> + >> >> + while (gse && gse->parent) { >> >> + if (gse->on_st) >> >> + return; >> >> + io_sched_entity_add(gse->st, gse); >> >> + gse = gse->parent; >> >> + __cfqg = cfqg_of_gse(gse); >> >> + __cfqg->nr_subgp++; >> >> + } >> >> +} >> >> + >> >> +static void io_sched_entity_del(struct io_sched_entity *se) >> >> +{ >> >> + if (!RB_EMPTY_NODE(&se->rb_node)) >> >> + cfq_rb_erase(&se->rb_node, se->st); >> >> + >> >> + se->on_st = false; >> >> + se->st->total_weight -= se->weight; >> >> } >> >> >> 
>> static void >> >> cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) >> >> { >> >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >> >> + struct io_sched_entity *gse = &cfqg->group_se; >> >> + struct io_sched_entity *qse = &cfqg->queue_se; >> >> + struct cfq_group *__cfqg, *p_cfqg; >> >> + >> >> + if (gse->st && gse->st->active == &gse->rb_node) >> >> + gse->st->active = NULL; >> >> >> >> - if (st->active == &cfqg->rb_node) >> >> - st->active = NULL; >> >> + if (qse->st && qse->st->active == &qse->rb_node) >> >> + qse->st->active = NULL; >> >> >> >> BUG_ON(cfqg->nr_cfqq < 1); >> >> cfqg->nr_cfqq--; >> >> @@ -895,13 +966,25 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) >> >> if (cfqg->nr_cfqq) >> >> return; >> >> >> >> - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); >> >> - cfqg->on_st = false; >> >> - st->total_weight -= cfqg->weight; >> >> - if (!RB_EMPTY_NODE(&cfqg->rb_node)) >> >> - cfq_rb_erase(&cfqg->rb_node, st); >> >> - cfqg->saved_workload_slice = 0; >> >> - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); >> >> + /* dequeue queue se from group */ >> >> + io_sched_entity_del(qse); >> >> + >> >> + if (cfqg->nr_subgp) >> >> + return; >> >> + >> >> + /* prevent from dequeuing root group */ >> >> + while (gse && gse->parent) { >> >> + __cfqg = cfqg_of_gse(gse); >> >> + p_cfqg = cfqg_of_gse(gse->parent); >> >> + io_sched_entity_del(gse); >> >> + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1); >> >> + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group"); >> >> + __cfqg->saved_workload_slice = 0; >> >> + gse = gse->parent; >> >> + p_cfqg->nr_subgp--; >> >> + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp) >> >> + return; >> >> + } >> >> } >> >> >> >> static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) >> >> @@ -933,7 +1016,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) >> >> static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, >> >> 
struct cfq_queue *cfqq) >> >> { >> >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >> >> + struct io_sched_entity *gse = &cfqg->group_se; >> >> + struct io_sched_entity *qse = &cfqg->queue_se; >> >> unsigned int used_sl, charge; >> >> int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) >> >> - cfqg->service_tree_idle.count; >> >> @@ -946,10 +1030,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, >> >> else if (!cfq_cfqq_sync(cfqq) && !nr_sync) >> >> charge = cfqq->allocated_slice; >> >> >> >> - /* Can't update vdisktime while group is on service tree */ >> >> - cfq_rb_erase(&cfqg->rb_node, st); >> >> - cfqg->vdisktime += cfq_scale_slice(charge, cfqg); >> >> - __cfq_group_service_tree_add(st, cfqg); >> >> + /* >> >> + * update queue se's vdisktime. >> >> + * Can't update vdisktime while group is on service tree. >> >> + */ >> >> + >> >> + cfq_rb_erase(&qse->rb_node, qse->st); >> >> + qse->vdisktime += cfq_scale_slice(charge, qse); >> >> + __io_sched_entity_add(qse->st, qse); >> >> + if (&qse->rb_node == qse->st->active) >> >> + qse->st->active = NULL; >> >> + >> >> + while (gse && gse->parent) { >> >> + cfq_rb_erase(&gse->rb_node, gse->st); >> >> + gse->vdisktime += cfq_scale_slice(charge, gse); >> >> + __io_sched_entity_add(gse->st, gse); >> >> + if (&gse->rb_node == gse->st->active) >> >> + gse->st->active = NULL; >> >> + gse = gse->parent; >> >> + } >> >> >> >> /* This group is being expired. 
Save the context */ >> >> if (time_after(cfqd->workload_expires, jiffies)) { >> >> @@ -960,8 +1059,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, >> >> } else >> >> cfqg->saved_workload_slice = 0; >> >> >> >> - cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, >> >> - st->min_vdisktime); >> >> + cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", gse->vdisktime, >> >> + gse->st->min_vdisktime); >> >> + >> >> cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" >> >> " sect=%u", used_sl, cfqq->slice_dispatch, charge, >> >> iops_mode(cfqd), cfqq->nr_sectors); >> >> @@ -977,39 +1077,84 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) >> >> return NULL; >> >> } >> >> >> >> +static void cfq_put_cfqg(struct cfq_group *cfqg) >> >> +{ >> >> + struct cfq_rb_root *st; >> >> + int i, j; >> >> + struct io_sched_entity *gse; >> >> + struct cfq_group *p_cfqg; >> >> + >> >> + BUG_ON(atomic_read(&cfqg->ref) <= 0); >> >> + if (!atomic_dec_and_test(&cfqg->ref)) >> >> + return; >> >> + for_each_cfqg_st(cfqg, i, j, st) >> >> + BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); >> >> + >> >> + gse = &cfqg->group_se; >> >> + if (gse->parent) { >> >> + p_cfqg = cfqg_of_gse(gse->parent); >> >> + /* Drop the reference taken by children */ >> >> + atomic_dec(&p_cfqg->ref); >> >> + } >> >> + >> >> + kfree(cfqg); >> >> +} >> >> + >> >> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) >> >> +{ >> >> + /* Something wrong if we are trying to remove same group twice */ >> >> + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); >> >> + >> >> + hlist_del_init(&cfqg->cfqd_node); >> >> + >> >> + /* >> >> + * Put the reference taken at the time of creation so that when all >> >> + * queues are gone, group can be destroyed. 
>> >> + */ >> >> + cfq_put_cfqg(cfqg); >> >> +} >> >> + >> >> void >> >> cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) >> >> { >> >> - cfqg_of_blkg(blkg)->weight = weight; >> >> + struct cfq_group *cfqg; >> >> + struct io_sched_entity *gse; >> >> + >> >> + cfqg = cfqg_of_blkg(blkg); >> >> + gse = &cfqg->group_se; >> >> + gse->weight = weight; >> >> } >> >> >> >> -static struct cfq_group * >> >> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) >> >> +static void init_group_queue_entity(struct blkio_cgroup *blkcg, >> >> + struct cfq_group *cfqg) >> >> +{ >> >> + struct io_sched_entity *gse = &cfqg->group_se; >> >> + struct io_sched_entity *qse = &cfqg->queue_se; >> >> + >> >> + gse->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); >> >> + RB_CLEAR_NODE(&gse->rb_node); >> >> + gse->is_group_se = true; >> >> + gse->on_st = false; >> >> + >> >> + /* Set to 500 for the time being */ >> >> + qse->weight = 500; >> >> + RB_CLEAR_NODE(&qse->rb_node); >> >> + qse->is_group_se = false; >> >> + qse->on_st = false; >> >> +} >> >> + >> >> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg, >> >> + struct cfq_group *cfqg) >> >> { >> >> - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); >> >> - struct cfq_group *cfqg = NULL; >> >> - void *key = cfqd; >> >> int i, j; >> >> struct cfq_rb_root *st; >> >> - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; >> >> unsigned int major, minor; >> >> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; >> >> >> >> - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); >> >> - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { >> >> - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); >> >> - cfqg->blkg.dev = MKDEV(major, minor); >> >> - goto done; >> >> - } >> >> - if (cfqg || !create) >> >> - goto done; >> >> - >> >> - cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); >> >> - if (!cfqg) 
>> >> - goto done; >> >> + cfqg->grp_service_tree = CFQ_RB_ROOT; >> >> >> >> for_each_cfqg_st(cfqg, i, j, st) >> >> *st = CFQ_RB_ROOT; >> >> - RB_CLEAR_NODE(&cfqg->rb_node); >> >> >> >> /* >> >> * Take the initial reference that will be released on destroy >> >> @@ -1023,11 +1168,102 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) >> >> sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); >> >> cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, >> >> MKDEV(major, minor)); >> >> - cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); >> >> - >> >> + init_group_queue_entity(blkcg, cfqg); >> >> /* Add group on cfqd list */ >> >> hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); >> >> +} >> >> + >> >> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) >> >> +{ >> >> + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) >> >> + cfq_destroy_cfqg(cfqd, cfqg); >> >> +} >> >> + >> >> +static void cfqg_set_parent(struct cfq_group *cfqg, struct cfq_group *p_cfqg) >> >> +{ >> >> + struct io_sched_entity *gse = &cfqg->group_se; >> >> + struct io_sched_entity *qse = &cfqg->queue_se; >> >> + struct io_sched_entity *p_gse = &p_cfqg->group_se; >> >> + >> >> + gse->parent = p_gse; >> >> + gse->st = &p_cfqg->grp_service_tree; >> >> + >> >> + qse->parent = gse; >> >> + qse->st = &cfqg->grp_service_tree; >> >> + >> >> + /* child reference */ >> >> + atomic_inc(&p_cfqg->ref); >> >> +} >> >> + >> >> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup) >> >> +{ >> >> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); >> >> + struct blkio_cgroup *p_blkcg; >> >> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; >> >> + unsigned int major, minor; >> >> + struct cfq_group *cfqg, *p_cfqg; >> >> + void *key = cfqd; >> >> + int ret; >> >> + >> >> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); >> >> + if (cfqg) { >> >> + if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { >> >> 
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); >> >> + cfqg->blkg.dev = MKDEV(major, minor); >> >> + } >> >> + /* chain has already been built */ >> >> + return 0; >> >> + } >> >> + >> >> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); >> >> + if (!cfqg) >> >> + return -1; >> >> + >> >> + init_cfqg(cfqd, blkcg, cfqg); >> >> + >> >> + /* Already to the top group */ >> >> + if (!cgroup->parent) >> >> + return 0; >> >> + >> >> + ret = cfqg_chain_alloc(cfqd, cgroup->parent); >> >> + if (ret == -1) { >> >> + uninit_cfqg(cfqd, cfqg); >> >> + return -1; >> >> + } >> >> + >> >> + p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); >> >> + p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key)); >> >> + BUG_ON(p_cfqg == NULL); >> >> + >> >> + cfqg_set_parent(cfqg, p_cfqg); >> >> + return 0; >> >> +} >> >> + >> >> +static struct cfq_group * >> >> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) >> >> +{ >> >> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); >> >> + struct cfq_group *cfqg = NULL; >> >> + void *key = cfqd; >> >> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; >> >> + unsigned int major, minor; >> >> + int ret; >> >> >> >> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); >> >> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { >> >> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); >> >> + cfqg->blkg.dev = MKDEV(major, minor); >> >> + goto done; >> >> + } >> >> + if (cfqg || !create) >> >> + goto done; >> >> + >> >> + ret = cfqg_chain_alloc(cfqd, cgroup); >> >> + if (!ret) { >> >> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); >> >> + BUG_ON(cfqg == NULL); >> >> + goto done; >> >> + } >> >> done: >> >> return cfqg; >> >> } >> >> @@ -1067,33 +1303,6 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) >> >> atomic_inc(&cfqq->cfqg->ref); >> >> } >> >> >> >> -static void cfq_put_cfqg(struct cfq_group *cfqg) 
>> >> -{ >> >> - struct cfq_rb_root *st; >> >> - int i, j; >> >> - >> >> - BUG_ON(atomic_read(&cfqg->ref) <= 0); >> >> - if (!atomic_dec_and_test(&cfqg->ref)) >> >> - return; >> >> - for_each_cfqg_st(cfqg, i, j, st) >> >> - BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); >> >> - kfree(cfqg); >> >> -} >> >> - >> >> -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) >> >> -{ >> >> - /* Something wrong if we are trying to remove same group twice */ >> >> - BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); >> >> - >> >> - hlist_del_init(&cfqg->cfqd_node); >> >> - >> >> - /* >> >> - * Put the reference taken at the time of creation so that when all >> >> - * queues are gone, group can be destroyed. >> >> - */ >> >> - cfq_put_cfqg(cfqg); >> >> -} >> >> - >> >> static void cfq_release_cfq_groups(struct cfq_data *cfqd) >> >> { >> >> struct hlist_node *pos, *n; >> >> @@ -1668,9 +1877,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, >> >> if (cfqq == cfqd->active_queue) >> >> cfqd->active_queue = NULL; >> >> >> >> - if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) >> >> - cfqd->grp_service_tree.active = NULL; >> >> - >> >> if (cfqd->active_cic) { >> >> put_io_context(cfqd->active_cic->ioc); >> >> cfqd->active_cic = NULL; >> >> @@ -2173,17 +2379,26 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) >> >> cfqd->noidle_tree_requires_idle = false; >> >> } >> >> >> >> + >> >> static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) >> >> { >> >> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >> >> + struct cfq_group *root_group = &cfqd->root_group; >> >> + struct cfq_rb_root *st = &root_group->grp_service_tree; >> >> struct cfq_group *cfqg; >> >> + struct io_sched_entity *se; >> >> >> >> - if (RB_EMPTY_ROOT(&st->rb)) >> >> - return NULL; >> >> - cfqg = cfq_rb_first_group(st); >> >> - st->active = &cfqg->rb_node; >> >> - update_min_vdisktime(st); >> >> - return cfqg; >> >> + do { 
>> >> + se = cfq_rb_first_se(st); >> >> + if (!se) >> >> + return NULL; >> >> + st->active = &se->rb_node; >> >> + update_min_vdisktime(st); >> >> + cfqg = cfqg_of_qse(se); >> >> + if (cfqg) >> >> + return cfqg; >> >> + cfqg = cfqg_of_gse(se); >> >> + st = &cfqg->grp_service_tree; >> >> + } while (1); >> >> } >> >> >> >> static void cfq_choose_cfqg(struct cfq_data *cfqd) >> >> @@ -2215,15 +2430,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) >> >> if (!cfqq) >> >> goto new_queue; >> >> >> >> + >> >> if (!cfqd->rq_queued) >> >> return NULL; >> >> >> >> + >> >> /* >> >> * We were waiting for group to get backlogged. Expire the queue >> >> */ >> >> if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) >> >> goto expire; >> >> >> >> + >> >> /* >> >> * The active queue has run out of time, expire it and select new. >> >> */ >> >> @@ -2245,6 +2463,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) >> >> goto check_group_idle; >> >> } >> >> >> >> + >> >> /* >> >> * The active queue has requests and isn't expired, allow it to >> >> * dispatch. >> >> @@ -2252,6 +2471,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) >> >> if (!RB_EMPTY_ROOT(&cfqq->sort_list)) >> >> goto keep_queue; >> >> >> >> + >> >> /* >> >> * If another queue has a request waiting within our mean seek >> >> * distance, let it run. 
The expire code will check for close >> >> @@ -2505,6 +2725,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) >> >> } >> >> >> >> cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); >> >> + >> >> return 1; >> >> } >> >> >> >> @@ -3854,17 +4075,25 @@ static void *cfq_init_queue(struct request_queue *q) >> >> >> >> cfqd->cic_index = i; >> >> >> >> - /* Init root service tree */ >> >> - cfqd->grp_service_tree = CFQ_RB_ROOT; >> >> - >> >> /* Init root group */ >> >> cfqg = &cfqd->root_group; >> >> + cfqg->grp_service_tree = CFQ_RB_ROOT; >> >> for_each_cfqg_st(cfqg, i, j, st) >> >> *st = CFQ_RB_ROOT; >> >> - RB_CLEAR_NODE(&cfqg->rb_node); >> >> - >> >> + cfqg->group_se.is_group_se = true; >> >> + RB_CLEAR_NODE(&cfqg->group_se.rb_node); >> >> + cfqg->group_se.on_st = false; >> >> /* Give preference to root group over other groups */ >> >> - cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; >> >> + cfqg->group_se.weight = 2*BLKIO_WEIGHT_DEFAULT; >> >> + cfqg->group_se.parent = NULL; >> >> + cfqg->group_se.st = NULL; >> >> + >> >> + cfqg->queue_se.is_group_se = false; >> >> + RB_CLEAR_NODE(&cfqg->queue_se.rb_node); >> >> + cfqg->queue_se.on_st = false; >> >> + cfqg->queue_se.weight = 2*BLKIO_WEIGHT_DEFAULT; >> >> + cfqg->queue_se.parent = &cfqg->group_se; >> >> + cfqg->queue_se.st = &cfqg->grp_service_tree; >> >> >> >> #ifdef CONFIG_CFQ_GROUP_IOSCHED >> >> /* >> >> -- >> >> 1.6.5.2 >> >> >> >> >> > > ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-08-31 15:40 ` Nauman Rafique @ 2010-08-31 19:25 ` Vivek Goyal 2010-09-01 8:50 ` Gui Jianfeng 0 siblings, 1 reply; 19+ messages in thread From: Vivek Goyal @ 2010-08-31 19:25 UTC (permalink / raw) To: Nauman Rafique Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, linux kernel mailing list On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote: > On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote: > > On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote: > >> Vivek Goyal wrote: > >> > On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: > >> >> Hi All, > >> >> > >> >> This patch enables cfq group hierarchical scheduling. > >> >> > >> >> With this patch, you can create a cgroup directory deeper than level 1. > >> >> Now, I/O Bandwidth is distributed in a hierarchy way. For example: > >> >> We create cgroup directories as following(the number represents weight): > >> >> > >> >> Root grp > >> >> / \ > >> >> grp_1(100) grp_2(400) > >> >> / \ > >> >> grp_3(200) grp_4(300) > >> >> > >> >> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, > >> >> grp_2 will share 80% of total bandwidth. > >> >> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) > >> >> > >> >> Design: > >> >> o Each cfq group has its own group service tree. > >> >> o Each cfq group contains a "group schedule entity" (gse) that > >> >> schedules on parent cfq group's service tree. > >> >> o Each cfq group contains a "queue schedule entity"(qse), it > >> >> represents all cfqqs located on this cfq group. It schedules > >> >> on this group's service tree. For the time being, root group > >> >> qse's weight is 1000, and subgroup qse's weight is 500. > >> >> o All gses and qse which belones to a same cfq group schedules > >> >> on the same group service tree. > >> > > >> > Hi Gui, > >> > > >> > Thanks for the patch. 
I have few questions. > >> > > >> > - So how does the hierarchy look like, w.r.t root group. Something as > >> > follows? > >> > > >> > > >> > root > >> > / | \ > >> > q1 q2 G1 > >> > > >> > Assume there are two processes doin IO in root group and q1 and q2 are > >> > cfqq queues for those processes and G1 is the cgroup created by user. > >> > > >> > If yes, then what algorithm do you use to do scheduling between q1, q2 > >> > and G1? IOW, currently we have two algorithms operating in CFQ. One for > >> > cfqq and other for groups. Group algorithm does not use the logic of > >> > cfq_slice_offset(). > >> > >> Hi Vivek, > >> > >> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq. > >> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and > >> it will schedule on root group service with G1, as following: > >> > >> root group > >> / \ > >> qse(q1,q2) gse(G1) > >> > > > > Ok. That's interesting. That raises another question that how hierarchy > > should look like. IOW, how queue and groups should be treated in > > hierarchy. > > > > CFS cpu scheduler treats queues and group at the same level. That is as > > follows. > > > > root > > / | \ > > q1 q2 G1 > > > > In the past I had raised this question and Jens and corrado liked treating > > queues and group at same level. > > > > Logically, q1, q2 and G1 are all children of root, so it makes sense to > > treat them at same level and not group q1 and q2 in to a single entity and > > group. > > > > One of the possible way forward could be this. > > > > - Treat queue and group at same level (like CFS) > > > > - Get rid of cfq_slice_offset() logic. That means without idling on, there > > will be no ioprio difference between cfq queues. I think anyway as of > > today that logic helps in so little situations that I would not mind > > getting rid of it. Just that Jens should agree to it. 
> > > > - With this new scheme, it will break the existing semantics of root group > > being at same level as child groups. To avoid that, we can probably > > implement two modes (flat and hierarchical), something similar to what > > memory cgroup controller has done. May be one tunable in root cgroup of > > blkio "use_hierarchy". By default everything will be in flat mode and > > if user wants hiearchical control, he needs to set user_hierarchy in > > root group. > > Vivek, may be I am reading you wrong here. But you are first > suggesting to add more complexity to treat queues and group at the > same level. Then you are suggesting add even more complexity to fix > the problems caused by that approach. > > Why do we need to treat queues and group at the same level? "CFS does > it" is not a good argument. Sure it is not a very good argument but at the same time one would need a very good argument that why we should do things differently. - If a user has mounted cpu and blkio controller together and both the controllers are viewing the same hierarchy differently, then it is odd. We need a good reason that why different arrangement makes sense. - To me, both group and cfq queue are children of root group and it makes sense to treat them independent childrens instead of putting all the queues in one logical group which inherits the weight of parent. - With this new scheme, I am finding it hard to visualize the hierachy. How do you assign the weights to queue entities of a group. It is more like a invisible group with-in group. We shall have to create new tunable which can speicy the weight for this hidden group. So in summary I am liking the "queue at same level as group" scheme for two reasons. - It is more intutive to visualize and implement. It follows the true hierarchy as seen by cgroup file system. - CFS has already implemented this scheme. So we need a strong arguemnt to justify why we should not follow the same thing. 
Especially for the case where the user has co-mounted the cpu and blkio controllers. - It can achieve the same goal as the "hidden group" proposal just by creating a cgroup explicitly and moving all threads into that group. Why do you think that the "hidden group" proposal is better than "treating queue at same level as group"? Thanks Vivek ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-08-31 19:25 ` Vivek Goyal @ 2010-09-01 8:50 ` Gui Jianfeng 2010-09-01 15:49 ` Nauman Rafique 2010-09-02 2:20 ` Vivek Goyal 0 siblings, 2 replies; 19+ messages in thread From: Gui Jianfeng @ 2010-09-01 8:50 UTC (permalink / raw) To: Vivek Goyal Cc: Nauman Rafique, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki Vivek Goyal wrote: > On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote: >> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote: >>> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote: >>>> Vivek Goyal wrote: >>>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: >>>>>> Hi All, >>>>>> >>>>>> This patch enables cfq group hierarchical scheduling. >>>>>> >>>>>> With this patch, you can create a cgroup directory deeper than level 1. >>>>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example: >>>>>> We create cgroup directories as following(the number represents weight): >>>>>> >>>>>> Root grp >>>>>> / \ >>>>>> grp_1(100) grp_2(400) >>>>>> / \ >>>>>> grp_3(200) grp_4(300) >>>>>> >>>>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, >>>>>> grp_2 will share 80% of total bandwidth. >>>>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) >>>>>> >>>>>> Design: >>>>>> o Each cfq group has its own group service tree. >>>>>> o Each cfq group contains a "group schedule entity" (gse) that >>>>>> schedules on parent cfq group's service tree. >>>>>> o Each cfq group contains a "queue schedule entity"(qse), it >>>>>> represents all cfqqs located on this cfq group. It schedules >>>>>> on this group's service tree. For the time being, root group >>>>>> qse's weight is 1000, and subgroup qse's weight is 500. >>>>>> o All gses and qse which belones to a same cfq group schedules >>>>>> on the same group service tree. 
>>>>> Hi Gui, >>>>> >>>>> Thanks for the patch. I have few questions. >>>>> >>>>> - So how does the hierarchy look like, w.r.t root group. Something as >>>>> follows? >>>>> >>>>> >>>>> root >>>>> / | \ >>>>> q1 q2 G1 >>>>> >>>>> Assume there are two processes doin IO in root group and q1 and q2 are >>>>> cfqq queues for those processes and G1 is the cgroup created by user. >>>>> >>>>> If yes, then what algorithm do you use to do scheduling between q1, q2 >>>>> and G1? IOW, currently we have two algorithms operating in CFQ. One for >>>>> cfqq and other for groups. Group algorithm does not use the logic of >>>>> cfq_slice_offset(). >>>> Hi Vivek, >>>> >>>> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq. >>>> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and >>>> it will schedule on root group service with G1, as following: >>>> >>>> root group >>>> / \ >>>> qse(q1,q2) gse(G1) >>>> >>> Ok. That's interesting. That raises another question that how hierarchy >>> should look like. IOW, how queue and groups should be treated in >>> hierarchy. >>> >>> CFS cpu scheduler treats queues and group at the same level. That is as >>> follows. >>> >>> root >>> / | \ >>> q1 q2 G1 >>> >>> In the past I had raised this question and Jens and corrado liked treating >>> queues and group at same level. >>> >>> Logically, q1, q2 and G1 are all children of root, so it makes sense to >>> treat them at same level and not group q1 and q2 in to a single entity and >>> group. >>> >>> One of the possible way forward could be this. >>> >>> - Treat queue and group at same level (like CFS) >>> >>> - Get rid of cfq_slice_offset() logic. That means without idling on, there >>> will be no ioprio difference between cfq queues. I think anyway as of >>> today that logic helps in so little situations that I would not mind >>> getting rid of it. Just that Jens should agree to it. 
>>> >>> - With this new scheme, it will break the existing semantics of root group >>> being at same level as child groups. To avoid that, we can probably >>> implement two modes (flat and hierarchical), something similar to what >>> memory cgroup controller has done. May be one tunable in root cgroup of >>> blkio "use_hierarchy". By default everything will be in flat mode and >>> if user wants hiearchical control, he needs to set user_hierarchy in >>> root group. >> Vivek, may be I am reading you wrong here. But you are first >> suggesting to add more complexity to treat queues and group at the >> same level. Then you are suggesting add even more complexity to fix >> the problems caused by that approach. >> >> Why do we need to treat queues and group at the same level? "CFS does >> it" is not a good argument. > > Sure it is not a very good argument but at the same time one would need > a very good argument that why we should do things differently. > > - If a user has mounted cpu and blkio controller together and both the > controllers are viewing the same hierarchy differently, then it is > odd. We need a good reason that why different arrangement makes sense. Hi Vivek, Even if we mount cpu and blkio together, to me, it's ok for cpu and blkio having their own logic, since they are totally different cgroup subsystems. > > - To me, both group and cfq queue are children of root group and it > makes sense to treat them independent childrens instead of putting > all the queues in one logical group which inherits the weight of > parent. > > - With this new scheme, I am finding it hard to visualize the hierachy. > How do you assign the weights to queue entities of a group. It is more > like a invisible group with-in group. We shall have to create new > tunable which can speicy the weight for this hidden group. For the time being, the root "qse" weight is 1000 and others is 500, they don't inherit the weight of parent. 
I was thinking that maybe we can determine the qse weight in terms of the queue number and weight in this group and subgroups. Thanks, Gui > > > So in summary I am liking the "queue at same level as group" scheme for > two reasons. > > - It is more intutive to visualize and implement. It follows the true > hierarchy as seen by cgroup file system. > > - CFS has already implemented this scheme. So we need a strong arguemnt > to justify why we should not follow the same thing. Especially for > the case where user has co-mounted cpu and blkio controller. > > - It can achieve the same goal as "hidden group" proposal just by > creating a cgroup explicitly and moving all threads in that group. > > Why do you think that "hidden group" proposal is better than "treating > queue at same level as group" ? > > Thanks > Vivek > > ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-09-01 8:50 ` Gui Jianfeng @ 2010-09-01 15:49 ` Nauman Rafique 2010-09-01 17:10 ` Vivek Goyal 2010-09-02 2:20 ` Vivek Goyal 1 sibling, 1 reply; 19+ messages in thread From: Nauman Rafique @ 2010-09-01 15:49 UTC (permalink / raw) To: Gui Jianfeng Cc: Vivek Goyal, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki On Wed, Sep 1, 2010 at 1:50 AM, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote: > Vivek Goyal wrote: >> On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote: >>> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote: >>>> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote: >>>>> Vivek Goyal wrote: >>>>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: >>>>>>> Hi All, >>>>>>> >>>>>>> This patch enables cfq group hierarchical scheduling. >>>>>>> >>>>>>> With this patch, you can create a cgroup directory deeper than level 1. >>>>>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example: >>>>>>> We create cgroup directories as following(the number represents weight): >>>>>>> >>>>>>> Root grp >>>>>>> / \ >>>>>>> grp_1(100) grp_2(400) >>>>>>> / \ >>>>>>> grp_3(200) grp_4(300) >>>>>>> >>>>>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, >>>>>>> grp_2 will share 80% of total bandwidth. >>>>>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) >>>>>>> >>>>>>> Design: >>>>>>> o Each cfq group has its own group service tree. >>>>>>> o Each cfq group contains a "group schedule entity" (gse) that >>>>>>> schedules on parent cfq group's service tree. >>>>>>> o Each cfq group contains a "queue schedule entity"(qse), it >>>>>>> represents all cfqqs located on this cfq group. It schedules >>>>>>> on this group's service tree. For the time being, root group >>>>>>> qse's weight is 1000, and subgroup qse's weight is 500. 
>>>>>>> o All gses and qse which belones to a same cfq group schedules >>>>>>> on the same group service tree. >>>>>> Hi Gui, >>>>>> >>>>>> Thanks for the patch. I have few questions. >>>>>> >>>>>> - So how does the hierarchy look like, w.r.t root group. Something as >>>>>> follows? >>>>>> >>>>>> >>>>>> root >>>>>> / | \ >>>>>> q1 q2 G1 >>>>>> >>>>>> Assume there are two processes doin IO in root group and q1 and q2 are >>>>>> cfqq queues for those processes and G1 is the cgroup created by user. >>>>>> >>>>>> If yes, then what algorithm do you use to do scheduling between q1, q2 >>>>>> and G1? IOW, currently we have two algorithms operating in CFQ. One for >>>>>> cfqq and other for groups. Group algorithm does not use the logic of >>>>>> cfq_slice_offset(). >>>>> Hi Vivek, >>>>> >>>>> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq. >>>>> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and >>>>> it will schedule on root group service with G1, as following: >>>>> >>>>> root group >>>>> / \ >>>>> qse(q1,q2) gse(G1) >>>>> >>>> Ok. That's interesting. That raises another question that how hierarchy >>>> should look like. IOW, how queue and groups should be treated in >>>> hierarchy. >>>> >>>> CFS cpu scheduler treats queues and group at the same level. That is as >>>> follows. >>>> >>>> root >>>> / | \ >>>> q1 q2 G1 >>>> >>>> In the past I had raised this question and Jens and corrado liked treating >>>> queues and group at same level. >>>> >>>> Logically, q1, q2 and G1 are all children of root, so it makes sense to >>>> treat them at same level and not group q1 and q2 in to a single entity and >>>> group. >>>> >>>> One of the possible way forward could be this. >>>> >>>> - Treat queue and group at same level (like CFS) >>>> >>>> - Get rid of cfq_slice_offset() logic. That means without idling on, there >>>> will be no ioprio difference between cfq queues. 
I think anyway as of >>>> today that logic helps in so little situations that I would not mind >>>> getting rid of it. Just that Jens should agree to it. >>>> >>>> - With this new scheme, it will break the existing semantics of root group >>>> being at same level as child groups. To avoid that, we can probably >>>> implement two modes (flat and hierarchical), something similar to what >>>> memory cgroup controller has done. May be one tunable in root cgroup of >>>> blkio "use_hierarchy". By default everything will be in flat mode and >>>> if user wants hiearchical control, he needs to set user_hierarchy in >>>> root group. >>> Vivek, may be I am reading you wrong here. But you are first >>> suggesting to add more complexity to treat queues and group at the >>> same level. Then you are suggesting add even more complexity to fix >>> the problems caused by that approach. >>> >>> Why do we need to treat queues and group at the same level? "CFS does >>> it" is not a good argument. >> >> Sure it is not a very good argument but at the same time one would need >> a very good argument that why we should do things differently. >> >> - If a user has mounted cpu and blkio controller together and both the >> controllers are viewing the same hierarchy differently, then it is >> odd. We need a good reason that why different arrangement makes sense. > > Hi Vivek, > > Even if we mount cpu and blkio together, to me, it's ok for cpu and blkio > having their own logic, since they are totally different cgroup subsystems. > >> >> - To me, both group and cfq queue are children of root group and it >> makes sense to treat them independent childrens instead of putting >> all the queues in one logical group which inherits the weight of >> parent. >> >> - With this new scheme, I am finding it hard to visualize the hierachy. >> How do you assign the weights to queue entities of a group. It is more >> like a invisible group with-in group. 
We shall have to create new >> tunable which can speicy the weight for this hidden group. > > For the time being, the root "qse" weight is 1000 and others is 500, they don't > inherit the weight of parent. I was thinking that maybe we can determine the qse > weight in term of the queue number and weight in this group and subgroups. > > Thanks, > Gui > >> >> >> So in summary I am liking the "queue at same level as group" scheme for >> two reasons. >> >> - It is more intutive to visualize and implement. It follows the true >> hierarchy as seen by cgroup file system. >> >> - CFS has already implemented this scheme. So we need a strong arguemnt >> to justify why we should not follow the same thing. Especially for >> the case where user has co-mounted cpu and blkio controller. >> >> - It can achieve the same goal as "hidden group" proposal just by >> creating a cgroup explicitly and moving all threads in that group. >> >> Why do you think that "hidden group" proposal is better than "treating >> queue at same level as group" ? There are multiple reasons for "hidden group" proposal being a better approach. - "Hidden group" would allow us to keep scheduling queues using the CFQ queue scheduling logic. And does not require any major changes in CFQ. Aren't we already using that approach to deal with queues at the root group? - If queues and groups are treated at the same level, queues can end up in root cgroup. And we cannot put an upper bound on the number of those queues. Those queues can consume system resources in proportion to their number, causing the performance of groups to suffer. If we have "hidden group", we can configure it to a small weight, and that would limit the impact these queues in root group can have. >> >> Thanks >> Vivek >> >> > > ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-09-01 15:49 ` Nauman Rafique @ 2010-09-01 17:10 ` Vivek Goyal 2010-09-01 17:15 ` Nauman Rafique 0 siblings, 1 reply; 19+ messages in thread From: Vivek Goyal @ 2010-09-01 17:10 UTC (permalink / raw) To: Nauman Rafique Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki On Wed, Sep 01, 2010 at 08:49:26AM -0700, Nauman Rafique wrote: > On Wed, Sep 1, 2010 at 1:50 AM, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote: > > Vivek Goyal wrote: > >> On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote: > >>> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote: > >>>> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote: > >>>>> Vivek Goyal wrote: > >>>>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: > >>>>>>> Hi All, > >>>>>>> > >>>>>>> This patch enables cfq group hierarchical scheduling. > >>>>>>> > >>>>>>> With this patch, you can create a cgroup directory deeper than level 1. > >>>>>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example: > >>>>>>> We create cgroup directories as following(the number represents weight): > >>>>>>> > >>>>>>> Root grp > >>>>>>> / \ > >>>>>>> grp_1(100) grp_2(400) > >>>>>>> / \ > >>>>>>> grp_3(200) grp_4(300) > >>>>>>> > >>>>>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, > >>>>>>> grp_2 will share 80% of total bandwidth. > >>>>>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) > >>>>>>> > >>>>>>> Design: > >>>>>>> o Each cfq group has its own group service tree. > >>>>>>> o Each cfq group contains a "group schedule entity" (gse) that > >>>>>>> schedules on parent cfq group's service tree. > >>>>>>> o Each cfq group contains a "queue schedule entity"(qse), it > >>>>>>> represents all cfqqs located on this cfq group. It schedules > >>>>>>> on this group's service tree. 
For the time being, root group > >>>>>>> qse's weight is 1000, and subgroup qse's weight is 500. > >>>>>>> o All gses and qse which belones to a same cfq group schedules > >>>>>>> on the same group service tree. > >>>>>> Hi Gui, > >>>>>> > >>>>>> Thanks for the patch. I have few questions. > >>>>>> > >>>>>> - So how does the hierarchy look like, w.r.t root group. Something as > >>>>>> follows? > >>>>>> > >>>>>> > >>>>>> root > >>>>>> / | \ > >>>>>> q1 q2 G1 > >>>>>> > >>>>>> Assume there are two processes doin IO in root group and q1 and q2 are > >>>>>> cfqq queues for those processes and G1 is the cgroup created by user. > >>>>>> > >>>>>> If yes, then what algorithm do you use to do scheduling between q1, q2 > >>>>>> and G1? IOW, currently we have two algorithms operating in CFQ. One for > >>>>>> cfqq and other for groups. Group algorithm does not use the logic of > >>>>>> cfq_slice_offset(). > >>>>> Hi Vivek, > >>>>> > >>>>> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq. > >>>>> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and > >>>>> it will schedule on root group service with G1, as following: > >>>>> > >>>>> root group > >>>>> / \ > >>>>> qse(q1,q2) gse(G1) > >>>>> > >>>> Ok. That's interesting. That raises another question that how hierarchy > >>>> should look like. IOW, how queue and groups should be treated in > >>>> hierarchy. > >>>> > >>>> CFS cpu scheduler treats queues and group at the same level. That is as > >>>> follows. > >>>> > >>>> root > >>>> / | \ > >>>> q1 q2 G1 > >>>> > >>>> In the past I had raised this question and Jens and corrado liked treating > >>>> queues and group at same level. > >>>> > >>>> Logically, q1, q2 and G1 are all children of root, so it makes sense to > >>>> treat them at same level and not group q1 and q2 in to a single entity and > >>>> group. > >>>> > >>>> One of the possible way forward could be this. 
> >>>> > >>>> - Treat queue and group at same level (like CFS) > >>>> > >>>> - Get rid of cfq_slice_offset() logic. That means without idling on, there > >>>> will be no ioprio difference between cfq queues. I think anyway as of > >>>> today that logic helps in so little situations that I would not mind > >>>> getting rid of it. Just that Jens should agree to it. > >>>> > >>>> - With this new scheme, it will break the existing semantics of root group > >>>> being at same level as child groups. To avoid that, we can probably > >>>> implement two modes (flat and hierarchical), something similar to what > >>>> memory cgroup controller has done. May be one tunable in root cgroup of > >>>> blkio "use_hierarchy". By default everything will be in flat mode and > >>>> if user wants hiearchical control, he needs to set user_hierarchy in > >>>> root group. > >>> Vivek, may be I am reading you wrong here. But you are first > >>> suggesting to add more complexity to treat queues and group at the > >>> same level. Then you are suggesting add even more complexity to fix > >>> the problems caused by that approach. > >>> > >>> Why do we need to treat queues and group at the same level? "CFS does > >>> it" is not a good argument. > >> > >> Sure it is not a very good argument but at the same time one would need > >> a very good argument that why we should do things differently. > >> > >> - If a user has mounted cpu and blkio controller together and both the > >> controllers are viewing the same hierarchy differently, then it is > >> odd. We need a good reason that why different arrangement makes sense. > > > > Hi Vivek, > > > > Even if we mount cpu and blkio together, to me, it's ok for cpu and blkio > > having their own logic, since they are totally different cgroup subsystems. 
> > > >> > >> - To me, both group and cfq queue are children of root group and it > >> makes sense to treat them independent childrens instead of putting > >> all the queues in one logical group which inherits the weight of > >> parent. > >> > >> - With this new scheme, I am finding it hard to visualize the hierachy. > >> How do you assign the weights to queue entities of a group. It is more > >> like a invisible group with-in group. We shall have to create new > >> tunable which can speicy the weight for this hidden group. > > > > For the time being, the root "qse" weight is 1000 and others is 500, they don't > > inherit the weight of parent. I was thinking that maybe we can determine the qse > > weight in term of the queue number and weight in this group and subgroups. > > > > Thanks, > > Gui > > > >> > >> > >> So in summary I am liking the "queue at same level as group" scheme for > >> two reasons. > >> > >> - It is more intutive to visualize and implement. It follows the true > >> hierarchy as seen by cgroup file system. > >> > >> - CFS has already implemented this scheme. So we need a strong arguemnt > >> to justify why we should not follow the same thing. Especially for > >> the case where user has co-mounted cpu and blkio controller. > >> > >> - It can achieve the same goal as "hidden group" proposal just by > >> creating a cgroup explicitly and moving all threads in that group. > >> > >> Why do you think that "hidden group" proposal is better than "treating > >> queue at same level as group" ? > > There are multiple reasons for "hidden group" proposal being a better approach. > > - "Hidden group" would allow us to keep scheduling queues using the > CFQ queue scheduling logic. And does not require any major changes in > CFQ. Aren't we already using that approach to deal with queues at the > root group? Currently we are operating in flat mode where all the groups are at same level (irrespective their position in cgroup hiearchy). 
> > - If queues and groups are treated at the same level, queues can end > up in root cgroup. And we cannot put an upper bound on the number of > those queues. Those queues can consume system resources in proportion > to their number, causing the performance of groups to suffer. If we > have "hidden group", we can configure it to a small weight, and that > would limit the impact these queues in root group can have. To limit the impact of other queues in cgroup, one can use libcgroup to automatically place new threads or tasks into a subgroup. I understand that kernel doing it by default should help though. It is less work in terms of configuration. But I am not sure that's a good argument to design kernel functionality. Kernel functionality should be pretty generic. Anyway, how would you assign the weight to the hidden group. What's the interface for that? A new cgroup file inside each cgroup? Personally I think that's little odd interface. Every group has one hidden group where all the queues in that group go and weight of that group can be specified by a cgroup file. But anyway, I am not tied to any of the approach. I am just trying to make sure that we have put enough thought into it as changing it later will be hard. Vivek ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-09-01 17:10 ` Vivek Goyal @ 2010-09-01 17:15 ` Nauman Rafique 2010-09-01 17:21 ` Vivek Goyal 2010-09-02 0:30 ` Gui Jianfeng 0 siblings, 2 replies; 19+ messages in thread From: Nauman Rafique @ 2010-09-01 17:15 UTC (permalink / raw) To: Vivek Goyal Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki On Wed, Sep 1, 2010 at 10:10 AM, Vivek Goyal <vgoyal@redhat.com> wrote: > On Wed, Sep 01, 2010 at 08:49:26AM -0700, Nauman Rafique wrote: >> On Wed, Sep 1, 2010 at 1:50 AM, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote: >> > Vivek Goyal wrote: >> >> On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote: >> >>> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote: >> >>>> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote: >> >>>>> Vivek Goyal wrote: >> >>>>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: >> >>>>>>> Hi All, >> >>>>>>> >> >>>>>>> This patch enables cfq group hierarchical scheduling. >> >>>>>>> >> >>>>>>> With this patch, you can create a cgroup directory deeper than level 1. >> >>>>>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example: >> >>>>>>> We create cgroup directories as following(the number represents weight): >> >>>>>>> >> >>>>>>> Root grp >> >>>>>>> / \ >> >>>>>>> grp_1(100) grp_2(400) >> >>>>>>> / \ >> >>>>>>> grp_3(200) grp_4(300) >> >>>>>>> >> >>>>>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, >> >>>>>>> grp_2 will share 80% of total bandwidth. >> >>>>>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) >> >>>>>>> >> >>>>>>> Design: >> >>>>>>> o Each cfq group has its own group service tree. >> >>>>>>> o Each cfq group contains a "group schedule entity" (gse) that >> >>>>>>> schedules on parent cfq group's service tree. 
>> >>>>>>> o Each cfq group contains a "queue schedule entity"(qse), it >> >>>>>>> represents all cfqqs located on this cfq group. It schedules >> >>>>>>> on this group's service tree. For the time being, root group >> >>>>>>> qse's weight is 1000, and subgroup qse's weight is 500. >> >>>>>>> o All gses and qse which belones to a same cfq group schedules >> >>>>>>> on the same group service tree. >> >>>>>> Hi Gui, >> >>>>>> >> >>>>>> Thanks for the patch. I have few questions. >> >>>>>> >> >>>>>> - So how does the hierarchy look like, w.r.t root group. Something as >> >>>>>> follows? >> >>>>>> >> >>>>>> >> >>>>>> root >> >>>>>> / | \ >> >>>>>> q1 q2 G1 >> >>>>>> >> >>>>>> Assume there are two processes doin IO in root group and q1 and q2 are >> >>>>>> cfqq queues for those processes and G1 is the cgroup created by user. >> >>>>>> >> >>>>>> If yes, then what algorithm do you use to do scheduling between q1, q2 >> >>>>>> and G1? IOW, currently we have two algorithms operating in CFQ. One for >> >>>>>> cfqq and other for groups. Group algorithm does not use the logic of >> >>>>>> cfq_slice_offset(). >> >>>>> Hi Vivek, >> >>>>> >> >>>>> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq. >> >>>>> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and >> >>>>> it will schedule on root group service with G1, as following: >> >>>>> >> >>>>> root group >> >>>>> / \ >> >>>>> qse(q1,q2) gse(G1) >> >>>>> >> >>>> Ok. That's interesting. That raises another question that how hierarchy >> >>>> should look like. IOW, how queue and groups should be treated in >> >>>> hierarchy. >> >>>> >> >>>> CFS cpu scheduler treats queues and group at the same level. That is as >> >>>> follows. >> >>>> >> >>>> root >> >>>> / | \ >> >>>> q1 q2 G1 >> >>>> >> >>>> In the past I had raised this question and Jens and corrado liked treating >> >>>> queues and group at same level. 
>> >>>> >> >>>> Logically, q1, q2 and G1 are all children of root, so it makes sense to >> >>>> treat them at same level and not group q1 and q2 in to a single entity and >> >>>> group. >> >>>> >> >>>> One of the possible way forward could be this. >> >>>> >> >>>> - Treat queue and group at same level (like CFS) >> >>>> >> >>>> - Get rid of cfq_slice_offset() logic. That means without idling on, there >> >>>> will be no ioprio difference between cfq queues. I think anyway as of >> >>>> today that logic helps in so little situations that I would not mind >> >>>> getting rid of it. Just that Jens should agree to it. >> >>>> >> >>>> - With this new scheme, it will break the existing semantics of root group >> >>>> being at same level as child groups. To avoid that, we can probably >> >>>> implement two modes (flat and hierarchical), something similar to what >> >>>> memory cgroup controller has done. May be one tunable in root cgroup of >> >>>> blkio "use_hierarchy". By default everything will be in flat mode and >> >>>> if user wants hiearchical control, he needs to set user_hierarchy in >> >>>> root group. >> >>> Vivek, may be I am reading you wrong here. But you are first >> >>> suggesting to add more complexity to treat queues and group at the >> >>> same level. Then you are suggesting add even more complexity to fix >> >>> the problems caused by that approach. >> >>> >> >>> Why do we need to treat queues and group at the same level? "CFS does >> >>> it" is not a good argument. >> >> >> >> Sure it is not a very good argument but at the same time one would need >> >> a very good argument that why we should do things differently. >> >> >> >> - If a user has mounted cpu and blkio controller together and both the >> >> controllers are viewing the same hierarchy differently, then it is >> >> odd. We need a good reason that why different arrangement makes sense. 
>> > >> > Hi Vivek, >> > >> > Even if we mount cpu and blkio together, to me, it's ok for cpu and blkio >> > having their own logic, since they are totally different cgroup subsystems. >> > >> >> >> >> - To me, both group and cfq queue are children of root group and it >> >> makes sense to treat them independent childrens instead of putting >> >> all the queues in one logical group which inherits the weight of >> >> parent. >> >> >> >> - With this new scheme, I am finding it hard to visualize the hierachy. >> >> How do you assign the weights to queue entities of a group. It is more >> >> like a invisible group with-in group. We shall have to create new >> >> tunable which can speicy the weight for this hidden group. >> > >> > For the time being, the root "qse" weight is 1000 and others is 500, they don't >> > inherit the weight of parent. I was thinking that maybe we can determine the qse >> > weight in term of the queue number and weight in this group and subgroups. >> > >> > Thanks, >> > Gui >> > >> >> >> >> >> >> So in summary I am liking the "queue at same level as group" scheme for >> >> two reasons. >> >> >> >> - It is more intutive to visualize and implement. It follows the true >> >> hierarchy as seen by cgroup file system. >> >> >> >> - CFS has already implemented this scheme. So we need a strong arguemnt >> >> to justify why we should not follow the same thing. Especially for >> >> the case where user has co-mounted cpu and blkio controller. >> >> >> >> - It can achieve the same goal as "hidden group" proposal just by >> >> creating a cgroup explicitly and moving all threads in that group. >> >> >> >> Why do you think that "hidden group" proposal is better than "treating >> >> queue at same level as group" ? >> >> There are multiple reasons for "hidden group" proposal being a better approach. >> >> - "Hidden group" would allow us to keep scheduling queues using the >> CFQ queue scheduling logic. And does not require any major changes in >> CFQ. 
Aren't we already using that approach to deal with queues at the >> root group? > > Currently we are operating in flat mode where all the groups are at > same level (irrespective their position in cgroup hiearchy). > >> >> - If queues and groups are treated at the same level, queues can end >> up in root cgroup. And we cannot put an upper bound on the number of >> those queues. Those queues can consume system resources in proportion >> to their number, causing the performance of groups to suffer. If we >> have "hidden group", we can configure it to a small weight, and that >> would limit the impact these queues in root group can have. > > To limit the impact of other queues in cgroup, one can use libcgroup to > automatically place new threads or tasks into a subgroup. > > I understand that kernel doing it by default should help though. It is > less work in terms of configuration. But I am not sure that's a good > argument to design kernel functionality. Kernel functionality should be > pretty generic. > > Anyway, how would you assign the weight to the hidden group. What's the > interface for that? A new cgroup file inside each cgroup? Personally > I think that's little odd interface. Every group has one hidden group > where all the queues in that group go and weight of that group can be > specified by a cgroup file. I think picking a reasonable default weight at compile time is not that bad an option, given that threads showing up in the "hidden group" is an uncommon case. > > But anyway, I am not tied to any of the approach. I am just trying to > make sure that we have put enough thought into it as changing it later > will be hard. > > Vivek > ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-09-01 17:15 ` Nauman Rafique @ 2010-09-01 17:21 ` Vivek Goyal 2010-09-02 0:30 ` Gui Jianfeng 1 sibling, 0 replies; 19+ messages in thread From: Vivek Goyal @ 2010-09-01 17:21 UTC (permalink / raw) To: Nauman Rafique Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki On Wed, Sep 01, 2010 at 10:15:31AM -0700, Nauman Rafique wrote: > On Wed, Sep 1, 2010 at 10:10 AM, Vivek Goyal <vgoyal@redhat.com> wrote: > > On Wed, Sep 01, 2010 at 08:49:26AM -0700, Nauman Rafique wrote: > >> On Wed, Sep 1, 2010 at 1:50 AM, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote: > >> > Vivek Goyal wrote: > >> >> On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote: > >> >>> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote: > >> >>>> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote: > >> >>>>> Vivek Goyal wrote: > >> >>>>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: > >> >>>>>>> Hi All, > >> >>>>>>> > >> >>>>>>> This patch enables cfq group hierarchical scheduling. > >> >>>>>>> > >> >>>>>>> With this patch, you can create a cgroup directory deeper than level 1. > >> >>>>>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example: > >> >>>>>>> We create cgroup directories as following(the number represents weight): > >> >>>>>>> > >> >>>>>>> Root grp > >> >>>>>>> / \ > >> >>>>>>> grp_1(100) grp_2(400) > >> >>>>>>> / \ > >> >>>>>>> grp_3(200) grp_4(300) > >> >>>>>>> > >> >>>>>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, > >> >>>>>>> grp_2 will share 80% of total bandwidth. > >> >>>>>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) > >> >>>>>>> > >> >>>>>>> Design: > >> >>>>>>> o Each cfq group has its own group service tree. 
> >> >>>>>>> o Each cfq group contains a "group schedule entity" (gse) that > >> >>>>>>> schedules on parent cfq group's service tree. > >> >>>>>>> o Each cfq group contains a "queue schedule entity"(qse), it > >> >>>>>>> represents all cfqqs located on this cfq group. It schedules > >> >>>>>>> on this group's service tree. For the time being, root group > >> >>>>>>> qse's weight is 1000, and subgroup qse's weight is 500. > >> >>>>>>> o All gses and qse which belones to a same cfq group schedules > >> >>>>>>> on the same group service tree. > >> >>>>>> Hi Gui, > >> >>>>>> > >> >>>>>> Thanks for the patch. I have few questions. > >> >>>>>> > >> >>>>>> - So how does the hierarchy look like, w.r.t root group. Something as > >> >>>>>> follows? > >> >>>>>> > >> >>>>>> > >> >>>>>> root > >> >>>>>> / | \ > >> >>>>>> q1 q2 G1 > >> >>>>>> > >> >>>>>> Assume there are two processes doin IO in root group and q1 and q2 are > >> >>>>>> cfqq queues for those processes and G1 is the cgroup created by user. > >> >>>>>> > >> >>>>>> If yes, then what algorithm do you use to do scheduling between q1, q2 > >> >>>>>> and G1? IOW, currently we have two algorithms operating in CFQ. One for > >> >>>>>> cfqq and other for groups. Group algorithm does not use the logic of > >> >>>>>> cfq_slice_offset(). > >> >>>>> Hi Vivek, > >> >>>>> > >> >>>>> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq. > >> >>>>> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and > >> >>>>> it will schedule on root group service with G1, as following: > >> >>>>> > >> >>>>> root group > >> >>>>> / \ > >> >>>>> qse(q1,q2) gse(G1) > >> >>>>> > >> >>>> Ok. That's interesting. That raises another question that how hierarchy > >> >>>> should look like. IOW, how queue and groups should be treated in > >> >>>> hierarchy. > >> >>>> > >> >>>> CFS cpu scheduler treats queues and group at the same level. That is as > >> >>>> follows. 
> >> >>>> > >> >>>> root > >> >>>> / | \ > >> >>>> q1 q2 G1 > >> >>>> > >> >>>> In the past I had raised this question and Jens and corrado liked treating > >> >>>> queues and group at same level. > >> >>>> > >> >>>> Logically, q1, q2 and G1 are all children of root, so it makes sense to > >> >>>> treat them at same level and not group q1 and q2 in to a single entity and > >> >>>> group. > >> >>>> > >> >>>> One of the possible way forward could be this. > >> >>>> > >> >>>> - Treat queue and group at same level (like CFS) > >> >>>> > >> >>>> - Get rid of cfq_slice_offset() logic. That means without idling on, there > >> >>>> will be no ioprio difference between cfq queues. I think anyway as of > >> >>>> today that logic helps in so little situations that I would not mind > >> >>>> getting rid of it. Just that Jens should agree to it. > >> >>>> > >> >>>> - With this new scheme, it will break the existing semantics of root group > >> >>>> being at same level as child groups. To avoid that, we can probably > >> >>>> implement two modes (flat and hierarchical), something similar to what > >> >>>> memory cgroup controller has done. May be one tunable in root cgroup of > >> >>>> blkio "use_hierarchy". By default everything will be in flat mode and > >> >>>> if user wants hiearchical control, he needs to set user_hierarchy in > >> >>>> root group. > >> >>> Vivek, may be I am reading you wrong here. But you are first > >> >>> suggesting to add more complexity to treat queues and group at the > >> >>> same level. Then you are suggesting add even more complexity to fix > >> >>> the problems caused by that approach. > >> >>> > >> >>> Why do we need to treat queues and group at the same level? "CFS does > >> >>> it" is not a good argument. > >> >> > >> >> Sure it is not a very good argument but at the same time one would need > >> >> a very good argument that why we should do things differently. 
> >> >> > >> >> - If a user has mounted cpu and blkio controller together and both the > >> >> controllers are viewing the same hierarchy differently, then it is > >> >> odd. We need a good reason that why different arrangement makes sense. > >> > > >> > Hi Vivek, > >> > > >> > Even if we mount cpu and blkio together, to me, it's ok for cpu and blkio > >> > having their own logic, since they are totally different cgroup subsystems. > >> > > >> >> > >> >> - To me, both group and cfq queue are children of root group and it > >> >> makes sense to treat them independent childrens instead of putting > >> >> all the queues in one logical group which inherits the weight of > >> >> parent. > >> >> > >> >> - With this new scheme, I am finding it hard to visualize the hierachy. > >> >> How do you assign the weights to queue entities of a group. It is more > >> >> like a invisible group with-in group. We shall have to create new > >> >> tunable which can speicy the weight for this hidden group. > >> > > >> > For the time being, the root "qse" weight is 1000 and others is 500, they don't > >> > inherit the weight of parent. I was thinking that maybe we can determine the qse > >> > weight in term of the queue number and weight in this group and subgroups. > >> > > >> > Thanks, > >> > Gui > >> > > >> >> > >> >> > >> >> So in summary I am liking the "queue at same level as group" scheme for > >> >> two reasons. > >> >> > >> >> - It is more intutive to visualize and implement. It follows the true > >> >> hierarchy as seen by cgroup file system. > >> >> > >> >> - CFS has already implemented this scheme. So we need a strong arguemnt > >> >> to justify why we should not follow the same thing. Especially for > >> >> the case where user has co-mounted cpu and blkio controller. > >> >> > >> >> - It can achieve the same goal as "hidden group" proposal just by > >> >> creating a cgroup explicitly and moving all threads in that group. 
> >> >> > >> >> Why do you think that "hidden group" proposal is better than "treating > >> >> queue at same level as group" ? > >> > >> There are multiple reasons for "hidden group" proposal being a better approach. > >> > >> - "Hidden group" would allow us to keep scheduling queues using the > >> CFQ queue scheduling logic. And does not require any major changes in > >> CFQ. Aren't we already using that approach to deal with queues at the > >> root group? > > > > Currently we are operating in flat mode where all the groups are at > > same level (irrespective their position in cgroup hiearchy). > > > >> > >> - If queues and groups are treated at the same level, queues can end > >> up in root cgroup. And we cannot put an upper bound on the number of > >> those queues. Those queues can consume system resources in proportion > >> to their number, causing the performance of groups to suffer. If we > >> have "hidden group", we can configure it to a small weight, and that > >> would limit the impact these queues in root group can have. > > > > To limit the impact of other queues in cgroup, one can use libcgroup to > > automatically place new threads or tasks into a subgroup. > > > > I understand that kernel doing it by default should help though. It is > > less work in terms of configuration. But I am not sure that's a good > > argument to design kernel functionality. Kernel functionality should be > > pretty generic. > > > > Anyway, how would you assign the weight to the hidden group. What's the > > interface for that? A new cgroup file inside each cgroup? Personally > > I think that's little odd interface. Every group has one hidden group > > where all the queues in that group go and weight of that group can be > > specified by a cgroup file. > > I think picking a reasonable default weight at compile time is not > that bad an option, given that threads showing up in the "hidden > group" is an uncommon case. I don't think that's a good idea. 
A user should be able to determine the share of the queues in a group without looking at the kernel config file. Vivek ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-09-01 17:15 ` Nauman Rafique 2010-09-01 17:21 ` Vivek Goyal @ 2010-09-02 0:30 ` Gui Jianfeng 1 sibling, 0 replies; 19+ messages in thread From: Gui Jianfeng @ 2010-09-02 0:30 UTC (permalink / raw) To: Nauman Rafique Cc: Vivek Goyal, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki Nauman Rafique wrote: > On Wed, Sep 1, 2010 at 10:10 AM, Vivek Goyal <vgoyal@redhat.com> wrote: >> On Wed, Sep 01, 2010 at 08:49:26AM -0700, Nauman Rafique wrote: >>> On Wed, Sep 1, 2010 at 1:50 AM, Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote: >>>> Vivek Goyal wrote: >>>>> On Tue, Aug 31, 2010 at 08:40:19AM -0700, Nauman Rafique wrote: >>>>>> On Tue, Aug 31, 2010 at 5:57 AM, Vivek Goyal <vgoyal@redhat.com> wrote: >>>>>>> On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote: >>>>>>>> Vivek Goyal wrote: >>>>>>>>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: >>>>>>>>>> Hi All, >>>>>>>>>> >>>>>>>>>> This patch enables cfq group hierarchical scheduling. >>>>>>>>>> >>>>>>>>>> With this patch, you can create a cgroup directory deeper than level 1. >>>>>>>>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example: >>>>>>>>>> We create cgroup directories as following(the number represents weight): >>>>>>>>>> >>>>>>>>>> Root grp >>>>>>>>>> / \ >>>>>>>>>> grp_1(100) grp_2(400) >>>>>>>>>> / \ >>>>>>>>>> grp_3(200) grp_4(300) >>>>>>>>>> >>>>>>>>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, >>>>>>>>>> grp_2 will share 80% of total bandwidth. >>>>>>>>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) >>>>>>>>>> >>>>>>>>>> Design: >>>>>>>>>> o Each cfq group has its own group service tree. >>>>>>>>>> o Each cfq group contains a "group schedule entity" (gse) that >>>>>>>>>> schedules on parent cfq group's service tree. 
>>>>>>>>>> o Each cfq group contains a "queue schedule entity"(qse), it >>>>>>>>>> represents all cfqqs located on this cfq group. It schedules >>>>>>>>>> on this group's service tree. For the time being, root group >>>>>>>>>> qse's weight is 1000, and subgroup qse's weight is 500. >>>>>>>>>> o All gses and qse which belones to a same cfq group schedules >>>>>>>>>> on the same group service tree. >>>>>>>>> Hi Gui, >>>>>>>>> >>>>>>>>> Thanks for the patch. I have few questions. >>>>>>>>> >>>>>>>>> - So how does the hierarchy look like, w.r.t root group. Something as >>>>>>>>> follows? >>>>>>>>> >>>>>>>>> >>>>>>>>> root >>>>>>>>> / | \ >>>>>>>>> q1 q2 G1 >>>>>>>>> >>>>>>>>> Assume there are two processes doin IO in root group and q1 and q2 are >>>>>>>>> cfqq queues for those processes and G1 is the cgroup created by user. >>>>>>>>> >>>>>>>>> If yes, then what algorithm do you use to do scheduling between q1, q2 >>>>>>>>> and G1? IOW, currently we have two algorithms operating in CFQ. One for >>>>>>>>> cfqq and other for groups. Group algorithm does not use the logic of >>>>>>>>> cfq_slice_offset(). >>>>>>>> Hi Vivek, >>>>>>>> >>>>>>>> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq. >>>>>>>> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and >>>>>>>> it will schedule on root group service with G1, as following: >>>>>>>> >>>>>>>> root group >>>>>>>> / \ >>>>>>>> qse(q1,q2) gse(G1) >>>>>>>> >>>>>>> Ok. That's interesting. That raises another question that how hierarchy >>>>>>> should look like. IOW, how queue and groups should be treated in >>>>>>> hierarchy. >>>>>>> >>>>>>> CFS cpu scheduler treats queues and group at the same level. That is as >>>>>>> follows. >>>>>>> >>>>>>> root >>>>>>> / | \ >>>>>>> q1 q2 G1 >>>>>>> >>>>>>> In the past I had raised this question and Jens and corrado liked treating >>>>>>> queues and group at same level. 
>>>>>>> >>>>>>> Logically, q1, q2 and G1 are all children of root, so it makes sense to >>>>>>> treat them at same level and not group q1 and q2 in to a single entity and >>>>>>> group. >>>>>>> >>>>>>> One of the possible way forward could be this. >>>>>>> >>>>>>> - Treat queue and group at same level (like CFS) >>>>>>> >>>>>>> - Get rid of cfq_slice_offset() logic. That means without idling on, there >>>>>>> will be no ioprio difference between cfq queues. I think anyway as of >>>>>>> today that logic helps in so little situations that I would not mind >>>>>>> getting rid of it. Just that Jens should agree to it. >>>>>>> >>>>>>> - With this new scheme, it will break the existing semantics of root group >>>>>>> being at same level as child groups. To avoid that, we can probably >>>>>>> implement two modes (flat and hierarchical), something similar to what >>>>>>> memory cgroup controller has done. May be one tunable in root cgroup of >>>>>>> blkio "use_hierarchy". By default everything will be in flat mode and >>>>>>> if user wants hiearchical control, he needs to set user_hierarchy in >>>>>>> root group. >>>>>> Vivek, may be I am reading you wrong here. But you are first >>>>>> suggesting to add more complexity to treat queues and group at the >>>>>> same level. Then you are suggesting add even more complexity to fix >>>>>> the problems caused by that approach. >>>>>> >>>>>> Why do we need to treat queues and group at the same level? "CFS does >>>>>> it" is not a good argument. >>>>> Sure it is not a very good argument but at the same time one would need >>>>> a very good argument that why we should do things differently. >>>>> >>>>> - If a user has mounted cpu and blkio controller together and both the >>>>> controllers are viewing the same hierarchy differently, then it is >>>>> odd. We need a good reason that why different arrangement makes sense. 
>>>> Hi Vivek, >>>> >>>> Even if we mount cpu and blkio together, to me, it's ok for cpu and blkio >>>> having their own logic, since they are totally different cgroup subsystems. >>>> >>>>> - To me, both group and cfq queue are children of root group and it >>>>> makes sense to treat them independent childrens instead of putting >>>>> all the queues in one logical group which inherits the weight of >>>>> parent. >>>>> >>>>> - With this new scheme, I am finding it hard to visualize the hierachy. >>>>> How do you assign the weights to queue entities of a group. It is more >>>>> like a invisible group with-in group. We shall have to create new >>>>> tunable which can speicy the weight for this hidden group. >>>> For the time being, the root "qse" weight is 1000 and others is 500, they don't >>>> inherit the weight of parent. I was thinking that maybe we can determine the qse >>>> weight in term of the queue number and weight in this group and subgroups. >>>> >>>> Thanks, >>>> Gui >>>> >>>>> >>>>> So in summary I am liking the "queue at same level as group" scheme for >>>>> two reasons. >>>>> >>>>> - It is more intutive to visualize and implement. It follows the true >>>>> hierarchy as seen by cgroup file system. >>>>> >>>>> - CFS has already implemented this scheme. So we need a strong arguemnt >>>>> to justify why we should not follow the same thing. Especially for >>>>> the case where user has co-mounted cpu and blkio controller. >>>>> >>>>> - It can achieve the same goal as "hidden group" proposal just by >>>>> creating a cgroup explicitly and moving all threads in that group. >>>>> >>>>> Why do you think that "hidden group" proposal is better than "treating >>>>> queue at same level as group" ? >>> There are multiple reasons for "hidden group" proposal being a better approach. >>> >>> - "Hidden group" would allow us to keep scheduling queues using the >>> CFQ queue scheduling logic. And does not require any major changes in >>> CFQ. 
Aren't we already using that approach to deal with queues at the >>> root group? >> Currently we are operating in flat mode where all the groups are at >> same level (irrespective their position in cgroup hiearchy). >> >>> - If queues and groups are treated at the same level, queues can end >>> up in root cgroup. And we cannot put an upper bound on the number of >>> those queues. Those queues can consume system resources in proportion >>> to their number, causing the performance of groups to suffer. If we >>> have "hidden group", we can configure it to a small weight, and that >>> would limit the impact these queues in root group can have. >> To limit the impact of other queues in cgroup, one can use libcgroup to >> automatically place new threads or tasks into a subgroup. >> >> I understand that kernel doing it by default should help though. It is >> less work in terms of configuration. But I am not sure that's a good >> argument to design kernel functionality. Kernel functionality should be >> pretty generic. >> >> Anyway, how would you assign the weight to the hidden group. What's the >> interface for that? A new cgroup file inside each cgroup? Personally >> I think that's little odd interface. Every group has one hidden group >> where all the queues in that group go and weight of that group can be >> specified by a cgroup file. > > I think picking a reasonable default weight at compile time is not > that bad an option, given that threads showing up in the "hidden > group" is an uncommon case. Hi Nauman, Later, I think we might adjust the weight of "hidden group" automatically according to the queue number and subgroup number and their weight. But for the time being, i'd choose a fixed value for the sake of simplicity. Gui > >> But anyway, I am not tied to any of the approach. I am just trying to >> make sure that we have put enough thought into it as changing it later >> will be hard. >> >> Vivek >> > > ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-09-01 8:50 ` Gui Jianfeng 2010-09-01 15:49 ` Nauman Rafique @ 2010-09-02 2:20 ` Vivek Goyal 1 sibling, 0 replies; 19+ messages in thread From: Vivek Goyal @ 2010-09-02 2:20 UTC (permalink / raw) To: Gui Jianfeng Cc: Nauman Rafique, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, linux kernel mailing list, KAMEZAWA Hiroyuki On Wed, Sep 01, 2010 at 04:50:22PM +0800, Gui Jianfeng wrote: [..] > Hi Vivek, > > Even if we mount cpu and blkio together, to me, it's ok for cpu and blkio > having their own logic, since they are totally different cgroup subsystems. > > > > > - To me, both group and cfq queue are children of root group and it > > makes sense to treat them independent childrens instead of putting > > all the queues in one logical group which inherits the weight of > > parent. > > > > - With this new scheme, I am finding it hard to visualize the hierachy. > > How do you assign the weights to queue entities of a group. It is more > > like a invisible group with-in group. We shall have to create new > > tunable which can speicy the weight for this hidden group. > > For the time being, the root "qse" weight is 1000 and others is 500, they don't > inherit the weight of parent. I was thinking that maybe we can determine the qse > weight in term of the queue number and weight in this group and subgroups. If you decide queue weight in terms of queue number, then you are back to the same problem Nauman was mentioning that share of a group is not fixed and depends on number of queues in the group. Vivek ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-08-31 12:57 ` Vivek Goyal 2010-08-31 15:40 ` Nauman Rafique @ 2010-09-01 8:48 ` Gui Jianfeng 2010-09-01 9:02 ` KAMEZAWA Hiroyuki 1 sibling, 1 reply; 19+ messages in thread From: Gui Jianfeng @ 2010-09-01 8:48 UTC (permalink / raw) To: Vivek Goyal, KAMEZAWA Hiroyuki Cc: Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, Nauman Rafique, linux kernel mailing list Add Kamezawa Vivek Goyal wrote: > On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote: >> Vivek Goyal wrote: >>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: >>>> Hi All, >>>> >>>> This patch enables cfq group hierarchical scheduling. >>>> >>>> With this patch, you can create a cgroup directory deeper than level 1. >>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example: >>>> We create cgroup directories as following(the number represents weight): >>>> >>>> Root grp >>>> / \ >>>> grp_1(100) grp_2(400) >>>> / \ >>>> grp_3(200) grp_4(300) >>>> >>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, >>>> grp_2 will share 80% of total bandwidth. >>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) >>>> >>>> Design: >>>> o Each cfq group has its own group service tree. >>>> o Each cfq group contains a "group schedule entity" (gse) that >>>> schedules on parent cfq group's service tree. >>>> o Each cfq group contains a "queue schedule entity"(qse), it >>>> represents all cfqqs located on this cfq group. It schedules >>>> on this group's service tree. For the time being, root group >>>> qse's weight is 1000, and subgroup qse's weight is 500. >>>> o All gses and qse which belones to a same cfq group schedules >>>> on the same group service tree. >>> Hi Gui, >>> >>> Thanks for the patch. I have few questions. >>> >>> - So how does the hierarchy look like, w.r.t root group. Something as >>> follows? 
>>> >>> >>> root >>> / | \ >>> q1 q2 G1 >>> >>> Assume there are two processes doin IO in root group and q1 and q2 are >>> cfqq queues for those processes and G1 is the cgroup created by user. >>> >>> If yes, then what algorithm do you use to do scheduling between q1, q2 >>> and G1? IOW, currently we have two algorithms operating in CFQ. One for >>> cfqq and other for groups. Group algorithm does not use the logic of >>> cfq_slice_offset(). >> Hi Vivek, >> >> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq. >> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and >> it will schedule on root group service with G1, as following: >> >> root group >> / \ >> qse(q1,q2) gse(G1) >> > > Ok. That's interesting. That raises another question that how hierarchy > should look like. IOW, how queue and groups should be treated in > hierarchy. > > CFS cpu scheduler treats queues and group at the same level. That is as > follows. > > root > / | \ > q1 q2 G1 > > In the past I had raised this question and Jens and corrado liked treating > queues and group at same level. > > Logically, q1, q2 and G1 are all children of root, so it makes sense to > treat them at same level and not group q1 and q2 in to a single entity and > group. Hi Vivek, IMO, this new approach keeps the original scheduling logic, and keep things simple. And to me, this approach works for me. So i choose it. > > One of the possible way forward could be this. > > - Treat queue and group at same level (like CFS) > > - Get rid of cfq_slice_offset() logic. That means without idling on, there > will be no ioprio difference between cfq queues. I think anyway as of > today that logic helps in so little situations that I would not mind > getting rid of it. Just that Jens should agree to it. > > - With this new scheme, it will break the existing semantics of root group > being at same level as child groups. 
To avoid that, we can probably > implement two modes (flat and hierarchical), something similar to what > memory cgroup controller has done. May be one tunable in root cgroup of > blkio "use_hierarchy". By default everything will be in flat mode and > if user wants hiearchical control, he needs to set user_hierarchy in > root group. > > I think memory controller provides "use_hierarchy" tunable in each > cgroup. I am not sure why do we need it in each cgroup and not just > in root cgroup. I think Kamezawa-san should be able to answer this question. :) Thanks Gui > > Thanks > Vivek > >> Thanks, >> Gui >> >>> The reason being they are different because CFQ wanted to provide ioprio >>> service differentiation on SSD where slice idle is effectively zero. But >>> that slice_offset() logic is approximate at best and is not suitable >>> for group scheduling. >>> >>> I have yet to go through the patches in detail, so thought will ask you >>> that how have you handeled above. >>> >>> Vivek >>> >>>> o cfq group allocates in a recursive manner, that means when a cfq >>>> group needs to be allocated, the upper level cfq groups are also >>>> allocated. >>>> o When a cfq group served, not only charge this cfq group but also >>>> charge its ancestors. 
>>>> block/cfq-iosched.c | 477 +++++++++++++++++++++++++++++++++++++------------- >>>> 2 files changed, 353 insertions(+), 128 deletions(-) >>>> >>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c >>>> index 2fef1ef..aa06cb3 100644 >>>> --- a/block/blk-cgroup.c >>>> +++ b/block/blk-cgroup.c >>>> @@ -964,10 +964,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) >>>> goto done; >>>> } >>>> >>>> - /* Currently we do not support hierarchy deeper than two level (0,1) */ >>>> - if (parent != cgroup->top_cgroup) >>>> - return ERR_PTR(-EPERM); >>>> - >>>> blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); >>>> if (!blkcg) >>>> return ERR_PTR(-ENOMEM); >>>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c >>>> index f65c6f0..4660d89 100644 >>>> --- a/block/cfq-iosched.c >>>> +++ b/block/cfq-iosched.c >>>> @@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida); >>>> #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) >>>> >>>> #define sample_valid(samples) ((samples) > 80) >>>> -#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) >>>> +#define rb_entry_se(node) \ >>>> + rb_entry((node), struct io_sched_entity, rb_node) >>>> >>>> /* >>>> * Most of our rbtree usage is for sorting with min extraction, so >>>> @@ -171,19 +172,40 @@ enum wl_type_t { >>>> SYNC_WORKLOAD = 2 >>>> }; >>>> >>>> -/* This is per cgroup per device grouping structure */ >>>> -struct cfq_group { >>>> +/* >>>> + * This's the schedule entity which is scheduled on group service tree. >>>> + * It represents shedule structure of a cfq group, or a bundle of cfq >>>> + * queues that locate in a cfq group. 
>>>> + */ >>>> +struct io_sched_entity { >>>> + struct cfq_rb_root *st; >>>> + >>>> /* group service_tree member */ >>>> struct rb_node rb_node; >>>> - >>>> - /* group service_tree key */ >>>> u64 vdisktime; >>>> unsigned int weight; >>>> bool on_st; >>>> + bool is_group_se; >>>> + struct io_sched_entity *parent; >>>> +}; >>>> + >>>> +/* This is per cgroup per device grouping structure */ >>>> +struct cfq_group { >>>> + /* cfq group sched entity */ >>>> + struct io_sched_entity group_se; >>>> + >>>> + /* sched entity for cfqqs */ >>>> + struct io_sched_entity queue_se; >>>> + >>>> + /* Service tree for cfq_groups and cfqqs set*/ >>>> + struct cfq_rb_root grp_service_tree; >>>> >>>> /* number of cfqq currently on this group */ >>>> int nr_cfqq; >>>> >>>> + /* number of sub cfq groups */ >>>> + int nr_subgp; >>>> + >>>> /* Per group busy queus average. Useful for workload slice calc. */ >>>> unsigned int busy_queues_avg[2]; >>>> /* >>>> @@ -210,8 +232,6 @@ struct cfq_group { >>>> */ >>>> struct cfq_data { >>>> struct request_queue *queue; >>>> - /* Root service tree for cfq_groups */ >>>> - struct cfq_rb_root grp_service_tree; >>>> struct cfq_group root_group; >>>> >>>> /* >>>> @@ -399,6 +419,22 @@ static inline bool iops_mode(struct cfq_data *cfqd) >>>> return false; >>>> } >>>> >>>> +static inline struct cfq_group *cfqg_of_gse(struct io_sched_entity *se) >>>> +{ >>>> + if (se->is_group_se) >>>> + return container_of(se, struct cfq_group, group_se); >>>> + else >>>> + return NULL; >>>> +} >>>> + >>>> +static inline struct cfq_group *cfqg_of_qse(struct io_sched_entity *se) >>>> +{ >>>> + if (!se->is_group_se) >>>> + return container_of(se, struct cfq_group, queue_se); >>>> + else >>>> + return NULL; >>>> +} >>>> + >>>> static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) >>>> { >>>> if (cfq_class_idle(cfqq)) >>>> @@ -522,12 +558,13 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) >>>> return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), 
cfqq->ioprio); >>>> } >>>> >>>> -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) >>>> +static inline u64 cfq_scale_slice(unsigned long delta, >>>> + struct io_sched_entity *se) >>>> { >>>> u64 d = delta << CFQ_SERVICE_SHIFT; >>>> >>>> d = d * BLKIO_WEIGHT_DEFAULT; >>>> - do_div(d, cfqg->weight); >>>> + do_div(d, se->weight); >>>> return d; >>>> } >>>> >>>> @@ -552,16 +589,16 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) >>>> static void update_min_vdisktime(struct cfq_rb_root *st) >>>> { >>>> u64 vdisktime = st->min_vdisktime; >>>> - struct cfq_group *cfqg; >>>> + struct io_sched_entity *se; >>>> >>>> if (st->active) { >>>> - cfqg = rb_entry_cfqg(st->active); >>>> - vdisktime = cfqg->vdisktime; >>>> + se = rb_entry_se(st->active); >>>> + vdisktime = se->vdisktime; >>>> } >>>> >>>> if (st->left) { >>>> - cfqg = rb_entry_cfqg(st->left); >>>> - vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); >>>> + se = rb_entry_se(st->left); >>>> + vdisktime = min_vdisktime(vdisktime, se->vdisktime); >>>> } >>>> >>>> st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); >>>> @@ -591,9 +628,10 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, >>>> static inline unsigned >>>> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) >>>> { >>>> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >>>> + struct io_sched_entity *gse = &cfqg->queue_se; >>>> + struct cfq_rb_root *st = gse->st; >>>> >>>> - return cfq_target_latency * cfqg->weight / st->total_weight; >>>> + return cfq_target_latency * gse->weight / st->total_weight; >>>> } >>>> >>>> static inline void >>>> @@ -756,13 +794,13 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) >>>> return NULL; >>>> } >>>> >>>> -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) >>>> +static struct io_sched_entity *cfq_rb_first_se(struct cfq_rb_root *root) >>>> { >>>> if (!root->left) >>>> root->left = 
rb_first(&root->rb); >>>> >>>> if (root->left) >>>> - return rb_entry_cfqg(root->left); >>>> + return rb_entry_se(root->left); >>>> >>>> return NULL; >>>> } >>>> @@ -819,25 +857,25 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, >>>> } >>>> >>>> static inline s64 >>>> -cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) >>>> +se_key(struct cfq_rb_root *st, struct io_sched_entity *se) >>>> { >>>> - return cfqg->vdisktime - st->min_vdisktime; >>>> + return se->vdisktime - st->min_vdisktime; >>>> } >>>> >>>> static void >>>> -__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) >>>> +__io_sched_entity_add(struct cfq_rb_root *st, struct io_sched_entity *se) >>>> { >>>> struct rb_node **node = &st->rb.rb_node; >>>> struct rb_node *parent = NULL; >>>> - struct cfq_group *__cfqg; >>>> - s64 key = cfqg_key(st, cfqg); >>>> + struct io_sched_entity *__se; >>>> + s64 key = se_key(st, se); >>>> int left = 1; >>>> >>>> while (*node != NULL) { >>>> parent = *node; >>>> - __cfqg = rb_entry_cfqg(parent); >>>> + __se = rb_entry_se(parent); >>>> >>>> - if (key < cfqg_key(st, __cfqg)) >>>> + if (key < se_key(st, __se)) >>>> node = &parent->rb_left; >>>> else { >>>> node = &parent->rb_right; >>>> @@ -846,47 +884,80 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) >>>> } >>>> >>>> if (left) >>>> - st->left = &cfqg->rb_node; >>>> + st->left = &se->rb_node; >>>> >>>> - rb_link_node(&cfqg->rb_node, parent, node); >>>> - rb_insert_color(&cfqg->rb_node, &st->rb); >>>> + rb_link_node(&se->rb_node, parent, node); >>>> + rb_insert_color(&se->rb_node, &st->rb); >>>> } >>>> >>>> -static void >>>> -cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) >>>> +static void io_sched_entity_add(struct cfq_rb_root *st, >>>> + struct io_sched_entity *se) >>>> { >>>> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >>>> - struct cfq_group *__cfqg; >>>> struct rb_node *n; >>>> + struct io_sched_entity *__se; 
>>>> >>>> - cfqg->nr_cfqq++; >>>> - if (cfqg->on_st) >>>> - return; >>>> >>>> + if (se->on_st) >>>> + return; >>>> /* >>>> * Currently put the group at the end. Later implement something >>>> * so that groups get lesser vtime based on their weights, so that >>>> * if group does not loose all if it was not continously backlogged. >>>> */ >>>> - n = rb_last(&st->rb); >>>> + n = rb_last(&se->st->rb); >>>> if (n) { >>>> - __cfqg = rb_entry_cfqg(n); >>>> - cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; >>>> + __se = rb_entry_se(n); >>>> + se->vdisktime = __se->vdisktime + CFQ_IDLE_DELAY; >>>> } else >>>> - cfqg->vdisktime = st->min_vdisktime; >>>> + se->vdisktime = st->min_vdisktime; >>>> >>>> - __cfq_group_service_tree_add(st, cfqg); >>>> - cfqg->on_st = true; >>>> - st->total_weight += cfqg->weight; >>>> + __io_sched_entity_add(se->st, se); >>>> + se->on_st = true; >>>> + st->total_weight += se->weight; >>>> +} >>>> + >>>> +static void >>>> +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) >>>> +{ >>>> + struct io_sched_entity *gse = &cfqg->group_se; >>>> + struct io_sched_entity *qse = &cfqg->queue_se; >>>> + struct cfq_group *__cfqg; >>>> + >>>> + cfqg->nr_cfqq++; >>>> + >>>> + io_sched_entity_add(qse->st, qse); >>>> + >>>> + while (gse && gse->parent) { >>>> + if (gse->on_st) >>>> + return; >>>> + io_sched_entity_add(gse->st, gse); >>>> + gse = gse->parent; >>>> + __cfqg = cfqg_of_gse(gse); >>>> + __cfqg->nr_subgp++; >>>> + } >>>> +} >>>> + >>>> +static void io_sched_entity_del(struct io_sched_entity *se) >>>> +{ >>>> + if (!RB_EMPTY_NODE(&se->rb_node)) >>>> + cfq_rb_erase(&se->rb_node, se->st); >>>> + >>>> + se->on_st = false; >>>> + se->st->total_weight -= se->weight; >>>> } >>>> >>>> static void >>>> cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) >>>> { >>>> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >>>> + struct io_sched_entity *gse = &cfqg->group_se; >>>> + struct io_sched_entity *qse = 
&cfqg->queue_se; >>>> + struct cfq_group *__cfqg, *p_cfqg; >>>> + >>>> + if (gse->st && gse->st->active == &gse->rb_node) >>>> + gse->st->active = NULL; >>>> >>>> - if (st->active == &cfqg->rb_node) >>>> - st->active = NULL; >>>> + if (qse->st && qse->st->active == &qse->rb_node) >>>> + qse->st->active = NULL; >>>> >>>> BUG_ON(cfqg->nr_cfqq < 1); >>>> cfqg->nr_cfqq--; >>>> @@ -895,13 +966,25 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) >>>> if (cfqg->nr_cfqq) >>>> return; >>>> >>>> - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); >>>> - cfqg->on_st = false; >>>> - st->total_weight -= cfqg->weight; >>>> - if (!RB_EMPTY_NODE(&cfqg->rb_node)) >>>> - cfq_rb_erase(&cfqg->rb_node, st); >>>> - cfqg->saved_workload_slice = 0; >>>> - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); >>>> + /* dequeue queue se from group */ >>>> + io_sched_entity_del(qse); >>>> + >>>> + if (cfqg->nr_subgp) >>>> + return; >>>> + >>>> + /* prevent from dequeuing root group */ >>>> + while (gse && gse->parent) { >>>> + __cfqg = cfqg_of_gse(gse); >>>> + p_cfqg = cfqg_of_gse(gse->parent); >>>> + io_sched_entity_del(gse); >>>> + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1); >>>> + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group"); >>>> + __cfqg->saved_workload_slice = 0; >>>> + gse = gse->parent; >>>> + p_cfqg->nr_subgp--; >>>> + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp) >>>> + return; >>>> + } >>>> } >>>> >>>> static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) >>>> @@ -933,7 +1016,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) >>>> static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, >>>> struct cfq_queue *cfqq) >>>> { >>>> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >>>> + struct io_sched_entity *gse = &cfqg->group_se; >>>> + struct io_sched_entity *qse = &cfqg->queue_se; >>>> unsigned int used_sl, charge; >>>> int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) >>>> 
- cfqg->service_tree_idle.count; >>>> @@ -946,10 +1030,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, >>>> else if (!cfq_cfqq_sync(cfqq) && !nr_sync) >>>> charge = cfqq->allocated_slice; >>>> >>>> - /* Can't update vdisktime while group is on service tree */ >>>> - cfq_rb_erase(&cfqg->rb_node, st); >>>> - cfqg->vdisktime += cfq_scale_slice(charge, cfqg); >>>> - __cfq_group_service_tree_add(st, cfqg); >>>> + /* >>>> + * update queue se's vdisktime. >>>> + * Can't update vdisktime while group is on service tree. >>>> + */ >>>> + >>>> + cfq_rb_erase(&qse->rb_node, qse->st); >>>> + qse->vdisktime += cfq_scale_slice(charge, qse); >>>> + __io_sched_entity_add(qse->st, qse); >>>> + if (&qse->rb_node == qse->st->active) >>>> + qse->st->active = NULL; >>>> + >>>> + while (gse && gse->parent) { >>>> + cfq_rb_erase(&gse->rb_node, gse->st); >>>> + gse->vdisktime += cfq_scale_slice(charge, gse); >>>> + __io_sched_entity_add(gse->st, gse); >>>> + if (&gse->rb_node == gse->st->active) >>>> + gse->st->active = NULL; >>>> + gse = gse->parent; >>>> + } >>>> >>>> /* This group is being expired. 
Save the context */ >>>> if (time_after(cfqd->workload_expires, jiffies)) { >>>> @@ -960,8 +1059,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, >>>> } else >>>> cfqg->saved_workload_slice = 0; >>>> >>>> - cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, >>>> - st->min_vdisktime); >>>> + cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", gse->vdisktime, >>>> + gse->st->min_vdisktime); >>>> + >>>> cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" >>>> " sect=%u", used_sl, cfqq->slice_dispatch, charge, >>>> iops_mode(cfqd), cfqq->nr_sectors); >>>> @@ -977,39 +1077,84 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) >>>> return NULL; >>>> } >>>> >>>> +static void cfq_put_cfqg(struct cfq_group *cfqg) >>>> +{ >>>> + struct cfq_rb_root *st; >>>> + int i, j; >>>> + struct io_sched_entity *gse; >>>> + struct cfq_group *p_cfqg; >>>> + >>>> + BUG_ON(atomic_read(&cfqg->ref) <= 0); >>>> + if (!atomic_dec_and_test(&cfqg->ref)) >>>> + return; >>>> + for_each_cfqg_st(cfqg, i, j, st) >>>> + BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); >>>> + >>>> + gse = &cfqg->group_se; >>>> + if (gse->parent) { >>>> + p_cfqg = cfqg_of_gse(gse->parent); >>>> + /* Drop the reference taken by children */ >>>> + atomic_dec(&p_cfqg->ref); >>>> + } >>>> + >>>> + kfree(cfqg); >>>> +} >>>> + >>>> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) >>>> +{ >>>> + /* Something wrong if we are trying to remove same group twice */ >>>> + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); >>>> + >>>> + hlist_del_init(&cfqg->cfqd_node); >>>> + >>>> + /* >>>> + * Put the reference taken at the time of creation so that when all >>>> + * queues are gone, group can be destroyed. 
>>>> + */ >>>> + cfq_put_cfqg(cfqg); >>>> +} >>>> + >>>> void >>>> cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) >>>> { >>>> - cfqg_of_blkg(blkg)->weight = weight; >>>> + struct cfq_group *cfqg; >>>> + struct io_sched_entity *gse; >>>> + >>>> + cfqg = cfqg_of_blkg(blkg); >>>> + gse = &cfqg->group_se; >>>> + gse->weight = weight; >>>> } >>>> >>>> -static struct cfq_group * >>>> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) >>>> +static void init_group_queue_entity(struct blkio_cgroup *blkcg, >>>> + struct cfq_group *cfqg) >>>> +{ >>>> + struct io_sched_entity *gse = &cfqg->group_se; >>>> + struct io_sched_entity *qse = &cfqg->queue_se; >>>> + >>>> + gse->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); >>>> + RB_CLEAR_NODE(&gse->rb_node); >>>> + gse->is_group_se = true; >>>> + gse->on_st = false; >>>> + >>>> + /* Set to 500 for the time being */ >>>> + qse->weight = 500; >>>> + RB_CLEAR_NODE(&qse->rb_node); >>>> + qse->is_group_se = false; >>>> + qse->on_st = false; >>>> +} >>>> + >>>> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg, >>>> + struct cfq_group *cfqg) >>>> { >>>> - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); >>>> - struct cfq_group *cfqg = NULL; >>>> - void *key = cfqd; >>>> int i, j; >>>> struct cfq_rb_root *st; >>>> - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; >>>> unsigned int major, minor; >>>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; >>>> >>>> - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); >>>> - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { >>>> - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); >>>> - cfqg->blkg.dev = MKDEV(major, minor); >>>> - goto done; >>>> - } >>>> - if (cfqg || !create) >>>> - goto done; >>>> - >>>> - cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); >>>> - if (!cfqg) >>>> - goto done; >>>> + cfqg->grp_service_tree = 
CFQ_RB_ROOT; >>>> >>>> for_each_cfqg_st(cfqg, i, j, st) >>>> *st = CFQ_RB_ROOT; >>>> - RB_CLEAR_NODE(&cfqg->rb_node); >>>> >>>> /* >>>> * Take the initial reference that will be released on destroy >>>> @@ -1023,11 +1168,102 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) >>>> sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); >>>> cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, >>>> MKDEV(major, minor)); >>>> - cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); >>>> - >>>> + init_group_queue_entity(blkcg, cfqg); >>>> /* Add group on cfqd list */ >>>> hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); >>>> +} >>>> + >>>> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) >>>> +{ >>>> + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) >>>> + cfq_destroy_cfqg(cfqd, cfqg); >>>> +} >>>> + >>>> +static void cfqg_set_parent(struct cfq_group *cfqg, struct cfq_group *p_cfqg) >>>> +{ >>>> + struct io_sched_entity *gse = &cfqg->group_se; >>>> + struct io_sched_entity *qse = &cfqg->queue_se; >>>> + struct io_sched_entity *p_gse = &p_cfqg->group_se; >>>> + >>>> + gse->parent = p_gse; >>>> + gse->st = &p_cfqg->grp_service_tree; >>>> + >>>> + qse->parent = gse; >>>> + qse->st = &cfqg->grp_service_tree; >>>> + >>>> + /* child reference */ >>>> + atomic_inc(&p_cfqg->ref); >>>> +} >>>> + >>>> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup) >>>> +{ >>>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); >>>> + struct blkio_cgroup *p_blkcg; >>>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; >>>> + unsigned int major, minor; >>>> + struct cfq_group *cfqg, *p_cfqg; >>>> + void *key = cfqd; >>>> + int ret; >>>> + >>>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); >>>> + if (cfqg) { >>>> + if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { >>>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); >>>> + cfqg->blkg.dev = MKDEV(major, minor); >>>> 
+ } >>>> + /* chain has already been built */ >>>> + return 0; >>>> + } >>>> + >>>> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); >>>> + if (!cfqg) >>>> + return -1; >>>> + >>>> + init_cfqg(cfqd, blkcg, cfqg); >>>> + >>>> + /* Already to the top group */ >>>> + if (!cgroup->parent) >>>> + return 0; >>>> + >>>> + ret = cfqg_chain_alloc(cfqd, cgroup->parent); >>>> + if (ret == -1) { >>>> + uninit_cfqg(cfqd, cfqg); >>>> + return -1; >>>> + } >>>> + >>>> + p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); >>>> + p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key)); >>>> + BUG_ON(p_cfqg == NULL); >>>> + >>>> + cfqg_set_parent(cfqg, p_cfqg); >>>> + return 0; >>>> +} >>>> + >>>> +static struct cfq_group * >>>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) >>>> +{ >>>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); >>>> + struct cfq_group *cfqg = NULL; >>>> + void *key = cfqd; >>>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; >>>> + unsigned int major, minor; >>>> + int ret; >>>> >>>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); >>>> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { >>>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); >>>> + cfqg->blkg.dev = MKDEV(major, minor); >>>> + goto done; >>>> + } >>>> + if (cfqg || !create) >>>> + goto done; >>>> + >>>> + ret = cfqg_chain_alloc(cfqd, cgroup); >>>> + if (!ret) { >>>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); >>>> + BUG_ON(cfqg == NULL); >>>> + goto done; >>>> + } >>>> done: >>>> return cfqg; >>>> } >>>> @@ -1067,33 +1303,6 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) >>>> atomic_inc(&cfqq->cfqg->ref); >>>> } >>>> >>>> -static void cfq_put_cfqg(struct cfq_group *cfqg) >>>> -{ >>>> - struct cfq_rb_root *st; >>>> - int i, j; >>>> - >>>> - BUG_ON(atomic_read(&cfqg->ref) <= 0); >>>> - if (!atomic_dec_and_test(&cfqg->ref)) >>>> - return; 
>>>> - for_each_cfqg_st(cfqg, i, j, st) >>>> - BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); >>>> - kfree(cfqg); >>>> -} >>>> - >>>> -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) >>>> -{ >>>> - /* Something wrong if we are trying to remove same group twice */ >>>> - BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); >>>> - >>>> - hlist_del_init(&cfqg->cfqd_node); >>>> - >>>> - /* >>>> - * Put the reference taken at the time of creation so that when all >>>> - * queues are gone, group can be destroyed. >>>> - */ >>>> - cfq_put_cfqg(cfqg); >>>> -} >>>> - >>>> static void cfq_release_cfq_groups(struct cfq_data *cfqd) >>>> { >>>> struct hlist_node *pos, *n; >>>> @@ -1668,9 +1877,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, >>>> if (cfqq == cfqd->active_queue) >>>> cfqd->active_queue = NULL; >>>> >>>> - if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) >>>> - cfqd->grp_service_tree.active = NULL; >>>> - >>>> if (cfqd->active_cic) { >>>> put_io_context(cfqd->active_cic->ioc); >>>> cfqd->active_cic = NULL; >>>> @@ -2173,17 +2379,26 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) >>>> cfqd->noidle_tree_requires_idle = false; >>>> } >>>> >>>> + >>>> static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) >>>> { >>>> - struct cfq_rb_root *st = &cfqd->grp_service_tree; >>>> + struct cfq_group *root_group = &cfqd->root_group; >>>> + struct cfq_rb_root *st = &root_group->grp_service_tree; >>>> struct cfq_group *cfqg; >>>> + struct io_sched_entity *se; >>>> >>>> - if (RB_EMPTY_ROOT(&st->rb)) >>>> - return NULL; >>>> - cfqg = cfq_rb_first_group(st); >>>> - st->active = &cfqg->rb_node; >>>> - update_min_vdisktime(st); >>>> - return cfqg; >>>> + do { >>>> + se = cfq_rb_first_se(st); >>>> + if (!se) >>>> + return NULL; >>>> + st->active = &se->rb_node; >>>> + update_min_vdisktime(st); >>>> + cfqg = cfqg_of_qse(se); >>>> + if (cfqg) >>>> + return cfqg; >>>> + cfqg = 
cfqg_of_gse(se); >>>> + st = &cfqg->grp_service_tree; >>>> + } while (1); >>>> } >>>> >>>> static void cfq_choose_cfqg(struct cfq_data *cfqd) >>>> @@ -2215,15 +2430,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) >>>> if (!cfqq) >>>> goto new_queue; >>>> >>>> + >>>> if (!cfqd->rq_queued) >>>> return NULL; >>>> >>>> + >>>> /* >>>> * We were waiting for group to get backlogged. Expire the queue >>>> */ >>>> if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) >>>> goto expire; >>>> >>>> + >>>> /* >>>> * The active queue has run out of time, expire it and select new. >>>> */ >>>> @@ -2245,6 +2463,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) >>>> goto check_group_idle; >>>> } >>>> >>>> + >>>> /* >>>> * The active queue has requests and isn't expired, allow it to >>>> * dispatch. >>>> @@ -2252,6 +2471,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) >>>> if (!RB_EMPTY_ROOT(&cfqq->sort_list)) >>>> goto keep_queue; >>>> >>>> + >>>> /* >>>> * If another queue has a request waiting within our mean seek >>>> * distance, let it run. 
The expire code will check for close >>>> @@ -2505,6 +2725,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) >>>> } >>>> >>>> cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); >>>> + >>>> return 1; >>>> } >>>> >>>> @@ -3854,17 +4075,25 @@ static void *cfq_init_queue(struct request_queue *q) >>>> >>>> cfqd->cic_index = i; >>>> >>>> - /* Init root service tree */ >>>> - cfqd->grp_service_tree = CFQ_RB_ROOT; >>>> - >>>> /* Init root group */ >>>> cfqg = &cfqd->root_group; >>>> + cfqg->grp_service_tree = CFQ_RB_ROOT; >>>> for_each_cfqg_st(cfqg, i, j, st) >>>> *st = CFQ_RB_ROOT; >>>> - RB_CLEAR_NODE(&cfqg->rb_node); >>>> - >>>> + cfqg->group_se.is_group_se = true; >>>> + RB_CLEAR_NODE(&cfqg->group_se.rb_node); >>>> + cfqg->group_se.on_st = false; >>>> /* Give preference to root group over other groups */ >>>> - cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; >>>> + cfqg->group_se.weight = 2*BLKIO_WEIGHT_DEFAULT; >>>> + cfqg->group_se.parent = NULL; >>>> + cfqg->group_se.st = NULL; >>>> + >>>> + cfqg->queue_se.is_group_se = false; >>>> + RB_CLEAR_NODE(&cfqg->queue_se.rb_node); >>>> + cfqg->queue_se.on_st = false; >>>> + cfqg->queue_se.weight = 2*BLKIO_WEIGHT_DEFAULT; >>>> + cfqg->queue_se.parent = &cfqg->group_se; >>>> + cfqg->queue_se.st = &cfqg->grp_service_tree; >>>> >>>> #ifdef CONFIG_CFQ_GROUP_IOSCHED >>>> /* >>>> -- >>>> 1.6.5.2 >>>> >>>> > > ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-09-01 8:48 ` Gui Jianfeng @ 2010-09-01 9:02 ` KAMEZAWA Hiroyuki 2010-09-02 2:29 ` Vivek Goyal 0 siblings, 1 reply; 19+ messages in thread From: KAMEZAWA Hiroyuki @ 2010-09-01 9:02 UTC (permalink / raw) To: Gui Jianfeng Cc: Vivek Goyal, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, Nauman Rafique, linux kernel mailing list On Wed, 01 Sep 2010 16:48:25 +0800 Gui Jianfeng <guijianfeng@cn.fujitsu.com> wrote: > Add Kamezawa > > Vivek Goyal wrote: > > On Tue, Aug 31, 2010 at 08:29:20AM +0800, Gui Jianfeng wrote: > >> Vivek Goyal wrote: > >>> On Mon, Aug 30, 2010 at 02:50:40PM +0800, Gui Jianfeng wrote: > >>>> Hi All, > >>>> > >>>> This patch enables cfq group hierarchical scheduling. > >>>> > >>>> With this patch, you can create a cgroup directory deeper than level 1. > >>>> Now, I/O Bandwidth is distributed in a hierarchy way. For example: > >>>> We create cgroup directories as following(the number represents weight): > >>>> > >>>> Root grp > >>>> / \ > >>>> grp_1(100) grp_2(400) > >>>> / \ > >>>> grp_3(200) grp_4(300) > >>>> > >>>> If grp_2 grp_3 and grp_4 are contending for I/O Bandwidth, > >>>> grp_2 will share 80% of total bandwidth. > >>>> For sub_groups, grp_3 shares 8%(20% * 40%), grp_4 shares 12%(20% * 60%) > >>>> > >>>> Design: > >>>> o Each cfq group has its own group service tree. > >>>> o Each cfq group contains a "group schedule entity" (gse) that > >>>> schedules on parent cfq group's service tree. > >>>> o Each cfq group contains a "queue schedule entity"(qse), it > >>>> represents all cfqqs located on this cfq group. It schedules > >>>> on this group's service tree. For the time being, root group > >>>> qse's weight is 1000, and subgroup qse's weight is 500. > >>>> o All gses and qse which belones to a same cfq group schedules > >>>> on the same group service tree. > >>> Hi Gui, > >>> > >>> Thanks for the patch. I have few questions. 
> >>> > >>> - So how does the hierarchy look like, w.r.t root group. Something as > >>> follows? > >>> > >>> > >>> root > >>> / | \ > >>> q1 q2 G1 > >>> > >>> Assume there are two processes doin IO in root group and q1 and q2 are > >>> cfqq queues for those processes and G1 is the cgroup created by user. > >>> > >>> If yes, then what algorithm do you use to do scheduling between q1, q2 > >>> and G1? IOW, currently we have two algorithms operating in CFQ. One for > >>> cfqq and other for groups. Group algorithm does not use the logic of > >>> cfq_slice_offset(). > >> Hi Vivek, > >> > >> This patch doesn't break the original sheduling logic. That is cfqg => st => cfqq. > >> If q1 and q2 in root group, I treat q1 and q2 bundle as a queue sched entity, and > >> it will schedule on root group service with G1, as following: > >> > >> root group > >> / \ > >> qse(q1,q2) gse(G1) > >> > > > > Ok. That's interesting. That raises another question that how hierarchy > > should look like. IOW, how queue and groups should be treated in > > hierarchy. > > > > CFS cpu scheduler treats queues and group at the same level. That is as > > follows. > > > > root > > / | \ > > q1 q2 G1 > > > > In the past I had raised this question and Jens and corrado liked treating > > queues and group at same level. > > > > Logically, q1, q2 and G1 are all children of root, so it makes sense to > > treat them at same level and not group q1 and q2 in to a single entity and > > group. > > Hi Vivek, > > IMO, this new approach keeps the original scheduling logic, and keep things > simple. And to me, this approach works for me. So i choose it. > > > > > One of the possible way forward could be this. > > > > - Treat queue and group at same level (like CFS) > > > > - Get rid of cfq_slice_offset() logic. That means without idling on, there > > will be no ioprio difference between cfq queues. I think anyway as of > > today that logic helps in so little situations that I would not mind > > getting rid of it. 
Just that Jens should agree to it. > > > > - With this new scheme, it will break the existing semantics of root group > > being at same level as child groups. To avoid that, we can probably > > implement two modes (flat and hierarchical), something similar to what > > memory cgroup controller has done. May be one tunable in root cgroup of > > blkio "use_hierarchy". By default everything will be in flat mode and > > if user wants hiearchical control, he needs to set user_hierarchy in > > root group. > > > > I think memory controller provides "use_hierarchy" tunable in each > > cgroup. I am not sure why do we need it in each cgroup and not just > > in root cgroup. > > I think Kamezawa-san should be able to answer this question. :) > At first, please be sure that "hierarchical accounting is _very_ slow". Please measure how hierarchical accounting (of 4-6 levels) are slow ;) Then, there are 2 use cases. 1) root/to/some/directory/A /B /C .... All A, B, C ....are flat cgroup and has no relationship, not sharing limit. In this case, hierarchy should not be enabled. 2) root/to/some/directory/Gold/A,B,C... Silver/D,E,F All A, B, C ....are limited by "Gold" or "Silver". But Gold and Silver has no relationship, they has their own limitations. But A, B, C, D, E, F shares limit under Gold or Silver. In this case, hierarchy "root/to/some/directory" should be disabled. Gold/ and Silver should have use_hierarchy=1. (Assume these Gold and Silver as Container and the user of container divides memory into A and B, C...) For example, libvirt makes very long "root/to/some/directory" ... I never want to count-up all counters in the hierarchy even if we'd like to use some fantastic hierarchical accounting under a container. I don't like "all or nothing" option (as making use_hierarchy as mount option or has parameter on root cgroup etc..) Then, allowed mixture. Thanks, -Kame ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-09-01 9:02 ` KAMEZAWA Hiroyuki @ 2010-09-02 2:29 ` Vivek Goyal 2010-09-02 2:42 ` KAMEZAWA Hiroyuki 0 siblings, 1 reply; 19+ messages in thread From: Vivek Goyal @ 2010-09-02 2:29 UTC (permalink / raw) To: KAMEZAWA Hiroyuki Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, Nauman Rafique, linux kernel mailing list On Wed, Sep 01, 2010 at 06:02:43PM +0900, KAMEZAWA Hiroyuki wrote: [..] > > > One of the possible way forward could be this. > > > > > > - Treat queue and group at same level (like CFS) > > > > > > - Get rid of cfq_slice_offset() logic. That means without idling on, there > > > will be no ioprio difference between cfq queues. I think anyway as of > > > today that logic helps in so little situations that I would not mind > > > getting rid of it. Just that Jens should agree to it. > > > > > > - With this new scheme, it will break the existing semantics of root group > > > being at same level as child groups. To avoid that, we can probably > > > implement two modes (flat and hierarchical), something similar to what > > > memory cgroup controller has done. May be one tunable in root cgroup of > > > blkio "use_hierarchy". By default everything will be in flat mode and > > > if user wants hiearchical control, he needs to set user_hierarchy in > > > root group. > > > > > > I think memory controller provides "use_hierarchy" tunable in each > > > cgroup. I am not sure why do we need it in each cgroup and not just > > > in root cgroup. > > > > I think Kamezawa-san should be able to answer this question. :) > > > > At first, please be sure that "hierarchical accounting is _very_ slow". > Please measure how hierarchical accounting (of 4-6 levels) are slow ;) > > Then, there are 2 use cases. > > 1) root/to/some/directory/A > /B > /C > .... > All A, B, C ....are flat cgroup and has no relationship, not sharing limit. > In this case, hierarchy should not be enabled. 
> > 2) root/to/some/directory/Gold/A,B,C... > Silver/D,E,F > > All A, B, C ....are limited by "Gold" or "Silver". > But Gold and Silver has no relationthip, they has their own limitations. > But A, B, C, D, E, F shares limit under Gold or Silver. > In this case, hierarchy > "root/to/some/directory" should be disabled. > Gold/ and Silver should have use_hierarchy=1. > > (Assume these Gold and Silver as Container and the user of container > divides memory into A and B, C...) > > For example, libvirt makes very long "root/to/some/directory" ... > I never want to count-up all counters in the hierarchy even if > we'd like to use some fantasic hierarchical accounting under a container. > > I don't like "all or nothing" option (as making use_hierarchy as mount > option or has parameter on root cgroup etc..) Then, allowed mixture. Hi Kame San, If you don't want any relationship between Gold and Silver then one can make root as unlimited group (limit_in_bytes = -1) and practically Gold and Silver have no dependency. There is no need of setting use_hierarchy different at root level and inside Gold/ and Silver/ groups? It sounds like you did it for two reasons. - It can potentially provide more flexibility. - performance reason so that you can stop do hierarchical accounting all the way to root and stop before that (libvirt example). I think for blkio controller we can probably begin with either a mount time option or a use_hierachy file in root group and then later make it per group if there are use cases. Vivek ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support 2010-09-02 2:29 ` Vivek Goyal @ 2010-09-02 2:42 ` KAMEZAWA Hiroyuki 0 siblings, 0 replies; 19+ messages in thread From: KAMEZAWA Hiroyuki @ 2010-09-02 2:42 UTC (permalink / raw) To: Vivek Goyal Cc: Gui Jianfeng, Jens Axboe, Jeff Moyer, Divyesh Shah, Corrado Zoccolo, Nauman Rafique, linux kernel mailing list On Wed, 1 Sep 2010 22:29:36 -0400 Vivek Goyal <vgoyal@redhat.com> wrote: > On Wed, Sep 01, 2010 at 06:02:43PM +0900, KAMEZAWA Hiroyuki wrote: > > [..] > > > > One of the possible way forward could be this. > > > > > > > > - Treat queue and group at same level (like CFS) > > > > > > > > - Get rid of cfq_slice_offset() logic. That means without idling on, there > > > > will be no ioprio difference between cfq queues. I think anyway as of > > > > today that logic helps in so little situations that I would not mind > > > > getting rid of it. Just that Jens should agree to it. > > > > > > > > - With this new scheme, it will break the existing semantics of root group > > > > being at same level as child groups. To avoid that, we can probably > > > > implement two modes (flat and hierarchical), something similar to what > > > > memory cgroup controller has done. May be one tunable in root cgroup of > > > > blkio "use_hierarchy". By default everything will be in flat mode and > > > > if user wants hiearchical control, he needs to set user_hierarchy in > > > > root group. > > > > > > > > I think memory controller provides "use_hierarchy" tunable in each > > > > cgroup. I am not sure why do we need it in each cgroup and not just > > > > in root cgroup. > > > > > > I think Kamezawa-san should be able to answer this question. :) > > > > > > > At first, please be sure that "hierarchical accounting is _very_ slow". > > Please measure how hierarchical accounting (of 4-6 levels) are slow ;) > > > > Then, there are 2 use cases. > > > > 1) root/to/some/directory/A > > /B > > /C > > .... 
> > All A, B, C ....are flat cgroup and has no relationship, not sharing limit. > > In this case, hierarchy should not be enabled. > > > > 2) root/to/some/directory/Gold/A,B,C... > > Silver/D,E,F > > > > All A, B, C ....are limited by "Gold" or "Silver". > > But Gold and Silver has no relationthip, they has their own limitations. > > But A, B, C, D, E, F shares limit under Gold or Silver. > > In this case, hierarchy > > "root/to/some/directory" should be disabled. > > Gold/ and Silver should have use_hierarchy=1. > > > > (Assume these Gold and Silver as Container and the user of container > > divides memory into A and B, C...) > > > > For example, libvirt makes very long "root/to/some/directory" ... > > I never want to count-up all counters in the hierarchy even if > > we'd like to use some fantasic hierarchical accounting under a container. > > > > I don't like "all or nothing" option (as making use_hierarchy as mount > > option or has parameter on root cgroup etc..) Then, allowed mixture. > > Hi Kame San, > > If you don't want any relationship between Gold and Silver then one can > make root as unlimited group (limit_in_bytes = -1) and practically Gold > and Silver have no dependency. There is no need of setting use_hierarchy > different at root level and inside Gold/ and Silver/ groups? > and counts up 4 levels accounting ? ;) We allow mixture of /root/to/some/directory/Gold/ /Silver /Extraone Gold and Silver can be under some limitation, of course. (For example, Extraone is for system-admin and not-for-user. System admin is in another container than users.) > It sounds like you did it for two reasons. > - It can potentially provide more flexibility. Right. > - performance reason so that you can stop do hierarchical accounting > all the way to root and stop before that (libvirt example). Yes.
> > I think for blkio controller we can probably begin with either a mount > time option or a use_hierachy file in root group and then later make > it per group if there are use cases. > I hope something flexible. Complexity to the code is not very big. Thanks, -Kame ^ permalink raw reply [flat|nested] 19+ messages in thread
end of thread, other threads:[~2010-09-02 2:47 UTC | newest] Thread overview: 19+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2010-08-30 6:50 [RFC] [PATCH] cfq-iosched: add cfq group hierarchical scheduling support Gui Jianfeng 2010-08-30 18:20 ` Chad Talbott 2010-08-31 0:35 ` Gui Jianfeng 2010-08-30 20:36 ` Vivek Goyal 2010-08-31 0:29 ` Gui Jianfeng 2010-08-31 12:57 ` Vivek Goyal 2010-08-31 15:40 ` Nauman Rafique 2010-08-31 19:25 ` Vivek Goyal 2010-09-01 8:50 ` Gui Jianfeng 2010-09-01 15:49 ` Nauman Rafique 2010-09-01 17:10 ` Vivek Goyal 2010-09-01 17:15 ` Nauman Rafique 2010-09-01 17:21 ` Vivek Goyal 2010-09-02 0:30 ` Gui Jianfeng 2010-09-02 2:20 ` Vivek Goyal 2010-09-01 8:48 ` Gui Jianfeng 2010-09-01 9:02 ` KAMEZAWA Hiroyuki 2010-09-02 2:29 ` Vivek Goyal 2010-09-02 2:42 ` KAMEZAWA Hiroyuki
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox