* [PATCH 1/8 v2] cfq-iosched: Introduce cfq_entity for CFQ queue
[not found] ` <4D01C6AB.9040807@cn.fujitsu.com>
@ 2010-12-13 1:44 ` Gui Jianfeng
2010-12-13 15:44 ` Vivek Goyal
2010-12-13 1:44 ` [PATCH 2/8 v2] cfq-iosched: Introduce cfq_entity for CFQ group Gui Jianfeng
` (6 subsequent siblings)
7 siblings, 1 reply; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-13 1:44 UTC
To: Jens Axboe, Vivek Goyal
Cc: Gui Jianfeng, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Introduce cfq_entity for CFQ queue
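As an illustration of the embedding pattern this patch introduces, below is a
minimal userspace sketch: the service-tree fields move into a cfq_entity
embedded in the queue, and container_of() recovers the queue from its entity.
The structures are simplified stand-ins for the kernel ones, and
container_of() is open-coded so the sketch builds on its own:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Simplified stand-in for the entity scheduled on the service tree. */
struct cfq_entity {
	unsigned long rb_key;		/* position on the service tree */
};

/* Simplified stand-in for the per-process queue. */
struct cfq_queue {
	struct cfq_entity cfqe;		/* embedded schedule entity */
	int pid;
};

static struct cfq_queue *cfqq_of_entity(struct cfq_entity *cfqe)
{
	if (cfqe)
		return container_of(cfqe, struct cfq_queue, cfqe);
	return NULL;
}

int main(void)
{
	struct cfq_queue cfqq = { .cfqe = { .rb_key = 42 }, .pid = 1 };
	struct cfq_entity *cfqe = &cfqq.cfqe;

	/* The service tree only ever sees entities; recover the queue. */
	printf("pid=%d rb_key=%lu\n", cfqq_of_entity(cfqe)->pid, cfqe->rb_key);
	return 0;
}

The same pattern later allows queues and groups to share one service tree,
since the tree code only ever manipulates cfq_entity nodes.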
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
block/cfq-iosched.c | 125 +++++++++++++++++++++++++++++++++-----------------
1 files changed, 82 insertions(+), 43 deletions(-)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5d0349d..9b07a24 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -91,20 +91,31 @@ struct cfq_rb_root {
#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
.count = 0, .min_vdisktime = 0, }
+
+/*
+ * This is the CFQ queue scheduling entity, which is scheduled on a service tree.
+ */
+struct cfq_entity {
+ /* service tree */
+ struct cfq_rb_root *service_tree;
+ /* service_tree member */
+ struct rb_node rb_node;
+ /* service_tree key, represent the position on the tree */
+ unsigned long rb_key;
+};
+
/*
* Per process-grouping structure
*/
struct cfq_queue {
+ /* The schedule entity */
+ struct cfq_entity cfqe;
/* reference count */
atomic_t ref;
/* various state flags, see below */
unsigned int flags;
/* parent cfq_data */
struct cfq_data *cfqd;
- /* service_tree member */
- struct rb_node rb_node;
- /* service_tree key */
- unsigned long rb_key;
/* prio tree member */
struct rb_node p_node;
/* prio tree root we belong to, if any */
@@ -143,7 +154,6 @@ struct cfq_queue {
u32 seek_history;
sector_t last_request_pos;
- struct cfq_rb_root *service_tree;
struct cfq_queue *new_cfqq;
struct cfq_group *cfqg;
struct cfq_group *orig_cfqg;
@@ -302,6 +312,15 @@ struct cfq_data {
struct rcu_head rcu;
};
+static inline struct cfq_queue *
+cfqq_of_entity(struct cfq_entity *cfqe)
+{
+ if (cfqe)
+ return container_of(cfqe, struct cfq_queue,
+ cfqe);
+ return NULL;
+}
+
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
@@ -743,7 +762,7 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
/*
* The below is leftmost cache rbtree addon
*/
-static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
+static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
{
/* Service tree is empty */
if (!root->count)
@@ -753,7 +772,7 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
root->left = rb_first(&root->rb);
if (root->left)
- return rb_entry(root->left, struct cfq_queue, rb_node);
+ return rb_entry(root->left, struct cfq_entity, rb_node);
return NULL;
}
@@ -1170,21 +1189,24 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
bool add_front)
{
+ struct cfq_entity *cfqe;
struct rb_node **p, *parent;
- struct cfq_queue *__cfqq;
+ struct cfq_entity *__cfqe;
unsigned long rb_key;
struct cfq_rb_root *service_tree;
int left;
int new_cfqq = 1;
int group_changed = 0;
+ cfqe = &cfqq->cfqe;
+
#ifdef CONFIG_CFQ_GROUP_IOSCHED
if (!cfqd->cfq_group_isolation
&& cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
&& cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
/* Move this cfq to root group */
cfq_log_cfqq(cfqd, cfqq, "moving to root group");
- if (!RB_EMPTY_NODE(&cfqq->rb_node))
+ if (!RB_EMPTY_NODE(&cfqe->rb_node))
cfq_group_service_tree_del(cfqd, cfqq->cfqg);
cfqq->orig_cfqg = cfqq->cfqg;
cfqq->cfqg = &cfqd->root_group;
@@ -1194,7 +1216,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
&& cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
/* cfqq is sequential now needs to go to its original group */
BUG_ON(cfqq->cfqg != &cfqd->root_group);
- if (!RB_EMPTY_NODE(&cfqq->rb_node))
+ if (!RB_EMPTY_NODE(&cfqe->rb_node))
cfq_group_service_tree_del(cfqd, cfqq->cfqg);
cfq_put_cfqg(cfqq->cfqg);
cfqq->cfqg = cfqq->orig_cfqg;
@@ -1209,9 +1231,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
if (cfq_class_idle(cfqq)) {
rb_key = CFQ_IDLE_DELAY;
parent = rb_last(&service_tree->rb);
- if (parent && parent != &cfqq->rb_node) {
- __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
- rb_key += __cfqq->rb_key;
+ if (parent && parent != &cfqe->rb_node) {
+ __cfqe = rb_entry(parent,
+ struct cfq_entity,
+ rb_node);
+ rb_key += __cfqe->rb_key;
} else
rb_key += jiffies;
} else if (!add_front) {
@@ -1226,37 +1250,39 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfqq->slice_resid = 0;
} else {
rb_key = -HZ;
- __cfqq = cfq_rb_first(service_tree);
- rb_key += __cfqq ? __cfqq->rb_key : jiffies;
+ __cfqe = cfq_rb_first(service_tree);
+ rb_key += __cfqe ? __cfqe->rb_key : jiffies;
}
- if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
+ if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
new_cfqq = 0;
/*
* same position, nothing more to do
*/
- if (rb_key == cfqq->rb_key &&
- cfqq->service_tree == service_tree)
+ if (rb_key == cfqe->rb_key &&
+ cfqe->service_tree == service_tree)
return;
- cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
- cfqq->service_tree = NULL;
+ cfq_rb_erase(&cfqe->rb_node,
+ cfqe->service_tree);
+ cfqe->service_tree = NULL;
}
left = 1;
parent = NULL;
- cfqq->service_tree = service_tree;
+ cfqe->service_tree = service_tree;
p = &service_tree->rb.rb_node;
while (*p) {
struct rb_node **n;
parent = *p;
- __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
+ __cfqe = rb_entry(parent, struct cfq_entity,
+ rb_node);
/*
* sort by key, that represents service time.
*/
- if (time_before(rb_key, __cfqq->rb_key))
+ if (time_before(rb_key, __cfqe->rb_key))
n = &(*p)->rb_left;
else {
n = &(*p)->rb_right;
@@ -1267,11 +1293,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
}
if (left)
- service_tree->left = &cfqq->rb_node;
+ service_tree->left = &cfqe->rb_node;
- cfqq->rb_key = rb_key;
- rb_link_node(&cfqq->rb_node, parent, p);
- rb_insert_color(&cfqq->rb_node, &service_tree->rb);
+ cfqe->rb_key = rb_key;
+ rb_link_node(&cfqe->rb_node, parent, p);
+ rb_insert_color(&cfqe->rb_node, &service_tree->rb);
service_tree->count++;
if ((add_front || !new_cfqq) && !group_changed)
return;
@@ -1373,13 +1399,17 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
*/
static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
+ struct cfq_entity *cfqe;
cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
BUG_ON(!cfq_cfqq_on_rr(cfqq));
cfq_clear_cfqq_on_rr(cfqq);
- if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
- cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
- cfqq->service_tree = NULL;
+ cfqe = &cfqq->cfqe;
+
+ if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
+ cfq_rb_erase(&cfqe->rb_node,
+ cfqe->service_tree);
+ cfqe->service_tree = NULL;
}
if (cfqq->p_root) {
rb_erase(&cfqq->p_node, cfqq->p_root);
@@ -1707,13 +1737,13 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
return NULL;
if (RB_EMPTY_ROOT(&service_tree->rb))
return NULL;
- return cfq_rb_first(service_tree);
+ return cfqq_of_entity(cfq_rb_first(service_tree));
}
static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
{
struct cfq_group *cfqg;
- struct cfq_queue *cfqq;
+ struct cfq_entity *cfqe;
int i, j;
struct cfq_rb_root *st;
@@ -1724,9 +1754,11 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
if (!cfqg)
return NULL;
- for_each_cfqg_st(cfqg, i, j, st)
- if ((cfqq = cfq_rb_first(st)) != NULL)
- return cfqq;
+ for_each_cfqg_st(cfqg, i, j, st) {
+ cfqe = cfq_rb_first(st);
+ if (cfqe != NULL)
+ return cfqq_of_entity(cfqe);
+ }
return NULL;
}
@@ -1863,9 +1895,12 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
+ struct cfq_entity *cfqe;
enum wl_prio_t prio = cfqq_prio(cfqq);
- struct cfq_rb_root *service_tree = cfqq->service_tree;
+ struct cfq_rb_root *service_tree;
+ cfqe = &cfqq->cfqe;
+ service_tree = cfqe->service_tree;
BUG_ON(!service_tree);
BUG_ON(!service_tree->count);
@@ -2075,7 +2110,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
struct cfq_group *cfqg, enum wl_prio_t prio)
{
- struct cfq_queue *queue;
+ struct cfq_entity *cfqe;
int i;
bool key_valid = false;
unsigned long lowest_key = 0;
@@ -2083,10 +2118,11 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
for (i = 0; i <= SYNC_WORKLOAD; ++i) {
/* select the one with lowest rb_key */
- queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
- if (queue &&
- (!key_valid || time_before(queue->rb_key, lowest_key))) {
- lowest_key = queue->rb_key;
+ cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
+ if (cfqe &&
+ (!key_valid ||
+ time_before(cfqe->rb_key, lowest_key))) {
+ lowest_key = cfqe->rb_key;
cur_best = i;
key_valid = true;
}
@@ -2834,7 +2870,10 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc)
static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
pid_t pid, bool is_sync)
{
- RB_CLEAR_NODE(&cfqq->rb_node);
+ struct cfq_entity *cfqe;
+
+ cfqe = &cfqq->cfqe;
+ RB_CLEAR_NODE(&cfqe->rb_node);
RB_CLEAR_NODE(&cfqq->p_node);
INIT_LIST_HEAD(&cfqq->fifo);
@@ -3243,7 +3282,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
/* Allow preemption only if we are idling on sync-noidle tree */
if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
- new_cfqq->service_tree->count == 2 &&
+ new_cfqq->cfqe.service_tree->count == 2 &&
RB_EMPTY_ROOT(&cfqq->sort_list))
return true;
--
1.6.5.2
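For context, the service trees this patch retypes are leftmost-cached
rbtrees: the root caches a pointer to its smallest node, and cfq_rb_first()
lazily recomputes that cache after it has been invalidated. A minimal sketch
of the pattern, using a plain unbalanced binary search tree in place of the
kernel's red-black tree (names are illustrative):

#include <stdio.h>

struct node {
	unsigned long key;
	struct node *left, *right;
};

struct cached_root {
	struct node *rb;	/* tree root */
	struct node *leftmost;	/* cached smallest node, NULL when stale */
	int count;
};

static void tree_insert(struct cached_root *root, struct node *n)
{
	struct node **p = &root->rb;

	while (*p)
		p = (n->key < (*p)->key) ? &(*p)->left : &(*p)->right;
	*p = n;
	root->leftmost = NULL;	/* invalidate; recomputed lazily */
	root->count++;
}

static struct node *tree_first(struct cached_root *root)
{
	struct node *n;

	if (!root->count)
		return NULL;	/* tree is empty */

	if (!root->leftmost) {	/* cache stale: walk left once */
		for (n = root->rb; n && n->left; n = n->left)
			;
		root->leftmost = n;
	}
	return root->leftmost;	/* subsequent lookups need no walk */
}

int main(void)
{
	struct cached_root root = { 0 };
	struct node a = { .key = 30 }, b = { .key = 10 }, c = { .key = 20 };

	tree_insert(&root, &a);
	tree_insert(&root, &b);
	tree_insert(&root, &c);
	printf("first key = %lu\n", tree_first(&root)->key);	/* 10 */
	return 0;
}

The kernel version differs in using struct rb_node and rb_first() for the
recomputation, but the caching idea is the same.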
* Re: [PATCH 1/8 v2] cfq-iosched: Introduce cfq_entity for CFQ queue
2010-12-13 1:44 ` [PATCH 1/8 v2] cfq-iosched: Introduce cfq_entity for CFQ queue Gui Jianfeng
@ 2010-12-13 15:44 ` Vivek Goyal
2010-12-14 1:30 ` Gui Jianfeng
0 siblings, 1 reply; 41+ messages in thread
From: Vivek Goyal @ 2010-12-13 15:44 UTC
To: Gui Jianfeng
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
On Mon, Dec 13, 2010 at 09:44:24AM +0800, Gui Jianfeng wrote:
> Introduce cfq_entity for CFQ queue
>
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> ---
> block/cfq-iosched.c | 125 +++++++++++++++++++++++++++++++++-----------------
> 1 files changed, 82 insertions(+), 43 deletions(-)
>
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 5d0349d..9b07a24 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -91,20 +91,31 @@ struct cfq_rb_root {
> #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
> .count = 0, .min_vdisktime = 0, }
>
> +
> +/*
> + * This is the CFQ queue scheduling entity, which is scheduled on a service tree.
> + */
> +struct cfq_entity {
> + /* service tree */
> + struct cfq_rb_root *service_tree;
> + /* service_tree member */
> + struct rb_node rb_node;
> + /* service_tree key, represent the position on the tree */
> + unsigned long rb_key;
> +};
> +
> /*
> * Per process-grouping structure
> */
> struct cfq_queue {
> + /* The schedule entity */
> + struct cfq_entity cfqe;
> /* reference count */
> atomic_t ref;
> /* various state flags, see below */
> unsigned int flags;
> /* parent cfq_data */
> struct cfq_data *cfqd;
> - /* service_tree member */
> - struct rb_node rb_node;
> - /* service_tree key */
> - unsigned long rb_key;
> /* prio tree member */
> struct rb_node p_node;
> /* prio tree root we belong to, if any */
> @@ -143,7 +154,6 @@ struct cfq_queue {
> u32 seek_history;
> sector_t last_request_pos;
>
> - struct cfq_rb_root *service_tree;
> struct cfq_queue *new_cfqq;
> struct cfq_group *cfqg;
> struct cfq_group *orig_cfqg;
> @@ -302,6 +312,15 @@ struct cfq_data {
> struct rcu_head rcu;
> };
>
> +static inline struct cfq_queue *
> +cfqq_of_entity(struct cfq_entity *cfqe)
> +{
> + if (cfqe)
> + return container_of(cfqe, struct cfq_queue,
> + cfqe);
> + return NULL;
> +}
> +
> static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>
> static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
> @@ -743,7 +762,7 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
> /*
> * The below is leftmost cache rbtree addon
> */
> -static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
> +static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
> {
> /* Service tree is empty */
> if (!root->count)
> @@ -753,7 +772,7 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
> root->left = rb_first(&root->rb);
>
> if (root->left)
> - return rb_entry(root->left, struct cfq_queue, rb_node);
> + return rb_entry(root->left, struct cfq_entity, rb_node);
>
> return NULL;
> }
> @@ -1170,21 +1189,24 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
> static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> bool add_front)
> {
> + struct cfq_entity *cfqe;
> struct rb_node **p, *parent;
> - struct cfq_queue *__cfqq;
> + struct cfq_entity *__cfqe;
> unsigned long rb_key;
> struct cfq_rb_root *service_tree;
> int left;
> int new_cfqq = 1;
> int group_changed = 0;
>
> + cfqe = &cfqq->cfqe;
> +
> #ifdef CONFIG_CFQ_GROUP_IOSCHED
> if (!cfqd->cfq_group_isolation
> && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
> && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
> /* Move this cfq to root group */
> cfq_log_cfqq(cfqd, cfqq, "moving to root group");
> - if (!RB_EMPTY_NODE(&cfqq->rb_node))
> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
> cfq_group_service_tree_del(cfqd, cfqq->cfqg);
> cfqq->orig_cfqg = cfqq->cfqg;
> cfqq->cfqg = &cfqd->root_group;
> @@ -1194,7 +1216,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
> /* cfqq is sequential now needs to go to its original group */
> BUG_ON(cfqq->cfqg != &cfqd->root_group);
> - if (!RB_EMPTY_NODE(&cfqq->rb_node))
> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
> cfq_group_service_tree_del(cfqd, cfqq->cfqg);
> cfq_put_cfqg(cfqq->cfqg);
> cfqq->cfqg = cfqq->orig_cfqg;
> @@ -1209,9 +1231,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> if (cfq_class_idle(cfqq)) {
> rb_key = CFQ_IDLE_DELAY;
> parent = rb_last(&service_tree->rb);
> - if (parent && parent != &cfqq->rb_node) {
> - __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
> - rb_key += __cfqq->rb_key;
> + if (parent && parent != &cfqe->rb_node) {
> + __cfqe = rb_entry(parent,
> + struct cfq_entity,
> + rb_node);
Above can fit into a single line or at max two lines?
> + rb_key += __cfqe->rb_key;
> } else
> rb_key += jiffies;
> } else if (!add_front) {
> @@ -1226,37 +1250,39 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> cfqq->slice_resid = 0;
> } else {
> rb_key = -HZ;
> - __cfqq = cfq_rb_first(service_tree);
> - rb_key += __cfqq ? __cfqq->rb_key : jiffies;
> + __cfqe = cfq_rb_first(service_tree);
> + rb_key += __cfqe ? __cfqe->rb_key : jiffies;
> }
>
> - if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
> + if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> new_cfqq = 0;
> /*
> * same position, nothing more to do
> */
> - if (rb_key == cfqq->rb_key &&
> - cfqq->service_tree == service_tree)
> + if (rb_key == cfqe->rb_key &&
> + cfqe->service_tree == service_tree)
> return;
>
> - cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
> - cfqq->service_tree = NULL;
> + cfq_rb_erase(&cfqe->rb_node,
> + cfqe->service_tree);
Above can fit on single line?
> + cfqe->service_tree = NULL;
> }
>
> left = 1;
> parent = NULL;
> - cfqq->service_tree = service_tree;
> + cfqe->service_tree = service_tree;
> p = &service_tree->rb.rb_node;
> while (*p) {
> struct rb_node **n;
>
> parent = *p;
> - __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
> + __cfqe = rb_entry(parent, struct cfq_entity,
> + rb_node);
Single line.
>
> /*
> * sort by key, that represents service time.
> */
> - if (time_before(rb_key, __cfqq->rb_key))
> + if (time_before(rb_key, __cfqe->rb_key))
> n = &(*p)->rb_left;
> else {
> n = &(*p)->rb_right;
> @@ -1267,11 +1293,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> }
>
> if (left)
> - service_tree->left = &cfqq->rb_node;
> + service_tree->left = &cfqe->rb_node;
>
> - cfqq->rb_key = rb_key;
> - rb_link_node(&cfqq->rb_node, parent, p);
> - rb_insert_color(&cfqq->rb_node, &service_tree->rb);
> + cfqe->rb_key = rb_key;
> + rb_link_node(&cfqe->rb_node, parent, p);
> + rb_insert_color(&cfqe->rb_node, &service_tree->rb);
> service_tree->count++;
> if ((add_front || !new_cfqq) && !group_changed)
> return;
> @@ -1373,13 +1399,17 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> */
> static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> {
> + struct cfq_entity *cfqe;
> cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
> BUG_ON(!cfq_cfqq_on_rr(cfqq));
> cfq_clear_cfqq_on_rr(cfqq);
>
> - if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
> - cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
> - cfqq->service_tree = NULL;
> + cfqe = &cfqq->cfqe;
> +
> + if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> + cfq_rb_erase(&cfqe->rb_node,
> + cfqe->service_tree);
Single line above.
> + cfqe->service_tree = NULL;
> }
> if (cfqq->p_root) {
> rb_erase(&cfqq->p_node, cfqq->p_root);
> @@ -1707,13 +1737,13 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
> return NULL;
> if (RB_EMPTY_ROOT(&service_tree->rb))
> return NULL;
> - return cfq_rb_first(service_tree);
> + return cfqq_of_entity(cfq_rb_first(service_tree));
> }
>
> static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
> {
> struct cfq_group *cfqg;
> - struct cfq_queue *cfqq;
> + struct cfq_entity *cfqe;
> int i, j;
> struct cfq_rb_root *st;
>
> @@ -1724,9 +1754,11 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
> if (!cfqg)
> return NULL;
>
> - for_each_cfqg_st(cfqg, i, j, st)
> - if ((cfqq = cfq_rb_first(st)) != NULL)
> - return cfqq;
> + for_each_cfqg_st(cfqg, i, j, st) {
> + cfqe = cfq_rb_first(st);
> + if (cfqe != NULL)
> + return cfqq_of_entity(cfqe);
> + }
> return NULL;
> }
>
> @@ -1863,9 +1895,12 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
>
> static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> {
> + struct cfq_entity *cfqe;
> enum wl_prio_t prio = cfqq_prio(cfqq);
> - struct cfq_rb_root *service_tree = cfqq->service_tree;
> + struct cfq_rb_root *service_tree;
>
> + cfqe = &cfqq->cfqe;
> + service_tree = cfqe->service_tree;
> BUG_ON(!service_tree);
> BUG_ON(!service_tree->count);
>
> @@ -2075,7 +2110,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
> static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> struct cfq_group *cfqg, enum wl_prio_t prio)
> {
> - struct cfq_queue *queue;
> + struct cfq_entity *cfqe;
> int i;
> bool key_valid = false;
> unsigned long lowest_key = 0;
> @@ -2083,10 +2118,11 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>
> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> /* select the one with lowest rb_key */
> - queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
> - if (queue &&
> - (!key_valid || time_before(queue->rb_key, lowest_key))) {
> - lowest_key = queue->rb_key;
> + cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> + if (cfqe &&
> + (!key_valid ||
> + time_before(cfqe->rb_key, lowest_key))) {
Merge two lines into one above.
> + lowest_key = cfqe->rb_key;
> cur_best = i;
> key_valid = true;
> }
> @@ -2834,7 +2870,10 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc)
> static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> pid_t pid, bool is_sync)
> {
> - RB_CLEAR_NODE(&cfqq->rb_node);
> + struct cfq_entity *cfqe;
> +
> + cfqe = &cfqq->cfqe;
> + RB_CLEAR_NODE(&cfqe->rb_node);
> RB_CLEAR_NODE(&cfqq->p_node);
> INIT_LIST_HEAD(&cfqq->fifo);
>
> @@ -3243,7 +3282,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
> /* Allow preemption only if we are idling on sync-noidle tree */
> if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
> cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
> - new_cfqq->service_tree->count == 2 &&
> + new_cfqq->cfqe.service_tree->count == 2 &&
> RB_EMPTY_ROOT(&cfqq->sort_list))
> return true;
>
Apart from the above minor nits, this patch looks good to me.
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Vivek
* Re: [PATCH 1/8 v2] cfq-iosched: Introduce cfq_entity for CFQ queue
2010-12-13 15:44 ` Vivek Goyal
@ 2010-12-14 1:30 ` Gui Jianfeng
0 siblings, 0 replies; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-14 1:30 UTC
To: Vivek Goyal
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Vivek Goyal wrote:
> On Mon, Dec 13, 2010 at 09:44:24AM +0800, Gui Jianfeng wrote:
>> Introduce cfq_entity for CFQ queue
>>
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>> ---
>> block/cfq-iosched.c | 125 +++++++++++++++++++++++++++++++++-----------------
>> 1 files changed, 82 insertions(+), 43 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index 5d0349d..9b07a24 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -91,20 +91,31 @@ struct cfq_rb_root {
>> #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
>> .count = 0, .min_vdisktime = 0, }
>>
>> +
>> +/*
>> + * This is the CFQ queue scheduling entity, which is scheduled on a service tree.
>> + */
>> +struct cfq_entity {
>> + /* service tree */
>> + struct cfq_rb_root *service_tree;
>> + /* service_tree member */
>> + struct rb_node rb_node;
>> + /* service_tree key, represent the position on the tree */
>> + unsigned long rb_key;
>> +};
>> +
>> /*
>> * Per process-grouping structure
>> */
>> struct cfq_queue {
>> + /* The schedule entity */
>> + struct cfq_entity cfqe;
>> /* reference count */
>> atomic_t ref;
>> /* various state flags, see below */
>> unsigned int flags;
>> /* parent cfq_data */
>> struct cfq_data *cfqd;
>> - /* service_tree member */
>> - struct rb_node rb_node;
>> - /* service_tree key */
>> - unsigned long rb_key;
>> /* prio tree member */
>> struct rb_node p_node;
>> /* prio tree root we belong to, if any */
>> @@ -143,7 +154,6 @@ struct cfq_queue {
>> u32 seek_history;
>> sector_t last_request_pos;
>>
>> - struct cfq_rb_root *service_tree;
>> struct cfq_queue *new_cfqq;
>> struct cfq_group *cfqg;
>> struct cfq_group *orig_cfqg;
>> @@ -302,6 +312,15 @@ struct cfq_data {
>> struct rcu_head rcu;
>> };
>>
>> +static inline struct cfq_queue *
>> +cfqq_of_entity(struct cfq_entity *cfqe)
>> +{
>> + if (cfqe)
>> + return container_of(cfqe, struct cfq_queue,
>> + cfqe);
>> + return NULL;
>> +}
>> +
>> static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>>
>> static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>> @@ -743,7 +762,7 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
>> /*
>> * The below is leftmost cache rbtree addon
>> */
>> -static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
>> +static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
>> {
>> /* Service tree is empty */
>> if (!root->count)
>> @@ -753,7 +772,7 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
>> root->left = rb_first(&root->rb);
>>
>> if (root->left)
>> - return rb_entry(root->left, struct cfq_queue, rb_node);
>> + return rb_entry(root->left, struct cfq_entity, rb_node);
>>
>> return NULL;
>> }
>> @@ -1170,21 +1189,24 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
>> static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> bool add_front)
>> {
>> + struct cfq_entity *cfqe;
>> struct rb_node **p, *parent;
>> - struct cfq_queue *__cfqq;
>> + struct cfq_entity *__cfqe;
>> unsigned long rb_key;
>> struct cfq_rb_root *service_tree;
>> int left;
>> int new_cfqq = 1;
>> int group_changed = 0;
>>
>> + cfqe = &cfqq->cfqe;
>> +
>> #ifdef CONFIG_CFQ_GROUP_IOSCHED
>> if (!cfqd->cfq_group_isolation
>> && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
>> && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
>> /* Move this cfq to root group */
>> cfq_log_cfqq(cfqd, cfqq, "moving to root group");
>> - if (!RB_EMPTY_NODE(&cfqq->rb_node))
>> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> cfq_group_service_tree_del(cfqd, cfqq->cfqg);
>> cfqq->orig_cfqg = cfqq->cfqg;
>> cfqq->cfqg = &cfqd->root_group;
>> @@ -1194,7 +1216,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
>> /* cfqq is sequential now needs to go to its original group */
>> BUG_ON(cfqq->cfqg != &cfqd->root_group);
>> - if (!RB_EMPTY_NODE(&cfqq->rb_node))
>> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> cfq_group_service_tree_del(cfqd, cfqq->cfqg);
>> cfq_put_cfqg(cfqq->cfqg);
>> cfqq->cfqg = cfqq->orig_cfqg;
>> @@ -1209,9 +1231,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> if (cfq_class_idle(cfqq)) {
>> rb_key = CFQ_IDLE_DELAY;
>> parent = rb_last(&service_tree->rb);
>> - if (parent && parent != &cfqq->rb_node) {
>> - __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
>> - rb_key += __cfqq->rb_key;
>> + if (parent && parent != &cfqe->rb_node) {
>> + __cfqe = rb_entry(parent,
>> + struct cfq_entity,
>> + rb_node);
>
> Above can fit into a single line or at max two lines?
I replaced io_sched_entity with cfq_entity automatically, but forgot to
check the resulting line lengths. :(
Will change all.
Thanks,
Gui
>
>> + rb_key += __cfqe->rb_key;
>> } else
>> rb_key += jiffies;
>> } else if (!add_front) {
>> @@ -1226,37 +1250,39 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> cfqq->slice_resid = 0;
>> } else {
>> rb_key = -HZ;
>> - __cfqq = cfq_rb_first(service_tree);
>> - rb_key += __cfqq ? __cfqq->rb_key : jiffies;
>> + __cfqe = cfq_rb_first(service_tree);
>> + rb_key += __cfqe ? __cfqe->rb_key : jiffies;
>> }
>>
>> - if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
>> + if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>> new_cfqq = 0;
>> /*
>> * same position, nothing more to do
>> */
>> - if (rb_key == cfqq->rb_key &&
>> - cfqq->service_tree == service_tree)
>> + if (rb_key == cfqe->rb_key &&
>> + cfqe->service_tree == service_tree)
>> return;
>>
>> - cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
>> - cfqq->service_tree = NULL;
>> + cfq_rb_erase(&cfqe->rb_node,
>> + cfqe->service_tree);
>
> Above can fit on single line?
>
>> + cfqe->service_tree = NULL;
>> }
>>
>> left = 1;
>> parent = NULL;
>> - cfqq->service_tree = service_tree;
>> + cfqe->service_tree = service_tree;
>> p = &service_tree->rb.rb_node;
>> while (*p) {
>> struct rb_node **n;
>>
>> parent = *p;
>> - __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
>> + __cfqe = rb_entry(parent, struct cfq_entity,
>> + rb_node);
>
> Single line.
>
>>
>> /*
>> * sort by key, that represents service time.
>> */
>> - if (time_before(rb_key, __cfqq->rb_key))
>> + if (time_before(rb_key, __cfqe->rb_key))
>> n = &(*p)->rb_left;
>> else {
>> n = &(*p)->rb_right;
>> @@ -1267,11 +1293,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> }
>>
>> if (left)
>> - service_tree->left = &cfqq->rb_node;
>> + service_tree->left = &cfqe->rb_node;
>>
>> - cfqq->rb_key = rb_key;
>> - rb_link_node(&cfqq->rb_node, parent, p);
>> - rb_insert_color(&cfqq->rb_node, &service_tree->rb);
>> + cfqe->rb_key = rb_key;
>> + rb_link_node(&cfqe->rb_node, parent, p);
>> + rb_insert_color(&cfqe->rb_node, &service_tree->rb);
>> service_tree->count++;
>> if ((add_front || !new_cfqq) && !group_changed)
>> return;
>> @@ -1373,13 +1399,17 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>> */
>> static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>> {
>> + struct cfq_entity *cfqe;
>> cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
>> BUG_ON(!cfq_cfqq_on_rr(cfqq));
>> cfq_clear_cfqq_on_rr(cfqq);
>>
>> - if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
>> - cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
>> - cfqq->service_tree = NULL;
>> + cfqe = &cfqq->cfqe;
>> +
>> + if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>> + cfq_rb_erase(&cfqe->rb_node,
>> + cfqe->service_tree);
>
> Single line above.
>
>
>> + cfqe->service_tree = NULL;
>> }
>> if (cfqq->p_root) {
>> rb_erase(&cfqq->p_node, cfqq->p_root);
>> @@ -1707,13 +1737,13 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
>> return NULL;
>> if (RB_EMPTY_ROOT(&service_tree->rb))
>> return NULL;
>> - return cfq_rb_first(service_tree);
>> + return cfqq_of_entity(cfq_rb_first(service_tree));
>> }
>>
>> static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
>> {
>> struct cfq_group *cfqg;
>> - struct cfq_queue *cfqq;
>> + struct cfq_entity *cfqe;
>> int i, j;
>> struct cfq_rb_root *st;
>>
>> @@ -1724,9 +1754,11 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
>> if (!cfqg)
>> return NULL;
>>
>> - for_each_cfqg_st(cfqg, i, j, st)
>> - if ((cfqq = cfq_rb_first(st)) != NULL)
>> - return cfqq;
>> + for_each_cfqg_st(cfqg, i, j, st) {
>> + cfqe = cfq_rb_first(st);
>> + if (cfqe != NULL)
>> + return cfqq_of_entity(cfqe);
>> + }
>> return NULL;
>> }
>>
>> @@ -1863,9 +1895,12 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
>>
>> static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>> {
>> + struct cfq_entity *cfqe;
>> enum wl_prio_t prio = cfqq_prio(cfqq);
>> - struct cfq_rb_root *service_tree = cfqq->service_tree;
>> + struct cfq_rb_root *service_tree;
>>
>> + cfqe = &cfqq->cfqe;
>> + service_tree = cfqe->service_tree;
>> BUG_ON(!service_tree);
>> BUG_ON(!service_tree->count);
>>
>> @@ -2075,7 +2110,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
>> static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> struct cfq_group *cfqg, enum wl_prio_t prio)
>> {
>> - struct cfq_queue *queue;
>> + struct cfq_entity *cfqe;
>> int i;
>> bool key_valid = false;
>> unsigned long lowest_key = 0;
>> @@ -2083,10 +2118,11 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>>
>> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>> /* select the one with lowest rb_key */
>> - queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> - if (queue &&
>> - (!key_valid || time_before(queue->rb_key, lowest_key))) {
>> - lowest_key = queue->rb_key;
>> + cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> + if (cfqe &&
>> + (!key_valid ||
>> + time_before(cfqe->rb_key, lowest_key))) {
>
> Merge two lines into one above.
>
>> + lowest_key = cfqe->rb_key;
>> cur_best = i;
>> key_valid = true;
>> }
>> @@ -2834,7 +2870,10 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc)
>> static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> pid_t pid, bool is_sync)
>> {
>> - RB_CLEAR_NODE(&cfqq->rb_node);
>> + struct cfq_entity *cfqe;
>> +
>> + cfqe = &cfqq->cfqe;
>> + RB_CLEAR_NODE(&cfqe->rb_node);
>> RB_CLEAR_NODE(&cfqq->p_node);
>> INIT_LIST_HEAD(&cfqq->fifo);
>>
>> @@ -3243,7 +3282,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
>> /* Allow preemption only if we are idling on sync-noidle tree */
>> if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
>> cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
>> - new_cfqq->service_tree->count == 2 &&
>> + new_cfqq->cfqe.service_tree->count == 2 &&
>> RB_EMPTY_ROOT(&cfqq->sort_list))
>> return true;
>>
>
> Apart from above minor nits, this patch looks good to me.
>
> Acked-by: Vivek Goyal <vgoyal@redhat.com>
>
> Vivek
--
Regards
Gui Jianfeng
* [PATCH 2/8 v2] cfq-iosched: Introduce cfq_entity for CFQ group
[not found] ` <4D01C6AB.9040807@cn.fujitsu.com>
2010-12-13 1:44 ` [PATCH 1/8 v2] cfq-iosched: Introduce cfq_entity for CFQ queue Gui Jianfeng
@ 2010-12-13 1:44 ` Gui Jianfeng
2010-12-13 16:59 ` Vivek Goyal
2010-12-13 1:44 ` [PATCH 3/8 v2] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
` (5 subsequent siblings)
7 siblings, 1 reply; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-13 1:44 UTC
To: Jens Axboe, Vivek Goyal
Cc: Gui Jianfeng, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Introduce cfq_entity for CFQ group
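The key point is that queues and groups now embed the same cfq_entity, and
the new is_group_entity flag tells the two apart when an entity is taken off
a shared tree. A minimal userspace sketch of that discrimination (simplified
stand-ins for the kernel structures, container_of() open-coded):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct cfq_entity {
	bool is_group_entity;	/* discriminates the container type */
	unsigned int weight;
};

struct cfq_queue {
	struct cfq_entity cfqe;
	int pid;
};

struct cfq_group {
	struct cfq_entity cfqe;
	int nr_cfqq;
};

static struct cfq_queue *cfqq_of_entity(struct cfq_entity *cfqe)
{
	if (cfqe && !cfqe->is_group_entity)
		return container_of(cfqe, struct cfq_queue, cfqe);
	return NULL;	/* a group entity, or no entity at all */
}

static struct cfq_group *cfqg_of_entity(struct cfq_entity *cfqe)
{
	if (cfqe && cfqe->is_group_entity)
		return container_of(cfqe, struct cfq_group, cfqe);
	return NULL;	/* a queue entity, or no entity at all */
}

int main(void)
{
	struct cfq_queue cfqq = { .cfqe = { .is_group_entity = false } };
	struct cfq_group cfqg = { .cfqe = { .is_group_entity = true } };

	/* Each helper refuses entities of the other type. */
	printf("queue? %d, group? %d\n",
	       cfqq_of_entity(&cfqq.cfqe) != NULL,
	       cfqg_of_entity(&cfqq.cfqe) != NULL);	/* 1, 0 */
	printf("queue? %d, group? %d\n",
	       cfqq_of_entity(&cfqg.cfqe) != NULL,
	       cfqg_of_entity(&cfqg.cfqe) != NULL);	/* 0, 1 */
	return 0;
}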
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
block/cfq-iosched.c | 113 ++++++++++++++++++++++++++++++--------------------
1 files changed, 68 insertions(+), 45 deletions(-)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9b07a24..91e9833 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1,5 +1,5 @@
/*
- * CFQ, or complete fairness queueing, disk scheduler.
+ * Cfq, or complete fairness queueing, disk scheduler.
*
* Based on ideas from a previously unfinished io
* scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
@@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida);
#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
#define sample_valid(samples) ((samples) > 80)
-#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
+#define rb_entry_entity(node) rb_entry((node), struct cfq_entity,\
+ rb_node)
/*
* Most of our rbtree usage is for sorting with min extraction, so
@@ -102,6 +103,11 @@ struct cfq_entity {
struct rb_node rb_node;
/* service_tree key, represent the position on the tree */
unsigned long rb_key;
+
+ /* group service_tree key */
+ u64 vdisktime;
+ bool is_group_entity;
+ unsigned int weight;
};
/*
@@ -183,12 +189,8 @@ enum wl_type_t {
/* This is per cgroup per device grouping structure */
struct cfq_group {
- /* group service_tree member */
- struct rb_node rb_node;
-
- /* group service_tree key */
- u64 vdisktime;
- unsigned int weight;
+ /* cfq group sched entity */
+ struct cfq_entity cfqe;
/* number of cfqq currently on this group */
int nr_cfqq;
@@ -315,12 +317,21 @@ struct cfq_data {
static inline struct cfq_queue *
cfqq_of_entity(struct cfq_entity *cfqe)
{
- if (cfqe)
+ if (cfqe && !cfqe->is_group_entity)
return container_of(cfqe, struct cfq_queue,
cfqe);
return NULL;
}
+static inline struct cfq_group *
+cfqg_of_entity(struct cfq_entity *cfqe)
+{
+ if (cfqe && cfqe->is_group_entity)
+ return container_of(cfqe, struct cfq_group,
+ cfqe);
+ return NULL;
+}
+
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
@@ -548,12 +559,12 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}
-static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
+static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_entity *cfqe)
{
u64 d = delta << CFQ_SERVICE_SHIFT;
d = d * BLKIO_WEIGHT_DEFAULT;
- do_div(d, cfqg->weight);
+ do_div(d, cfqe->weight);
return d;
}
@@ -578,11 +589,11 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
static void update_min_vdisktime(struct cfq_rb_root *st)
{
u64 vdisktime = st->min_vdisktime;
- struct cfq_group *cfqg;
+ struct cfq_entity *cfqe;
if (st->left) {
- cfqg = rb_entry_cfqg(st->left);
- vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
+ cfqe = rb_entry_entity(st->left);
+ vdisktime = min_vdisktime(vdisktime, cfqe->vdisktime);
}
st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
@@ -613,8 +624,9 @@ static inline unsigned
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
+ struct cfq_entity *cfqe = &cfqg->cfqe;
- return cfq_target_latency * cfqg->weight / st->total_weight;
+ return cfq_target_latency * cfqe->weight / st->total_weight;
}
static inline void
@@ -777,13 +789,13 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
return NULL;
}
-static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
+static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
{
if (!root->left)
root->left = rb_first(&root->rb);
if (root->left)
- return rb_entry_cfqg(root->left);
+ return rb_entry_entity(root->left);
return NULL;
}
@@ -840,9 +852,9 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
}
static inline s64
-cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
+entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
{
- return cfqg->vdisktime - st->min_vdisktime;
+ return entity->vdisktime - st->min_vdisktime;
}
static void
@@ -850,15 +862,16 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
struct rb_node **node = &st->rb.rb_node;
struct rb_node *parent = NULL;
- struct cfq_group *__cfqg;
- s64 key = cfqg_key(st, cfqg);
+ struct cfq_entity *__cfqe;
+ struct cfq_entity *cfqe = &cfqg->cfqe;
+ s64 key = entity_key(st, cfqe);
int left = 1;
while (*node != NULL) {
parent = *node;
- __cfqg = rb_entry_cfqg(parent);
+ __cfqe = rb_entry_entity(parent);
- if (key < cfqg_key(st, __cfqg))
+ if (key < entity_key(st, __cfqe))
node = &parent->rb_left;
else {
node = &parent->rb_right;
@@ -867,21 +880,22 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
}
if (left)
- st->left = &cfqg->rb_node;
+ st->left = &cfqe->rb_node;
- rb_link_node(&cfqg->rb_node, parent, node);
- rb_insert_color(&cfqg->rb_node, &st->rb);
+ rb_link_node(&cfqe->rb_node, parent, node);
+ rb_insert_color(&cfqe->rb_node, &st->rb);
}
static void
cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
- struct cfq_group *__cfqg;
+ struct cfq_entity *cfqe = &cfqg->cfqe;
+ struct cfq_entity *__cfqe;
struct rb_node *n;
cfqg->nr_cfqq++;
- if (!RB_EMPTY_NODE(&cfqg->rb_node))
+ if (!RB_EMPTY_NODE(&cfqe->rb_node))
return;
/*
@@ -891,19 +905,20 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
*/
n = rb_last(&st->rb);
if (n) {
- __cfqg = rb_entry_cfqg(n);
- cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
+ __cfqe = rb_entry_entity(n);
+ cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
} else
- cfqg->vdisktime = st->min_vdisktime;
+ cfqe->vdisktime = st->min_vdisktime;
__cfq_group_service_tree_add(st, cfqg);
- st->total_weight += cfqg->weight;
+ st->total_weight += cfqe->weight;
}
static void
cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
+ struct cfq_entity *cfqe = &cfqg->cfqe;
BUG_ON(cfqg->nr_cfqq < 1);
cfqg->nr_cfqq--;
@@ -913,9 +928,9 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
return;
cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
- st->total_weight -= cfqg->weight;
- if (!RB_EMPTY_NODE(&cfqg->rb_node))
- cfq_rb_erase(&cfqg->rb_node, st);
+ st->total_weight -= cfqe->weight;
+ if (!RB_EMPTY_NODE(&cfqe->rb_node))
+ cfq_rb_erase(&cfqe->rb_node, st);
cfqg->saved_workload_slice = 0;
cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
}
@@ -953,6 +968,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
unsigned int used_sl, charge;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count;
+ struct cfq_entity *cfqe = &cfqg->cfqe;
BUG_ON(nr_sync < 0);
used_sl = charge = cfq_cfqq_slice_usage(cfqq);
@@ -963,8 +979,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
charge = cfqq->allocated_slice;
/* Can't update vdisktime while group is on service tree */
- cfq_rb_erase(&cfqg->rb_node, st);
- cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
+ cfq_rb_erase(&cfqe->rb_node, st);
+ cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
__cfq_group_service_tree_add(st, cfqg);
/* This group is being expired. Save the context */
@@ -976,8 +992,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
} else
cfqg->saved_workload_slice = 0;
- cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
- st->min_vdisktime);
+ cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
+ cfqe->vdisktime, st->min_vdisktime);
cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
" sect=%u", used_sl, cfqq->slice_dispatch, charge,
iops_mode(cfqd), cfqq->nr_sectors);
@@ -996,7 +1012,7 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
unsigned int weight)
{
- cfqg_of_blkg(blkg)->weight = weight;
+ cfqg_of_blkg(blkg)->cfqe.weight = weight;
}
static struct cfq_group *
@@ -1025,7 +1041,9 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
- RB_CLEAR_NODE(&cfqg->rb_node);
+ RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
+
+ cfqg->cfqe.is_group_entity = true;
/*
* Take the initial reference that will be released on destroy
@@ -1049,7 +1067,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
0);
- cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+ cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
/* Add group on cfqd list */
hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
@@ -2216,10 +2234,13 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_group *cfqg;
+ struct cfq_entity *cfqe;
if (RB_EMPTY_ROOT(&st->rb))
return NULL;
- cfqg = cfq_rb_first_group(st);
+ cfqe = cfq_rb_first_entity(st);
+ cfqg = cfqg_of_entity(cfqe);
+ BUG_ON(!cfqg);
update_min_vdisktime(st);
return cfqg;
}
@@ -2877,6 +2898,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
RB_CLEAR_NODE(&cfqq->p_node);
INIT_LIST_HEAD(&cfqq->fifo);
+ cfqe->is_group_entity = false;
atomic_set(&cfqq->ref, 0);
cfqq->cfqd = cfqd;
@@ -3909,10 +3931,11 @@ static void *cfq_init_queue(struct request_queue *q)
cfqg = &cfqd->root_group;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
- RB_CLEAR_NODE(&cfqg->rb_node);
+ RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
/* Give preference to root group over other groups */
- cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
+ cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
+ cfqg->cfqe.is_group_entity = true;
#ifdef CONFIG_CFQ_GROUP_IOSCHED
/*
--
1.6.5.2
* Re: [PATCH 2/8 v2] cfq-iosched: Introduce cfq_entity for CFQ group
2010-12-13 1:44 ` [PATCH 2/8 v2] cfq-iosched: Introduce cfq_entity for CFQ group Gui Jianfeng
@ 2010-12-13 16:59 ` Vivek Goyal
2010-12-14 1:33 ` Gui Jianfeng
2010-12-14 1:47 ` Gui Jianfeng
0 siblings, 2 replies; 41+ messages in thread
From: Vivek Goyal @ 2010-12-13 16:59 UTC
To: Gui Jianfeng
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
On Mon, Dec 13, 2010 at 09:44:33AM +0800, Gui Jianfeng wrote:
> Introduce cfq_entity for CFQ group
>
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> ---
> block/cfq-iosched.c | 113 ++++++++++++++++++++++++++++++--------------------
> 1 files changed, 68 insertions(+), 45 deletions(-)
>
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 9b07a24..91e9833 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -1,5 +1,5 @@
> /*
> - * CFQ, or complete fairness queueing, disk scheduler.
> + * Cfq, or complete fairness queueing, disk scheduler.
Is this really required?
> *
> * Based on ideas from a previously unfinished io
> * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
> @@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida);
> #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
>
> #define sample_valid(samples) ((samples) > 80)
> -#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
> +#define rb_entry_entity(node) rb_entry((node), struct cfq_entity,\
> + rb_node)
>
> /*
> * Most of our rbtree usage is for sorting with min extraction, so
> @@ -102,6 +103,11 @@ struct cfq_entity {
> struct rb_node rb_node;
> /* service_tree key, represent the position on the tree */
> unsigned long rb_key;
> +
> + /* group service_tree key */
> + u64 vdisktime;
> + bool is_group_entity;
> + unsigned int weight;
> };
>
> /*
> @@ -183,12 +189,8 @@ enum wl_type_t {
>
> /* This is per cgroup per device grouping structure */
> struct cfq_group {
> - /* group service_tree member */
> - struct rb_node rb_node;
> -
> - /* group service_tree key */
> - u64 vdisktime;
> - unsigned int weight;
> + /* cfq group sched entity */
> + struct cfq_entity cfqe;
>
> /* number of cfqq currently on this group */
> int nr_cfqq;
> @@ -315,12 +317,21 @@ struct cfq_data {
> static inline struct cfq_queue *
> cfqq_of_entity(struct cfq_entity *cfqe)
> {
> - if (cfqe)
> + if (cfqe && !cfqe->is_group_entity)
> return container_of(cfqe, struct cfq_queue,
> cfqe);
Can be a single line above. I think this came from the previous patch.
> return NULL;
> }
>
> +static inline struct cfq_group *
> +cfqg_of_entity(struct cfq_entity *cfqe)
> +{
> + if (cfqe && cfqe->is_group_entity)
> + return container_of(cfqe, struct cfq_group,
> + cfqe);
No need to split line.
> + return NULL;
> +}
> +
> static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>
> static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
> @@ -548,12 +559,12 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
> }
>
> -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
> +static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_entity *cfqe)
> {
> u64 d = delta << CFQ_SERVICE_SHIFT;
>
> d = d * BLKIO_WEIGHT_DEFAULT;
> - do_div(d, cfqg->weight);
> + do_div(d, cfqe->weight);
> return d;
> }
>
> @@ -578,11 +589,11 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
> static void update_min_vdisktime(struct cfq_rb_root *st)
> {
> u64 vdisktime = st->min_vdisktime;
> - struct cfq_group *cfqg;
> + struct cfq_entity *cfqe;
>
> if (st->left) {
> - cfqg = rb_entry_cfqg(st->left);
> - vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
> + cfqe = rb_entry_entity(st->left);
> + vdisktime = min_vdisktime(vdisktime, cfqe->vdisktime);
> }
>
> st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
> @@ -613,8 +624,9 @@ static inline unsigned
> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> struct cfq_rb_root *st = &cfqd->grp_service_tree;
> + struct cfq_entity *cfqe = &cfqg->cfqe;
>
> - return cfq_target_latency * cfqg->weight / st->total_weight;
> + return cfq_target_latency * cfqe->weight / st->total_weight;
> }
>
> static inline void
> @@ -777,13 +789,13 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
> return NULL;
> }
>
> -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
> +static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
So now we have two functions, cfq_rb_first() and cfq_rb_first_entity(),
both returning a cfq_entity *? This is confusing. Or are you getting rid of
one in later patches? Why not make use of the existing cfq_rb_first()?
Thanks
Vivek
* Re: [PATCH 2/8 v2] cfq-iosched: Introduce cfq_entity for CFQ group
2010-12-13 16:59 ` Vivek Goyal
@ 2010-12-14 1:33 ` Gui Jianfeng
2010-12-14 1:47 ` Gui Jianfeng
1 sibling, 0 replies; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-14 1:33 UTC
To: Vivek Goyal
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Vivek Goyal wrote:
> On Mon, Dec 13, 2010 at 09:44:33AM +0800, Gui Jianfeng wrote:
>> Introduce cfq_entity for CFQ group
>>
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>> ---
>> block/cfq-iosched.c | 113 ++++++++++++++++++++++++++++++--------------------
>> 1 files changed, 68 insertions(+), 45 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index 9b07a24..91e9833 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -1,5 +1,5 @@
>> /*
>> - * CFQ, or complete fairness queueing, disk scheduler.
>> + * Cfq, or complete fairness queueing, disk scheduler.
>
> Is this really required?
>
>> *
>> * Based on ideas from a previously unfinished io
>> * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
>> @@ -73,7 +73,8 @@ static DEFINE_IDA(cic_index_ida);
>> #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
>>
>> #define sample_valid(samples) ((samples) > 80)
>> -#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
>> +#define rb_entry_entity(node) rb_entry((node), struct cfq_entity,\
>> + rb_node)
>>
>> /*
>> * Most of our rbtree usage is for sorting with min extraction, so
>> @@ -102,6 +103,11 @@ struct cfq_entity {
>> struct rb_node rb_node;
>> /* service_tree key, represent the position on the tree */
>> unsigned long rb_key;
>> +
>> + /* group service_tree key */
>> + u64 vdisktime;
>> + bool is_group_entity;
>> + unsigned int weight;
>> };
>>
>> /*
>> @@ -183,12 +189,8 @@ enum wl_type_t {
>>
>> /* This is per cgroup per device grouping structure */
>> struct cfq_group {
>> - /* group service_tree member */
>> - struct rb_node rb_node;
>> -
>> - /* group service_tree key */
>> - u64 vdisktime;
>> - unsigned int weight;
>> + /* cfq group sched entity */
>> + struct cfq_entity cfqe;
>>
>> /* number of cfqq currently on this group */
>> int nr_cfqq;
>> @@ -315,12 +317,21 @@ struct cfq_data {
>> static inline struct cfq_queue *
>> cfqq_of_entity(struct cfq_entity *cfqe)
>> {
>> - if (cfqe)
>> + if (cfqe && !cfqe->is_group_entity)
>> return container_of(cfqe, struct cfq_queue,
>> cfqe);
>
> Can be a single line above. I think this came from the previous patch.
>
>> return NULL;
>> }
>>
>> +static inline struct cfq_group *
>> +cfqg_of_entity(struct cfq_entity *cfqe)
>> +{
>> + if (cfqe && cfqe->is_group_entity)
>> + return container_of(cfqe, struct cfq_group,
>> + cfqe);
>
> No need to split line.
>
>> + return NULL;
>> +}
>> +
>> static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>>
>> static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>> @@ -548,12 +559,12 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>> return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
>> }
>>
>> -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
>> +static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_entity *cfqe)
>> {
>> u64 d = delta << CFQ_SERVICE_SHIFT;
>>
>> d = d * BLKIO_WEIGHT_DEFAULT;
>> - do_div(d, cfqg->weight);
>> + do_div(d, cfqe->weight);
>> return d;
>> }
>>
>> @@ -578,11 +589,11 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
>> static void update_min_vdisktime(struct cfq_rb_root *st)
>> {
>> u64 vdisktime = st->min_vdisktime;
>> - struct cfq_group *cfqg;
>> + struct cfq_entity *cfqe;
>>
>> if (st->left) {
>> - cfqg = rb_entry_cfqg(st->left);
>> - vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
>> + cfqe = rb_entry_entity(st->left);
>> + vdisktime = min_vdisktime(vdisktime, cfqe->vdisktime);
>> }
>>
>> st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
>> @@ -613,8 +624,9 @@ static inline unsigned
>> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> + struct cfq_entity *cfqe = &cfqg->cfqe;
>>
>> - return cfq_target_latency * cfqg->weight / st->total_weight;
>> + return cfq_target_latency * cfqe->weight / st->total_weight;
>> }
>>
>> static inline void
>> @@ -777,13 +789,13 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
>> return NULL;
>> }
>>
>> -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
>> +static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>
> So now we have two functions, cfq_rb_first() and cfq_rb_first_entity(),
> both returning a cfq_entity *? This is confusing. Or are you getting rid of
> one in later patches? Why not make use of the existing cfq_rb_first()?
Yes, I get rid of cfq_rb_first_entity() in a later patch.
Thanks,
Gui
>
> Thanks
> Vivek
>
--
Regards
Gui Jianfeng
* Re: [PATCH 2/8 v2] cfq-iosched: Introduce cfq_entity for CFQ group
2010-12-13 16:59 ` Vivek Goyal
2010-12-14 1:33 ` Gui Jianfeng
@ 2010-12-14 1:47 ` Gui Jianfeng
1 sibling, 0 replies; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-14 1:47 UTC
To: Vivek Goyal
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Vivek Goyal wrote:
> On Mon, Dec 13, 2010 at 09:44:33AM +0800, Gui Jianfeng wrote:
>> Introduce cfq_entity for CFQ group
>>
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>> ---
>> block/cfq-iosched.c | 113 ++++++++++++++++++++++++++++++--------------------
>> 1 files changed, 68 insertions(+), 45 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index 9b07a24..91e9833 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -1,5 +1,5 @@
>> /*
>> - * CFQ, or complete fairness queueing, disk scheduler.
>> + * Cfq, or complete fairness queueing, disk scheduler.
>
> Is this really required?
Strange... I don't remember making this change. It must have been an accidental edit.
Gui
* [PATCH 3/8 v2] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
[not found] ` <4D01C6AB.9040807@cn.fujitsu.com>
2010-12-13 1:44 ` [PATCH 1/8 v2] cfq-iosched: Introduce cfq_entity for CFQ queue Gui Jianfeng
2010-12-13 1:44 ` [PATCH 2/8 v2] cfq-iosched: Introduce cfq_entity for CFQ group Gui Jianfeng
@ 2010-12-13 1:44 ` Gui Jianfeng
2010-12-13 16:59 ` Vivek Goyal
2010-12-13 1:44 ` [PATCH 4/8 v2] cfq-iosched: Extract some common code of service tree handling for CFQ queue and CFQ group Gui Jianfeng
` (4 subsequent siblings)
7 siblings, 1 reply; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-13 1:44 UTC
To: Jens Axboe, Vivek Goyal
Cc: Gui Jianfeng, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Introduce vdisktime and io weight for CFQ queue scheduling. Io priority now
maps to the weight range [100,1000]. This patch also gets rid of the
cfq_slice_offset() logic and uses the same scheduling algorithm as CFQ
groups, while still giving a newly added cfqq a small vdisktime boost
according to its ioprio. This will help schedule CFQ queues and groups on
the same service tree.
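To make the mapping concrete, here is the new ioprio-to-weight conversion as
a standalone sketch; the constants assume BLKIO_WEIGHT_MIN = 100,
BLKIO_WEIGHT_MAX = 1000 and IOPRIO_BE_NR = 8, matching this series:

#include <stdio.h>

#define BLKIO_WEIGHT_MIN	100
#define BLKIO_WEIGHT_MAX	1000
#define IOPRIO_BE_NR		8

/* Map io priority (7 ~ 0) to io weight (100 ~ 1000), as in the patch. */
static unsigned int cfq_prio_to_weight(unsigned short ioprio)
{
	unsigned int step;

	step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
	if (ioprio == 0)
		return BLKIO_WEIGHT_MAX;	/* highest prio pinned to max */

	return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
}

int main(void)
{
	unsigned short ioprio;

	/* step = 900 / 7 = 128: prio 7..1 -> 100, 228, ..., 868; prio 0 -> 1000 */
	for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %u -> weight %u\n", ioprio,
		       cfq_prio_to_weight(ioprio));
	return 0;
}

With these weights, the cfq_get_boost() helper added below divides by
(BLKIO_WEIGHT_MAX - weight + BLKIO_WEIGHT_MIN), so a higher-weight queue
gets a larger amount subtracted from its vdisktime and starts closer to the
front of the service tree.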
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
block/cfq-iosched.c | 196 ++++++++++++++++++++++++++++++++++++---------------
1 files changed, 139 insertions(+), 57 deletions(-)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 91e9833..30d19c0 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -101,10 +101,7 @@ struct cfq_entity {
struct cfq_rb_root *service_tree;
/* service_tree member */
struct rb_node rb_node;
- /* service_tree key, represent the position on the tree */
- unsigned long rb_key;
-
- /* group service_tree key */
+ /* service_tree key */
u64 vdisktime;
bool is_group_entity;
unsigned int weight;
@@ -116,6 +113,8 @@ struct cfq_entity {
struct cfq_queue {
/* The schedule entity */
struct cfq_entity cfqe;
+ /* Reposition time */
+ unsigned long reposition_time;
/* reference count */
atomic_t ref;
/* various state flags, see below */
@@ -314,6 +313,22 @@ struct cfq_data {
struct rcu_head rcu;
};
+/*
+ * Map io priority(7 ~ 0) to io weight(100 ~ 1000)
+ */
+static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
+{
+ unsigned int step;
+
+ BUG_ON(ioprio >= IOPRIO_BE_NR);
+
+ step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
+ if (ioprio == 0)
+ return BLKIO_WEIGHT_MAX;
+
+ return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
+}
+
static inline struct cfq_queue *
cfqq_of_entity(struct cfq_entity *cfqe)
{
@@ -841,16 +856,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
}
-static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
- struct cfq_queue *cfqq)
-{
- /*
- * just an approximation, should be ok.
- */
- return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
- cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
-}
-
static inline s64
entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
{
@@ -1199,6 +1204,16 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
#endif /* GROUP_IOSCHED */
+static inline u64 cfq_get_boost(struct cfq_data *cfqd,
+ struct cfq_entity *cfqe)
+{
+ u64 d = cfqd->cfq_slice[1] << CFQ_SERVICE_SHIFT;
+
+ d = d * BLKIO_WEIGHT_DEFAULT;
+ do_div(d, BLKIO_WEIGHT_MAX - cfqe->weight + BLKIO_WEIGHT_MIN);
+ return d;
+}
+
/*
* The cfqd->service_trees holds all pending cfq_queue's that have
* requests waiting to be processed. It is sorted in the order that
@@ -1210,13 +1225,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct cfq_entity *cfqe;
struct rb_node **p, *parent;
struct cfq_entity *__cfqe;
- unsigned long rb_key;
- struct cfq_rb_root *service_tree;
+ struct cfq_rb_root *service_tree, *orig_st;
int left;
int new_cfqq = 1;
int group_changed = 0;
+ s64 key;
cfqe = &cfqq->cfqe;
+ orig_st = cfqe->service_tree;
#ifdef CONFIG_CFQ_GROUP_IOSCHED
if (!cfqd->cfq_group_isolation
@@ -1224,8 +1240,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
&& cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
/* Move this cfq to root group */
cfq_log_cfqq(cfqd, cfqq, "moving to root group");
- if (!RB_EMPTY_NODE(&cfqe->rb_node))
+ if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+ /*
+ * Group changed, dequeue this CFQ queue from the
+ * original service tree.
+ */
+ cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+ orig_st->total_weight -= cfqe->weight;
+ }
cfqq->orig_cfqg = cfqq->cfqg;
cfqq->cfqg = &cfqd->root_group;
atomic_inc(&cfqd->root_group.ref);
@@ -1234,8 +1257,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
&& cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
/* cfqq is sequential now needs to go to its original group */
BUG_ON(cfqq->cfqg != &cfqd->root_group);
- if (!RB_EMPTY_NODE(&cfqe->rb_node))
+ if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+ /*
+ * Group changed, dequeue this CFQ queue from the
+ * original service tree.
+ */
+ cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+ orig_st->total_weight -= cfqe->weight;
+ }
cfq_put_cfqg(cfqq->cfqg);
cfqq->cfqg = cfqq->orig_cfqg;
cfqq->orig_cfqg = NULL;
@@ -1246,50 +1276,73 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
cfqq_type(cfqq));
+ /*
+ * For the time being, put the newly added CFQ queue at the end of the
+ * service tree.
+ */
+ if (RB_EMPTY_NODE(&cfqe->rb_node)) {
+ /*
+ * If this CFQ queue moves to another group, the original
+ * vdisktime makes no sense any more, so reset the vdisktime
+ * here.
+ */
+ parent = rb_last(&service_tree->rb);
+ if (parent) {
+ u64 boost;
+ s64 __vdisktime;
+
+ __cfqe = rb_entry_entity(parent);
+ cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
+
+ /* Give some vdisktime boost according to its weight */
+ boost = cfq_get_boost(cfqd, cfqe);
+ __vdisktime = cfqe->vdisktime - boost;
+ if (__vdisktime)
+ cfqe->vdisktime = __vdisktime;
+ else
+ cfqe->vdisktime = 0;
+ } else
+ cfqe->vdisktime = service_tree->min_vdisktime;
+
+ goto insert;
+ }
+ /*
+ * Ok, we get here, so this CFQ queue is already on the service tree;
+ * dequeue it first.
+ */
+ cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
+ orig_st->total_weight -= cfqe->weight;
+
+ new_cfqq = 0;
+
if (cfq_class_idle(cfqq)) {
- rb_key = CFQ_IDLE_DELAY;
parent = rb_last(&service_tree->rb);
if (parent && parent != &cfqe->rb_node) {
__cfqe = rb_entry(parent,
- struct cfq_entity,
- rb_node);
- rb_key += __cfqe->rb_key;
+ struct cfq_entity,
+ rb_node);
+ cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
} else
- rb_key += jiffies;
+ cfqe->vdisktime = service_tree->min_vdisktime;
} else if (!add_front) {
/*
- * Get our rb key offset. Subtract any residual slice
- * value carried from last service. A negative resid
- * count indicates slice overrun, and this should position
- * the next service time further away in the tree.
+ * We charge the CFQ queue by the time this queue runs, and
+ * reposition it on the service tree.
*/
- rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
- rb_key -= cfqq->slice_resid;
- cfqq->slice_resid = 0;
- } else {
- rb_key = -HZ;
- __cfqe = cfq_rb_first(service_tree);
- rb_key += __cfqe ? __cfqe->rb_key : jiffies;
- }
+ unsigned int used_sl;
- if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
- new_cfqq = 0;
- /*
- * same position, nothing more to do
- */
- if (rb_key == cfqe->rb_key &&
- cfqe->service_tree == service_tree)
- return;
-
- cfq_rb_erase(&cfqe->rb_node,
- cfqe->service_tree);
- cfqe->service_tree = NULL;
+ used_sl = cfq_cfqq_slice_usage(cfqq);
+ cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
+ } else {
+ cfqe->vdisktime = service_tree->min_vdisktime;
}
+insert:
left = 1;
parent = NULL;
cfqe->service_tree = service_tree;
p = &service_tree->rb.rb_node;
+ key = entity_key(service_tree, cfqe);
while (*p) {
struct rb_node **n;
@@ -1300,7 +1353,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
/*
* sort by key, that represents service time.
*/
- if (time_before(rb_key, __cfqe->rb_key))
+ if (key < entity_key(service_tree, __cfqe))
n = &(*p)->rb_left;
else {
n = &(*p)->rb_right;
@@ -1313,10 +1366,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
if (left)
service_tree->left = &cfqe->rb_node;
- cfqe->rb_key = rb_key;
rb_link_node(&cfqe->rb_node, parent, p);
rb_insert_color(&cfqe->rb_node, &service_tree->rb);
+ update_min_vdisktime(service_tree);
service_tree->count++;
+ service_tree->total_weight += cfqe->weight;
+ cfqq->reposition_time = jiffies;
if ((add_front || !new_cfqq) && !group_changed)
return;
cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1418,15 +1473,19 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
struct cfq_entity *cfqe;
+ struct cfq_rb_root *service_tree;
+
cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
BUG_ON(!cfq_cfqq_on_rr(cfqq));
cfq_clear_cfqq_on_rr(cfqq);
cfqe = &cfqq->cfqe;
+ service_tree = cfqe->service_tree;
if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
cfq_rb_erase(&cfqe->rb_node,
cfqe->service_tree);
+ service_tree->total_weight -= cfqe->weight;
cfqe->service_tree = NULL;
}
if (cfqq->p_root) {
@@ -2125,24 +2184,34 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
}
}
+/*
+ * The time when a CFQ queue is put onto a service tree is recorded in
+ * cfqq->reposition_time. Currently, we check the first CFQ queue on each
+ * service tree, and select the workload type that contains the CFQ queue
+ * with the lowest reposition_time among them.
+ */
static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
struct cfq_group *cfqg, enum wl_prio_t prio)
{
struct cfq_entity *cfqe;
+ struct cfq_queue *cfqq;
+ unsigned long lowest_start_time;
int i;
- bool key_valid = false;
- unsigned long lowest_key = 0;
+ bool time_valid = false;
enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
+ /*
+ * TODO: We may take io priority into account when choosing a workload
+ * type. But for the time being, just make use of reposition_time.
+ */
for (i = 0; i <= SYNC_WORKLOAD; ++i) {
- /* select the one with lowest rb_key */
cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
- if (cfqe &&
- (!key_valid ||
- time_before(cfqe->rb_key, lowest_key))) {
- lowest_key = cfqe->rb_key;
+ cfqq = cfqq_of_entity(cfqe);
+ if (cfqe && (!time_valid ||
+ cfqq->reposition_time < lowest_start_time)) {
+ lowest_start_time = cfqq->reposition_time;
cur_best = i;
- key_valid = true;
+ time_valid = true;
}
}
@@ -2814,10 +2883,13 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
{
struct task_struct *tsk = current;
int ioprio_class;
+ struct cfq_entity *cfqe;
if (!cfq_cfqq_prio_changed(cfqq))
return;
+ cfqe = &cfqq->cfqe;
+
ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
switch (ioprio_class) {
default:
@@ -2844,6 +2916,8 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
break;
}
+ cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
+
/*
* keep track of original prio settings in case we have to temporarily
* elevate the priority of this queue
@@ -3578,6 +3652,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
*/
static void cfq_prio_boost(struct cfq_queue *cfqq)
{
+ struct cfq_entity *cfqe;
+
+ cfqe = &cfqq->cfqe;
if (has_fs_excl()) {
/*
* boost idle prio on transactions that would lock out other
@@ -3594,6 +3671,11 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
cfqq->ioprio_class = cfqq->org_ioprio_class;
cfqq->ioprio = cfqq->org_ioprio;
}
+
+ /*
+ * update the io weight if io priority gets changed.
+ */
+ cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
}
static inline int __cfq_may_queue(struct cfq_queue *cfqq)
--
1.6.5.2
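For reference, with the mainline constants of the period (BLKIO_WEIGHT_MIN = 100,
BLKIO_WEIGHT_MAX = 1000, BLKIO_WEIGHT_DEFAULT = 500, IOPRIO_BE_NR = 8), the
cfq_prio_to_weight() mapping above works out as step = (1000 - 100) / 7 = 128,
giving:

	ioprio:    0     1    2    3    4    5    6    7
	weight:  1000   868  740  612  484  356  228  100

Likewise, cfq_get_boost() scales a base boost of cfq_slice[1] << CFQ_SERVICE_SHIFT
by BLKIO_WEIGHT_DEFAULT / (BLKIO_WEIGHT_MAX - weight + BLKIO_WEIGHT_MIN), so a
weight-1000 queue gets five times the base boost while a weight-100 queue gets
only half of it.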
^ permalink raw reply related [flat|nested] 41+ messages in thread
* Re: [PATCH 3/8 v2] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
2010-12-13 1:44 ` [PATCH 3/8 v2] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
@ 2010-12-13 16:59 ` Vivek Goyal
2010-12-14 2:41 ` Gui Jianfeng
0 siblings, 1 reply; 41+ messages in thread
From: Vivek Goyal @ 2010-12-13 16:59 UTC (permalink / raw)
To: Gui Jianfeng
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
On Mon, Dec 13, 2010 at 09:44:45AM +0800, Gui Jianfeng wrote:
> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
> maps to the weight range [100,1000]. This patch also gets rid of the
> cfq_slice_offset() logic and uses the same scheduling algorithm as the CFQ group
> does, but it still gives a newly added cfqq a small vdisktime jump according to
> its ioprio. This will help CFQ queues and groups schedule on the same service tree.
>
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> ---
> block/cfq-iosched.c | 196 ++++++++++++++++++++++++++++++++++++---------------
> 1 files changed, 139 insertions(+), 57 deletions(-)
>
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 91e9833..30d19c0 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -101,10 +101,7 @@ struct cfq_entity {
> struct cfq_rb_root *service_tree;
> /* service_tree member */
> struct rb_node rb_node;
> - /* service_tree key, represent the position on the tree */
> - unsigned long rb_key;
> -
> - /* group service_tree key */
> + /* service_tree key */
> u64 vdisktime;
> bool is_group_entity;
> unsigned int weight;
> @@ -116,6 +113,8 @@ struct cfq_entity {
> struct cfq_queue {
> /* The schedule entity */
> struct cfq_entity cfqe;
> + /* Reposition time */
> + unsigned long reposition_time;
> /* reference count */
> atomic_t ref;
> /* various state flags, see below */
> @@ -314,6 +313,22 @@ struct cfq_data {
> struct rcu_head rcu;
> };
>
> +/*
> + * Map io priority (7 ~ 0) to io weight (100 ~ 1000)
> + */
> +static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
> +{
> + unsigned int step;
> +
> + BUG_ON(ioprio >= IOPRIO_BE_NR);
> +
> + step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
> + if (ioprio == 0)
> + return BLKIO_WEIGHT_MAX;
> +
> + return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
> +}
> +
> static inline struct cfq_queue *
> cfqq_of_entity(struct cfq_entity *cfqe)
> {
> @@ -841,16 +856,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
> }
>
> -static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
> - struct cfq_queue *cfqq)
> -{
> - /*
> - * just an approximation, should be ok.
> - */
> - return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
> - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
> -}
> -
> static inline s64
> entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
> {
> @@ -1199,6 +1204,16 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
>
> #endif /* GROUP_IOSCHED */
>
> +static inline u64 cfq_get_boost(struct cfq_data *cfqd,
> + struct cfq_entity *cfqe)
> +{
> + u64 d = cfqd->cfq_slice[1] << CFQ_SERVICE_SHIFT;
> +
> + d = d * BLKIO_WEIGHT_DEFAULT;
> + do_div(d, BLKIO_WEIGHT_MAX - cfqe->weight + BLKIO_WEIGHT_MIN);
> + return d;
> +}
> +
> /*
> * The cfqd->service_trees holds all pending cfq_queue's that have
> * requests waiting to be processed. It is sorted in the order that
> @@ -1210,13 +1225,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> struct cfq_entity *cfqe;
> struct rb_node **p, *parent;
> struct cfq_entity *__cfqe;
> - unsigned long rb_key;
> - struct cfq_rb_root *service_tree;
> + struct cfq_rb_root *service_tree, *orig_st;
> int left;
> int new_cfqq = 1;
> int group_changed = 0;
> + s64 key;
>
> cfqe = &cfqq->cfqe;
> + orig_st = cfqe->service_tree;
>
> #ifdef CONFIG_CFQ_GROUP_IOSCHED
> if (!cfqd->cfq_group_isolation
> @@ -1224,8 +1240,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
> /* Move this cfq to root group */
> cfq_log_cfqq(cfqd, cfqq, "moving to root group");
> - if (!RB_EMPTY_NODE(&cfqe->rb_node))
> + if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> cfq_group_service_tree_del(cfqd, cfqq->cfqg);
> + /*
> + * Group changed, dequeue this CFQ queue from the
> + * original service tree.
> + */
> + cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> + orig_st->total_weight -= cfqe->weight;
> + }
> cfqq->orig_cfqg = cfqq->cfqg;
> cfqq->cfqg = &cfqd->root_group;
> atomic_inc(&cfqd->root_group.ref);
> @@ -1234,8 +1257,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
> /* cfqq is sequential now needs to go to its original group */
> BUG_ON(cfqq->cfqg != &cfqd->root_group);
> - if (!RB_EMPTY_NODE(&cfqe->rb_node))
> + if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> cfq_group_service_tree_del(cfqd, cfqq->cfqg);
> + /*
> + * Group changed, dequeue this CFQ queue from the
> + * original service tree.
> + */
> + cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> + orig_st->total_weight -= cfqe->weight;
> + }
> cfq_put_cfqg(cfqq->cfqg);
> cfqq->cfqg = cfqq->orig_cfqg;
> cfqq->orig_cfqg = NULL;
> @@ -1246,50 +1276,73 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>
> service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
> cfqq_type(cfqq));
> + /*
> + * For the time being, put the newly added CFQ queue at the end of the
> + * service tree.
> + */
> + if (RB_EMPTY_NODE(&cfqe->rb_node)) {
> + /*
> + * If this CFQ queue moves to another group, the original
> + * vdisktime makes no sense any more, so reset the vdisktime
> + * here.
> + */
> + parent = rb_last(&service_tree->rb);
> + if (parent) {
> + u64 boost;
> + s64 __vdisktime;
> +
> + __cfqe = rb_entry_entity(parent);
> + cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> +
> + /* Give some vdisktime boost according to its weight */
> + boost = cfq_get_boost(cfqd, cfqe);
> + __vdisktime = cfqe->vdisktime - boost;
> + if (__vdisktime)
> + cfqe->vdisktime = __vdisktime;
> + else
> + cfqe->vdisktime = 0;
After the subtraction (boost), __vdisktime can go negative. How do we make
sure that it does not go below min_vdisktime? Remember, min_vdisktime is
an increasing number.
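A minimal sketch of that clamping, ignoring u64 wrapping for brevity (the
helper name here is made up, not part of the patch):

	static inline u64 cfq_boosted_vdisktime(struct cfq_rb_root *st,
						u64 vdisktime, u64 boost)
	{
		/* Never position an entity before the tree's min_vdisktime. */
		if (vdisktime < boost || vdisktime - boost < st->min_vdisktime)
			return st->min_vdisktime;
		return vdisktime - boost;
	}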
> + } else
> + cfqe->vdisktime = service_tree->min_vdisktime;
> +
> + goto insert;
> + }
> + /*
> + * Ok, we get here, so this CFQ queue is already on the service tree;
> + * dequeue it first.
> + */
> + cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> + orig_st->total_weight -= cfqe->weight;
> +
> + new_cfqq = 0;
> +
> if (cfq_class_idle(cfqq)) {
> - rb_key = CFQ_IDLE_DELAY;
> parent = rb_last(&service_tree->rb);
> if (parent && parent != &cfqe->rb_node) {
> __cfqe = rb_entry(parent,
> - struct cfq_entity,
> - rb_node);
> - rb_key += __cfqe->rb_key;
> + struct cfq_entity,
> + rb_node);
> + cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> } else
> - rb_key += jiffies;
> + cfqe->vdisktime = service_tree->min_vdisktime;
> } else if (!add_front) {
> /*
> - * Get our rb key offset. Subtract any residual slice
> - * value carried from last service. A negative resid
> - * count indicates slice overrun, and this should position
> - * the next service time further away in the tree.
> + * We charge the CFQ queue by the time this queue runs, and
> + * reposition it on the service tree.
> */
> - rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
> - rb_key -= cfqq->slice_resid;
> - cfqq->slice_resid = 0;
> - } else {
> - rb_key = -HZ;
> - __cfqe = cfq_rb_first(service_tree);
> - rb_key += __cfqe ? __cfqe->rb_key : jiffies;
> - }
> + unsigned int used_sl;
>
> - if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> - new_cfqq = 0;
> - /*
> - * same position, nothing more to do
> - */
> - if (rb_key == cfqe->rb_key &&
> - cfqe->service_tree == service_tree)
> - return;
> -
> - cfq_rb_erase(&cfqe->rb_node,
> - cfqe->service_tree);
> - cfqe->service_tree = NULL;
> + used_sl = cfq_cfqq_slice_usage(cfqq);
> + cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
> + } else {
> + cfqe->vdisktime = service_tree->min_vdisktime;
> }
>
> +insert:
> left = 1;
> parent = NULL;
> cfqe->service_tree = service_tree;
> p = &service_tree->rb.rb_node;
> + key = entity_key(service_tree, cfqe);
> while (*p) {
> struct rb_node **n;
>
> @@ -1300,7 +1353,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> /*
> * sort by key, that represents service time.
> */
> - if (time_before(rb_key, __cfqe->rb_key))
> + if (key < entity_key(service_tree, __cfqe))
> n = &(*p)->rb_left;
> else {
> n = &(*p)->rb_right;
> @@ -1313,10 +1366,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> if (left)
> service_tree->left = &cfqe->rb_node;
>
> - cfqe->rb_key = rb_key;
> rb_link_node(&cfqe->rb_node, parent, p);
> rb_insert_color(&cfqe->rb_node, &service_tree->rb);
> + update_min_vdisktime(service_tree);
> service_tree->count++;
> + service_tree->total_weight += cfqe->weight;
> + cfqq->reposition_time = jiffies;
> if ((add_front || !new_cfqq) && !group_changed)
> return;
> cfq_group_service_tree_add(cfqd, cfqq->cfqg);
> @@ -1418,15 +1473,19 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> {
> struct cfq_entity *cfqe;
> + struct cfq_rb_root *service_tree;
> +
> cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
> BUG_ON(!cfq_cfqq_on_rr(cfqq));
> cfq_clear_cfqq_on_rr(cfqq);
>
> cfqe = &cfqq->cfqe;
> + service_tree = cfqe->service_tree;
>
> if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> cfq_rb_erase(&cfqe->rb_node,
> cfqe->service_tree);
> + service_tree->total_weight -= cfqe->weight;
> cfqe->service_tree = NULL;
> }
> if (cfqq->p_root) {
> @@ -2125,24 +2184,34 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
> }
> }
>
> +/*
> + * The time when a CFQ queue is put onto a service tree is recorded in
> + * cfqq->reposition_time. Currently, we check the first CFQ queue on each
> + * service tree, and select the workload type that contains the CFQ queue
> + * with the lowest reposition_time among them.
> + */
> static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> struct cfq_group *cfqg, enum wl_prio_t prio)
> {
> struct cfq_entity *cfqe;
> + struct cfq_queue *cfqq;
> + unsigned long lowest_start_time;
> int i;
> - bool key_valid = false;
> - unsigned long lowest_key = 0;
> + bool time_valid = false;
> enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>
> + /*
> + * TODO: We may take io priority into account when choosing a workload
> + * type. But for the time being, just make use of reposition_time.
> + */
> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> - /* select the one with lowest rb_key */
> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> - if (cfqe &&
> - (!key_valid ||
> - time_before(cfqe->rb_key, lowest_key))) {
> - lowest_key = cfqe->rb_key;
> + cfqq = cfqq_of_entity(cfqe);
> + if (cfqe && (!time_valid ||
> + cfqq->reposition_time < lowest_start_time)) {
Do you need to use the time_before() etc. macros here to take care of
jiffies/reposition_time wrapping?
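For reference, time_before() from <linux/jiffies.h> handles the wrap, so the
check above would become something like:

	if (cfqe && (!time_valid ||
		     time_before(cfqq->reposition_time, lowest_start_time))) {
		lowest_start_time = cfqq->reposition_time;
		cur_best = i;
		time_valid = true;
	}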
> + lowest_start_time = cfqq->reposition_time;
> cur_best = i;
> - key_valid = true;
> + time_valid = true;
> }
> }
>
> @@ -2814,10 +2883,13 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
> {
> struct task_struct *tsk = current;
> int ioprio_class;
> + struct cfq_entity *cfqe;
>
> if (!cfq_cfqq_prio_changed(cfqq))
> return;
>
> + cfqe = &cfqq->cfqe;
> +
> ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
> switch (ioprio_class) {
> default:
> @@ -2844,6 +2916,8 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
> break;
> }
>
> + cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
> +
Same here: can you update cfqe->weight while you are on the service tree? In
the past I could not, and we had to maintain a separate variable where
we stored the new weight; once we requeued the entity, we
processed the new weight.
> /*
> * keep track of original prio settings in case we have to temporarily
> * elevate the priority of this queue
> @@ -3578,6 +3652,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
> */
> static void cfq_prio_boost(struct cfq_queue *cfqq)
> {
> + struct cfq_entity *cfqe;
> +
> + cfqe = &cfqq->cfqe;
> if (has_fs_excl()) {
> /*
> * boost idle prio on transactions that would lock out other
> @@ -3594,6 +3671,11 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
> cfqq->ioprio_class = cfqq->org_ioprio_class;
> cfqq->ioprio = cfqq->org_ioprio;
> }
> +
> + /*
> + * update the io weight if io priority gets changed.
> + */
> + cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
How do you know that this cfqe/cfqq is not already on the service tree? I
don't think you can update weights while you are enqueued on the tree.
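The two-step pattern being described might look like this sketch (the
new_weight field is hypothetical, not something this patch adds):

	/* On ioprio change, only record the pending weight ... */
	cfqe->new_weight = cfq_prio_to_weight(cfqq->ioprio);

	/* ... and apply it later, only while the entity is off the tree,
	 * e.g. right before it is linked back into the service tree. */
	if (cfqe->new_weight) {
		cfqe->weight = cfqe->new_weight;
		cfqe->new_weight = 0;
	}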
> }
>
> static inline int __cfq_may_queue(struct cfq_queue *cfqq)
> --
> 1.6.5.2
>
>
>
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 3/8 v2] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
2010-12-13 16:59 ` Vivek Goyal
@ 2010-12-14 2:41 ` Gui Jianfeng
2010-12-14 2:47 ` Vivek Goyal
0 siblings, 1 reply; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-14 2:41 UTC (permalink / raw)
To: Vivek Goyal
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Vivek Goyal wrote:
> On Mon, Dec 13, 2010 at 09:44:45AM +0800, Gui Jianfeng wrote:
>> Introduce vdisktime and io weight for CFQ queue scheduling. Currently, io priority
>> maps to the weight range [100,1000]. This patch also gets rid of the
>> cfq_slice_offset() logic and uses the same scheduling algorithm as the CFQ group
>> does, but it still gives a newly added cfqq a small vdisktime jump according to
>> its ioprio. This will help CFQ queues and groups schedule on the same service tree.
>>
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>> ---
>> block/cfq-iosched.c | 196 ++++++++++++++++++++++++++++++++++++---------------
>> 1 files changed, 139 insertions(+), 57 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index 91e9833..30d19c0 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -101,10 +101,7 @@ struct cfq_entity {
>> struct cfq_rb_root *service_tree;
>> /* service_tree member */
>> struct rb_node rb_node;
>> - /* service_tree key, represent the position on the tree */
>> - unsigned long rb_key;
>> -
>> - /* group service_tree key */
>> + /* service_tree key */
>> u64 vdisktime;
>> bool is_group_entity;
>> unsigned int weight;
>> @@ -116,6 +113,8 @@ struct cfq_entity {
>> struct cfq_queue {
>> /* The schedule entity */
>> struct cfq_entity cfqe;
>> + /* Reposition time */
>> + unsigned long reposition_time;
>> /* reference count */
>> atomic_t ref;
>> /* various state flags, see below */
>> @@ -314,6 +313,22 @@ struct cfq_data {
>> struct rcu_head rcu;
>> };
>>
>> +/*
>> + * Map io priority (7 ~ 0) to io weight (100 ~ 1000)
>> + */
>> +static inline unsigned int cfq_prio_to_weight(unsigned short ioprio)
>> +{
>> + unsigned int step;
>> +
>> + BUG_ON(ioprio >= IOPRIO_BE_NR);
>> +
>> + step = (BLKIO_WEIGHT_MAX - BLKIO_WEIGHT_MIN) / (IOPRIO_BE_NR - 1);
>> + if (ioprio == 0)
>> + return BLKIO_WEIGHT_MAX;
>> +
>> + return BLKIO_WEIGHT_MIN + (IOPRIO_BE_NR - ioprio - 1) * step;
>> +}
>> +
>> static inline struct cfq_queue *
>> cfqq_of_entity(struct cfq_entity *cfqe)
>> {
>> @@ -841,16 +856,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
>> }
>>
>> -static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
>> - struct cfq_queue *cfqq)
>> -{
>> - /*
>> - * just an approximation, should be ok.
>> - */
>> - return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
>> - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
>> -}
>> -
>> static inline s64
>> entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
>> {
>> @@ -1199,6 +1204,16 @@ static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
>>
>> #endif /* GROUP_IOSCHED */
>>
>> +static inline u64 cfq_get_boost(struct cfq_data *cfqd,
>> + struct cfq_entity *cfqe)
>> +{
>> + u64 d = cfqd->cfq_slice[1] << CFQ_SERVICE_SHIFT;
>> +
>> + d = d * BLKIO_WEIGHT_DEFAULT;
>> + do_div(d, BLKIO_WEIGHT_MAX - cfqe->weight + BLKIO_WEIGHT_MIN);
>> + return d;
>> +}
>> +
>> /*
>> * The cfqd->service_trees holds all pending cfq_queue's that have
>> * requests waiting to be processed. It is sorted in the order that
>> @@ -1210,13 +1225,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> struct cfq_entity *cfqe;
>> struct rb_node **p, *parent;
>> struct cfq_entity *__cfqe;
>> - unsigned long rb_key;
>> - struct cfq_rb_root *service_tree;
>> + struct cfq_rb_root *service_tree, *orig_st;
>> int left;
>> int new_cfqq = 1;
>> int group_changed = 0;
>> + s64 key;
>>
>> cfqe = &cfqq->cfqe;
>> + orig_st = cfqe->service_tree;
>>
>> #ifdef CONFIG_CFQ_GROUP_IOSCHED
>> if (!cfqd->cfq_group_isolation
>> @@ -1224,8 +1240,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
>> /* Move this cfq to root group */
>> cfq_log_cfqq(cfqd, cfqq, "moving to root group");
>> - if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> + if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>> cfq_group_service_tree_del(cfqd, cfqq->cfqg);
>> + /*
>> + * Group changed, dequeue this CFQ queue from the
>> + * original service tree.
>> + */
>> + cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>> + orig_st->total_weight -= cfqe->weight;
>> + }
>> cfqq->orig_cfqg = cfqq->cfqg;
>> cfqq->cfqg = &cfqd->root_group;
>> atomic_inc(&cfqd->root_group.ref);
>> @@ -1234,8 +1257,15 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
>> /* cfqq is sequential now needs to go to its original group */
>> BUG_ON(cfqq->cfqg != &cfqd->root_group);
>> - if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> + if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>> cfq_group_service_tree_del(cfqd, cfqq->cfqg);
>> + /*
>> + * Group changed, dequeue this CFQ queue from the
>> + * original service tree.
>> + */
>> + cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>> + orig_st->total_weight -= cfqe->weight;
>> + }
>> cfq_put_cfqg(cfqq->cfqg);
>> cfqq->cfqg = cfqq->orig_cfqg;
>> cfqq->orig_cfqg = NULL;
>> @@ -1246,50 +1276,73 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>
>> service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
>> cfqq_type(cfqq));
>> + /*
>> + * For the time being, put the newly added CFQ queue at the end of the
>> + * service tree.
>> + */
>> + if (RB_EMPTY_NODE(&cfqe->rb_node)) {
>> + /*
>> + * If this CFQ queue moves to another group, the original
>> + * vdisktime makes no sense any more, so reset the vdisktime
>> + * here.
>> + */
>> + parent = rb_last(&service_tree->rb);
>> + if (parent) {
>> + u64 boost;
>> + s64 __vdisktime;
>> +
>> + __cfqe = rb_entry_entity(parent);
>> + cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> +
>> + /* Give some vdisktime boost according to its weight */
>> + boost = cfq_get_boost(cfqd, cfqe);
>> + __vdisktime = cfqe->vdisktime - boost;
>> + if (__vdisktime)
>> + cfqe->vdisktime = __vdisktime;
>> + else
>> + cfqe->vdisktime = 0;
>
> After the subtraction (boost), __vdisktime can go negative. How do we make
> sure that it does not go below min_vdisktime? Remember, min_vdisktime is
> an increasing number.
Will take min_vdisktime into account.
>
>> + } else
>> + cfqe->vdisktime = service_tree->min_vdisktime;
>> +
>> + goto insert;
>> + }
>> + /*
>> + * Ok, we get here, so this CFQ queue is already on the service tree;
>> + * dequeue it first.
>> + */
>> + cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
>> + orig_st->total_weight -= cfqe->weight;
>> +
>> + new_cfqq = 0;
>> +
>> if (cfq_class_idle(cfqq)) {
>> - rb_key = CFQ_IDLE_DELAY;
>> parent = rb_last(&service_tree->rb);
>> if (parent && parent != &cfqe->rb_node) {
>> __cfqe = rb_entry(parent,
>> - struct cfq_entity,
>> - rb_node);
>> - rb_key += __cfqe->rb_key;
>> + struct cfq_entity,
>> + rb_node);
>> + cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> } else
>> - rb_key += jiffies;
>> + cfqe->vdisktime = service_tree->min_vdisktime;
>> } else if (!add_front) {
>> /*
>> - * Get our rb key offset. Subtract any residual slice
>> - * value carried from last service. A negative resid
>> - * count indicates slice overrun, and this should position
>> - * the next service time further away in the tree.
>> + * We charge the CFQ queue by the time this queue runs, and
>> + * reposition it on the service tree.
>> */
>> - rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
>> - rb_key -= cfqq->slice_resid;
>> - cfqq->slice_resid = 0;
>> - } else {
>> - rb_key = -HZ;
>> - __cfqe = cfq_rb_first(service_tree);
>> - rb_key += __cfqe ? __cfqe->rb_key : jiffies;
>> - }
>> + unsigned int used_sl;
>>
>> - if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>> - new_cfqq = 0;
>> - /*
>> - * same position, nothing more to do
>> - */
>> - if (rb_key == cfqe->rb_key &&
>> - cfqe->service_tree == service_tree)
>> - return;
>> -
>> - cfq_rb_erase(&cfqe->rb_node,
>> - cfqe->service_tree);
>> - cfqe->service_tree = NULL;
>> + used_sl = cfq_cfqq_slice_usage(cfqq);
>> + cfqe->vdisktime += cfq_scale_slice(used_sl, cfqe);
>> + } else {
>> + cfqe->vdisktime = service_tree->min_vdisktime;
>> }
>>
>> +insert:
>> left = 1;
>> parent = NULL;
>> cfqe->service_tree = service_tree;
>> p = &service_tree->rb.rb_node;
>> + key = entity_key(service_tree, cfqe);
>> while (*p) {
>> struct rb_node **n;
>>
>> @@ -1300,7 +1353,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> /*
>> * sort by key, that represents service time.
>> */
>> - if (time_before(rb_key, __cfqe->rb_key))
>> + if (key < entity_key(service_tree, __cfqe))
>> n = &(*p)->rb_left;
>> else {
>> n = &(*p)->rb_right;
>> @@ -1313,10 +1366,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> if (left)
>> service_tree->left = &cfqe->rb_node;
>>
>> - cfqe->rb_key = rb_key;
>> rb_link_node(&cfqe->rb_node, parent, p);
>> rb_insert_color(&cfqe->rb_node, &service_tree->rb);
>> + update_min_vdisktime(service_tree);
>> service_tree->count++;
>> + service_tree->total_weight += cfqe->weight;
>> + cfqq->reposition_time = jiffies;
>> if ((add_front || !new_cfqq) && !group_changed)
>> return;
>> cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>> @@ -1418,15 +1473,19 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>> static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>> {
>> struct cfq_entity *cfqe;
>> + struct cfq_rb_root *service_tree;
>> +
>> cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
>> BUG_ON(!cfq_cfqq_on_rr(cfqq));
>> cfq_clear_cfqq_on_rr(cfqq);
>>
>> cfqe = &cfqq->cfqe;
>> + service_tree = cfqe->service_tree;
>>
>> if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>> cfq_rb_erase(&cfqe->rb_node,
>> cfqe->service_tree);
>> + service_tree->total_weight -= cfqe->weight;
>> cfqe->service_tree = NULL;
>> }
>> if (cfqq->p_root) {
>> @@ -2125,24 +2184,34 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
>> }
>> }
>>
>> +/*
>> + * The time when a CFQ queue is put onto a service tree is recorded in
>> + * cfqq->reposition_time. Currently, we check the first CFQ queue on each
>> + * service tree, and select the workload type that contains the CFQ queue
>> + * with the lowest reposition_time among them.
>> + */
>> static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> struct cfq_group *cfqg, enum wl_prio_t prio)
>> {
>> struct cfq_entity *cfqe;
>> + struct cfq_queue *cfqq;
>> + unsigned long lowest_start_time;
>> int i;
>> - bool key_valid = false;
>> - unsigned long lowest_key = 0;
>> + bool time_valid = false;
>> enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
>>
>> + /*
>> + * TODO: We may take io priority into account when choosing a workload
>> + * type. But for the time being, just make use of reposition_time.
>> + */
>> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>> - /* select the one with lowest rb_key */
>> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> - if (cfqe &&
>> - (!key_valid ||
>> - time_before(cfqe->rb_key, lowest_key))) {
>> - lowest_key = cfqe->rb_key;
>> + cfqq = cfqq_of_entity(cfqe);
>> + if (cfqe && (!time_valid ||
>> + cfqq->reposition_time < lowest_start_time)) {
>
> Do you need to use the time_before() etc. macros here to take care of
> jiffies/reposition_time wrapping?
Ok
>
>> + lowest_start_time = cfqq->reposition_time;
>> cur_best = i;
>> - key_valid = true;
>> + time_valid = true;
>> }
>> }
>>
>> @@ -2814,10 +2883,13 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
>> {
>> struct task_struct *tsk = current;
>> int ioprio_class;
>> + struct cfq_entity *cfqe;
>>
>> if (!cfq_cfqq_prio_changed(cfqq))
>> return;
>>
>> + cfqe = &cfqq->cfqe;
>> +
>> ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
>> switch (ioprio_class) {
>> default:
>> @@ -2844,6 +2916,8 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
>> break;
>> }
>>
>> + cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
>> +
>
> Same here: can you update cfqe->weight while you are on the service tree? In
> the past I could not, and we had to maintain a separate variable where
> we stored the new weight; once we requeued the entity, we
> processed the new weight.
>
>> /*
>> * keep track of original prio settings in case we have to temporarily
>> * elevate the priority of this queue
>> @@ -3578,6 +3652,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
>> */
>> static void cfq_prio_boost(struct cfq_queue *cfqq)
>> {
>> + struct cfq_entity *cfqe;
>> +
>> + cfqe = &cfqq->cfqe;
>> if (has_fs_excl()) {
>> /*
>> * boost idle prio on transactions that would lock out other
>> @@ -3594,6 +3671,11 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
>> cfqq->ioprio_class = cfqq->org_ioprio_class;
>> cfqq->ioprio = cfqq->org_ioprio;
>> }
>> +
>> + /*
>> + * update the io weight if io priority gets changed.
>> + */
>> + cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
>
> How do you know that this cfqe/cfqq is not already on the service tree? I
> don't think you can update weights while you are enqueued on the tree.
Vivek, I'm not sure why we can't update weights while cfqe is enqueued on the tree.
Do you mean that when we update cfqe's weight we need to update the tree's
total_weight accordingly?
>
>> }
>>
>> static inline int __cfq_may_queue(struct cfq_queue *cfqq)
>> --
>> 1.6.5.2
>>
>>
>>
>
--
Regards
Gui Jianfeng
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 3/8 v2] cfq-iosched: Introduce vdisktime and io weight for CFQ queue
2010-12-14 2:41 ` Gui Jianfeng
@ 2010-12-14 2:47 ` Vivek Goyal
0 siblings, 0 replies; 41+ messages in thread
From: Vivek Goyal @ 2010-12-14 2:47 UTC (permalink / raw)
To: Gui Jianfeng
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
On Tue, Dec 14, 2010 at 10:41:32AM +0800, Gui Jianfeng wrote:
[..]
> >> +
> >> + /*
> >> + * update the io weight if io priority gets changed.
> >> + */
> >> + cfqe->weight = cfq_prio_to_weight(cfqq->ioprio);
> >
> > How do you know that this cfqe/cfqq is not already on the service tree? I
> > don't think you can update weights while you are enqueued on the tree.
>
> Vivek, I'm not sure why we can't update weights if cfqe is enqueued on the tree.
> Do you mean when we update cfqe's weight we need to update tree's total_weight
> accordingly?
So far I can only think of adjusting total_weight accordingly if you are
updating the queue weight while it is on the service tree.
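Such an in-place update could look like this sketch (helper name made up; it
assumes the caller already holds the queue lock):

	static void cfq_entity_update_weight(struct cfq_rb_root *st,
					     struct cfq_entity *cfqe,
					     unsigned int new_weight)
	{
		if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
			/* Enqueued: keep the tree's total_weight consistent. */
			st->total_weight -= cfqe->weight;
			cfqe->weight = new_weight;
			st->total_weight += cfqe->weight;
		} else
			cfqe->weight = new_weight;
	}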
Thanks
Vivek
^ permalink raw reply [flat|nested] 41+ messages in thread
* [PATCH 4/8 v2] cfq-iosched: Extract some common code of service tree handling for CFQ queue and CFQ group.
[not found] ` <4D01C6AB.9040807@cn.fujitsu.com>
` (2 preceding siblings ...)
2010-12-13 1:44 ` [PATCH 3/8 v2] cfq-iosched: Introduce vdisktime and io weight for CFQ queue Gui Jianfeng
@ 2010-12-13 1:44 ` Gui Jianfeng
2010-12-13 22:11 ` Vivek Goyal
2010-12-13 1:45 ` [PATCH 5/8 v2] cfq-iosched: Introduce hierarchical scheduling with CFQ queue and group at the same level Gui Jianfeng
` (3 subsequent siblings)
7 siblings, 1 reply; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-13 1:44 UTC (permalink / raw)
To: Jens Axboe, Vivek Goyal
Cc: Gui Jianfeng, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Extract some common service tree handling code for CFQ queues and
CFQ groups. This helps when CFQ queues and CFQ groups are scheduled
together.
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
block/cfq-iosched.c | 87 +++++++++++++++++++++------------------------------
1 files changed, 36 insertions(+), 51 deletions(-)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 30d19c0..6486956 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -863,12 +863,11 @@ entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
}
static void
-__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
+__cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
{
struct rb_node **node = &st->rb.rb_node;
struct rb_node *parent = NULL;
struct cfq_entity *__cfqe;
- struct cfq_entity *cfqe = &cfqg->cfqe;
s64 key = entity_key(st, cfqe);
int left = 1;
@@ -892,6 +891,14 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
}
static void
+cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
+{
+ __cfq_entity_service_tree_add(st, cfqe);
+ st->count++;
+ st->total_weight += cfqe->weight;
+}
+
+static void
cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
@@ -915,8 +922,23 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
} else
cfqe->vdisktime = st->min_vdisktime;
- __cfq_group_service_tree_add(st, cfqg);
- st->total_weight += cfqe->weight;
+ cfq_entity_service_tree_add(st, cfqe);
+}
+
+static void
+__cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
+{
+ cfq_rb_erase(&cfqe->rb_node, st);
+}
+
+static void
+cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
+{
+ if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
+ __cfq_entity_service_tree_del(st, cfqe);
+ st->total_weight -= cfqe->weight;
+ cfqe->service_tree = NULL;
+ }
}
static void
@@ -933,9 +955,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
return;
cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
- st->total_weight -= cfqe->weight;
- if (!RB_EMPTY_NODE(&cfqe->rb_node))
- cfq_rb_erase(&cfqe->rb_node, st);
+ cfq_entity_service_tree_del(st, cfqe);
cfqg->saved_workload_slice = 0;
cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
}
@@ -984,9 +1004,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
charge = cfqq->allocated_slice;
/* Can't update vdisktime while group is on service tree */
- cfq_rb_erase(&cfqe->rb_node, st);
+ __cfq_entity_service_tree_del(st, cfqe);
cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
- __cfq_group_service_tree_add(st, cfqg);
+ __cfq_entity_service_tree_add(st, cfqe);
/* This group is being expired. Save the context */
if (time_after(cfqd->workload_expires, jiffies)) {
@@ -1223,13 +1243,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
bool add_front)
{
struct cfq_entity *cfqe;
- struct rb_node **p, *parent;
+ struct rb_node *parent;
struct cfq_entity *__cfqe;
struct cfq_rb_root *service_tree, *orig_st;
- int left;
int new_cfqq = 1;
int group_changed = 0;
- s64 key;
cfqe = &cfqq->cfqe;
orig_st = cfqe->service_tree;
@@ -1246,8 +1264,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
* Group changed, dequeue this CFQ queue from the
* original service tree.
*/
- cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
- orig_st->total_weight -= cfqe->weight;
+ cfq_entity_service_tree_del(orig_st, cfqe);
}
cfqq->orig_cfqg = cfqq->cfqg;
cfqq->cfqg = &cfqd->root_group;
@@ -1263,8 +1280,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
* Group changed, dequeue this CFQ queue from the
* original service tree.
*/
- cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
- orig_st->total_weight -= cfqe->weight;
+ cfq_entity_service_tree_del(orig_st, cfqe);
}
cfq_put_cfqg(cfqq->cfqg);
cfqq->cfqg = cfqq->orig_cfqg;
@@ -1310,8 +1326,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
* Ok, we get here, so this CFQ queue is already on the service tree;
* dequeue it first.
*/
- cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
- orig_st->total_weight -= cfqe->weight;
+ cfq_entity_service_tree_del(orig_st, cfqe);
new_cfqq = 0;
@@ -1338,39 +1353,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
}
insert:
- left = 1;
- parent = NULL;
cfqe->service_tree = service_tree;
- p = &service_tree->rb.rb_node;
- key = entity_key(service_tree, cfqe);
- while (*p) {
- struct rb_node **n;
-
- parent = *p;
- __cfqe = rb_entry(parent, struct cfq_entity,
- rb_node);
-
- /*
- * sort by key, that represents service time.
- */
- if (key < entity_key(service_tree, __cfqe))
- n = &(*p)->rb_left;
- else {
- n = &(*p)->rb_right;
- left = 0;
- }
- p = n;
- }
+ /* Add cfqq onto service tree. */
+ cfq_entity_service_tree_add(service_tree, cfqe);
- if (left)
- service_tree->left = &cfqe->rb_node;
-
- rb_link_node(&cfqe->rb_node, parent, p);
- rb_insert_color(&cfqe->rb_node, &service_tree->rb);
update_min_vdisktime(service_tree);
- service_tree->count++;
- service_tree->total_weight += cfqe->weight;
cfqq->reposition_time = jiffies;
if ((add_front || !new_cfqq) && !group_changed)
return;
@@ -1483,10 +1471,7 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
service_tree = cfqe->service_tree;
if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
- cfq_rb_erase(&cfqe->rb_node,
- cfqe->service_tree);
- service_tree->total_weight -= cfqe->weight;
- cfqe->service_tree = NULL;
+ cfq_entity_service_tree_del(service_tree, cfqe);
}
if (cfqq->p_root) {
rb_erase(&cfqq->p_node, cfqq->p_root);
--
1.6.5.2
^ permalink raw reply related [flat|nested] 41+ messages in thread
* Re: [PATCH 4/8 v2] cfq-iosched: Extract some common code of service tree handling for CFQ queue and CFQ group.
2010-12-13 1:44 ` [PATCH 4/8 v2] cfq-iosched: Extract some common code of service tree handling for CFQ queue and CFQ group Gui Jianfeng
@ 2010-12-13 22:11 ` Vivek Goyal
0 siblings, 0 replies; 41+ messages in thread
From: Vivek Goyal @ 2010-12-13 22:11 UTC (permalink / raw)
To: Gui Jianfeng
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
On Mon, Dec 13, 2010 at 09:44:54AM +0800, Gui Jianfeng wrote:
> Extract some common service tree handling code for CFQ queues and
> CFQ groups. This helps when CFQ queues and CFQ groups are scheduled
> together.
>
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> ---
> block/cfq-iosched.c | 87 +++++++++++++++++++++------------------------------
> 1 files changed, 36 insertions(+), 51 deletions(-)
This looks good to me.
Acked-by: Vivek Goyal <vgoyal@redhat.com>
>
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 30d19c0..6486956 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -863,12 +863,11 @@ entity_key(struct cfq_rb_root *st, struct cfq_entity *entity)
> }
>
> static void
> -__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
> +__cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> {
> struct rb_node **node = &st->rb.rb_node;
> struct rb_node *parent = NULL;
> struct cfq_entity *__cfqe;
> - struct cfq_entity *cfqe = &cfqg->cfqe;
> s64 key = entity_key(st, cfqe);
> int left = 1;
>
> @@ -892,6 +891,14 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
> }
>
> static void
> +cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> +{
> + __cfq_entity_service_tree_add(st, cfqe);
> + st->count++;
> + st->total_weight += cfqe->weight;
> +}
> +
> +static void
> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> struct cfq_rb_root *st = &cfqd->grp_service_tree;
> @@ -915,8 +922,23 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> } else
> cfqe->vdisktime = st->min_vdisktime;
>
> - __cfq_group_service_tree_add(st, cfqg);
> - st->total_weight += cfqe->weight;
> + cfq_entity_service_tree_add(st, cfqe);
> +}
> +
> +static void
> +__cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> +{
> + cfq_rb_erase(&cfqe->rb_node, st);
> +}
> +
> +static void
> +cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> +{
> + if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> + __cfq_entity_service_tree_del(st, cfqe);
> + st->total_weight -= cfqe->weight;
> + cfqe->service_tree = NULL;
> + }
> }
>
> static void
> @@ -933,9 +955,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
> return;
>
> cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
> - st->total_weight -= cfqe->weight;
> - if (!RB_EMPTY_NODE(&cfqe->rb_node))
> - cfq_rb_erase(&cfqe->rb_node, st);
> + cfq_entity_service_tree_del(st, cfqe);
> cfqg->saved_workload_slice = 0;
> cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
> }
> @@ -984,9 +1004,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> charge = cfqq->allocated_slice;
>
> /* Can't update vdisktime while group is on service tree */
> - cfq_rb_erase(&cfqe->rb_node, st);
> + __cfq_entity_service_tree_del(st, cfqe);
> cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
> - __cfq_group_service_tree_add(st, cfqg);
> + __cfq_entity_service_tree_add(st, cfqe);
>
> /* This group is being expired. Save the context */
> if (time_after(cfqd->workload_expires, jiffies)) {
> @@ -1223,13 +1243,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> bool add_front)
> {
> struct cfq_entity *cfqe;
> - struct rb_node **p, *parent;
> + struct rb_node *parent;
> struct cfq_entity *__cfqe;
> struct cfq_rb_root *service_tree, *orig_st;
> - int left;
> int new_cfqq = 1;
> int group_changed = 0;
> - s64 key;
>
> cfqe = &cfqq->cfqe;
> orig_st = cfqe->service_tree;
> @@ -1246,8 +1264,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> * Group changed, dequeue this CFQ queue from the
> * original service tree.
> */
> - cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> - orig_st->total_weight -= cfqe->weight;
> + cfq_entity_service_tree_del(orig_st, cfqe);
> }
> cfqq->orig_cfqg = cfqq->cfqg;
> cfqq->cfqg = &cfqd->root_group;
> @@ -1263,8 +1280,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> * Group changed, dequeue this CFQ queue from the
> * original service tree.
> */
> - cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> - orig_st->total_weight -= cfqe->weight;
> + cfq_entity_service_tree_del(orig_st, cfqe);
> }
> cfq_put_cfqg(cfqq->cfqg);
> cfqq->cfqg = cfqq->orig_cfqg;
> @@ -1310,8 +1326,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> * Ok, we get here, so this CFQ queue is already on the service tree;
> * dequeue it first.
> */
> - cfq_rb_erase(&cfqe->rb_node, cfqe->service_tree);
> - orig_st->total_weight -= cfqe->weight;
> + cfq_entity_service_tree_del(orig_st, cfqe);
>
> new_cfqq = 0;
>
> @@ -1338,39 +1353,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> }
>
> insert:
> - left = 1;
> - parent = NULL;
> cfqe->service_tree = service_tree;
> - p = &service_tree->rb.rb_node;
> - key = entity_key(service_tree, cfqe);
> - while (*p) {
> - struct rb_node **n;
> -
> - parent = *p;
> - __cfqe = rb_entry(parent, struct cfq_entity,
> - rb_node);
> -
> - /*
> - * sort by key, that represents service time.
> - */
> - if (key < entity_key(service_tree, __cfqe))
> - n = &(*p)->rb_left;
> - else {
> - n = &(*p)->rb_right;
> - left = 0;
> - }
>
> - p = n;
> - }
> + /* Add cfqq onto service tree. */
> + cfq_entity_service_tree_add(service_tree, cfqe);
>
> - if (left)
> - service_tree->left = &cfqe->rb_node;
> -
> - rb_link_node(&cfqe->rb_node, parent, p);
> - rb_insert_color(&cfqe->rb_node, &service_tree->rb);
> update_min_vdisktime(service_tree);
> - service_tree->count++;
> - service_tree->total_weight += cfqe->weight;
> cfqq->reposition_time = jiffies;
> if ((add_front || !new_cfqq) && !group_changed)
> return;
> @@ -1483,10 +1471,7 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> service_tree = cfqe->service_tree;
>
> if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> - cfq_rb_erase(&cfqe->rb_node,
> - cfqe->service_tree);
> - service_tree->total_weight -= cfqe->weight;
> - cfqe->service_tree = NULL;
> + cfq_entity_service_tree_del(service_tree, cfqe);
> }
> if (cfqq->p_root) {
> rb_erase(&cfqq->p_node, cfqq->p_root);
> --
> 1.6.5.2
>
^ permalink raw reply [flat|nested] 41+ messages in thread
* [PATCH 5/8 v2] cfq-iosched: Introduce hierarchical scheduling with CFQ queue and group at the same level
[not found] ` <4D01C6AB.9040807@cn.fujitsu.com>
` (3 preceding siblings ...)
2010-12-13 1:44 ` [PATCH 4/8 v2] cfq-iosched: Extract some common code of service tree handling for CFQ queue and CFQ group Gui Jianfeng
@ 2010-12-13 1:45 ` Gui Jianfeng
2010-12-14 3:49 ` Vivek Goyal
2010-12-13 1:45 ` [PATCH 6/8] blkio-cgroup: "use_hierarchy" interface without any functionality Gui Jianfeng
` (2 subsequent siblings)
7 siblings, 1 reply; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-13 1:45 UTC (permalink / raw)
To: Jens Axboe, Vivek Goyal
Cc: Gui Jianfeng, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
This patch makes CFQ queues and CFQ groups schedule at the same level.
Consider the following hierarchy:
        Root
       /  |  \
     q1   q2  G1
             /  \
           q3    G2
q1, q2 and q3 are CFQ queues; G1 and G2 are CFQ groups. With this patch, q1,
q2 and G1 are scheduled on the same service tree in the root CFQ group, while
q3 and G2 are scheduled under G1. Note that, for the time being, a CFQ group is
treated as a "BE and SYNC" workload and is put on the "BE and SYNC" service
tree. That means service differentiation only happens on the "BE and SYNC"
service tree. Later, we may introduce an "IO Class" for CFQ groups.
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
block/cfq-iosched.c | 473 ++++++++++++++++++++++++++++++++++----------------
1 files changed, 321 insertions(+), 152 deletions(-)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6486956..d90627e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -105,6 +105,9 @@ struct cfq_entity {
u64 vdisktime;
bool is_group_entity;
unsigned int weight;
+ struct cfq_entity *parent;
+ /* Reposition time */
+ unsigned long reposition_time;
};
/*
@@ -113,8 +116,6 @@ struct cfq_entity {
struct cfq_queue {
/* The schedule entity */
struct cfq_entity cfqe;
- /* Reposition time */
- unsigned long reposition_time;
/* reference count */
atomic_t ref;
/* various state flags, see below */
@@ -194,6 +195,9 @@ struct cfq_group {
/* number of cfqq currently on this group */
int nr_cfqq;
+ /* number of sub cfq groups */
+ int nr_subgp;
+
/*
* Per group busy queus average. Useful for workload slice calc. We
* create the array for each prio class but at run time it is used
@@ -229,8 +233,6 @@ struct cfq_group {
*/
struct cfq_data {
struct request_queue *queue;
- /* Root service tree for cfq_groups */
- struct cfq_rb_root grp_service_tree;
struct cfq_group root_group;
/*
@@ -347,8 +349,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
return NULL;
}
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
-
static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
enum wl_prio_t prio,
enum wl_type_t type)
@@ -638,10 +638,15 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
static inline unsigned
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_entity *cfqe = &cfqg->cfqe;
+ struct cfq_rb_root *st = cfqe->service_tree;
- return cfq_target_latency * cfqe->weight / st->total_weight;
+ if (st)
+ return cfq_target_latency * cfqe->weight
+ / st->total_weight;
+ else
+ /* If this is the root group, give it a full slice. */
+ return cfq_target_latency;
}
static inline void
@@ -804,17 +809,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
return NULL;
}
-static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
-{
- if (!root->left)
- root->left = rb_first(&root->rb);
-
- if (root->left)
- return rb_entry_entity(root->left);
-
- return NULL;
-}
-
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
rb_erase(n, root);
@@ -888,12 +882,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
rb_link_node(&cfqe->rb_node, parent, node);
rb_insert_color(&cfqe->rb_node, &st->rb);
+
+ update_min_vdisktime(st);
}
static void
cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
{
__cfq_entity_service_tree_add(st, cfqe);
+ cfqe->reposition_time = jiffies;
st->count++;
st->total_weight += cfqe->weight;
}
@@ -901,34 +898,57 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
static void
cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_entity *cfqe = &cfqg->cfqe;
- struct cfq_entity *__cfqe;
struct rb_node *n;
+ struct cfq_entity *entity;
+ struct cfq_rb_root *st;
+ struct cfq_group *__cfqg;
cfqg->nr_cfqq++;
+
+ /*
+ * Root group doesn't belong to any service tree
+ */
+ if (cfqg == &cfqd->root_group)
+ return;
+
if (!RB_EMPTY_NODE(&cfqe->rb_node))
return;
/*
- * Currently put the group at the end. Later implement something
- * so that groups get lesser vtime based on their weights, so that
- * if group does not loose all if it was not continously backlogged.
+ * Enqueue this group and its ancestors onto their service tree.
*/
- n = rb_last(&st->rb);
- if (n) {
- __cfqe = rb_entry_entity(n);
- cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
- } else
- cfqe->vdisktime = st->min_vdisktime;
+ while (cfqe && cfqe->parent) {
+ if (!RB_EMPTY_NODE(&cfqe->rb_node))
+ return;
+
+ /*
+ * Currently put the group at the end. Later, implement
+ * something so that groups get a lesser vtime based on their
+ * weights, so that a group does not lose everything if it was not
+ * continuously backlogged.
+ */
+ st = cfqe->service_tree;
+ n = rb_last(&st->rb);
+ if (n) {
+ entity = rb_entry_entity(n);
+ cfqe->vdisktime = entity->vdisktime +
+ CFQ_IDLE_DELAY;
+ } else
+ cfqe->vdisktime = st->min_vdisktime;
- cfq_entity_service_tree_add(st, cfqe);
+ cfq_entity_service_tree_add(st, cfqe);
+ cfqe = cfqe->parent;
+ __cfqg = cfqg_of_entity(cfqe);
+ __cfqg->nr_subgp++;
+ }
}
static void
__cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
{
cfq_rb_erase(&cfqe->rb_node, st);
+ update_min_vdisktime(st);
}
static void
@@ -937,27 +957,47 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
__cfq_entity_service_tree_del(st, cfqe);
st->total_weight -= cfqe->weight;
- cfqe->service_tree = NULL;
}
}
static void
cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_entity *cfqe = &cfqg->cfqe;
+ struct cfq_group *__cfqg, *p_cfqg;
BUG_ON(cfqg->nr_cfqq < 1);
cfqg->nr_cfqq--;
+ /*
+ * Root group doesn't belong to any service tree
+ */
+ if (cfqg == &cfqd->root_group)
+ return;
+
/* If there are other cfq queues under this group, don't delete it */
if (cfqg->nr_cfqq)
return;
- cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
- cfq_entity_service_tree_del(st, cfqe);
- cfqg->saved_workload_slice = 0;
- cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+ /* If child group exists, don't dequeue it */
+ if (cfqg->nr_subgp)
+ return;
+
+ /*
+ * Dequeue this group and its ancestors from their service tree.
+ */
+ while (cfqe && cfqe->parent) {
+ __cfqg = cfqg_of_entity(cfqe);
+ p_cfqg = cfqg_of_entity(cfqe->parent);
+ cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
+ cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
+ cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
+ __cfqg->saved_workload_slice = 0;
+ cfqe = cfqe->parent;
+ p_cfqg->nr_subgp--;
+ if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+ return;
+ }
}
static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -989,7 +1029,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
struct cfq_queue *cfqq)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
unsigned int used_sl, charge;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count;
@@ -1003,10 +1042,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
charge = cfqq->allocated_slice;
- /* Can't update vdisktime while group is on service tree */
- __cfq_entity_service_tree_del(st, cfqe);
- cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
- __cfq_entity_service_tree_add(st, cfqe);
+ /*
+ * Update the vdisktime on the whole chain.
+ */
+ while (cfqe && cfqe->parent) {
+ struct cfq_rb_root *st = cfqe->service_tree;
+
+ /* Can't update vdisktime while group is on service tree */
+ __cfq_entity_service_tree_del(st, cfqe);
+ cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
+ __cfq_entity_service_tree_add(st, cfqe);
+ st->count++;
+ cfqe->reposition_time = jiffies;
+ cfqe = cfqe->parent;
+ }
+
/* This group is being expired. Save the context */
if (time_after(cfqd->workload_expires, jiffies)) {
@@ -1018,7 +1068,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
cfqg->saved_workload_slice = 0;
cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
- cfqe->vdisktime, st->min_vdisktime);
+ cfqg->cfqe.vdisktime,
+ cfqg->cfqe.service_tree->min_vdisktime);
cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
" sect=%u", used_sl, cfqq->slice_dispatch, charge,
iops_mode(cfqd), cfqq->nr_sectors);
@@ -1040,35 +1091,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
cfqg_of_blkg(blkg)->cfqe.weight = weight;
}
-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void init_cfqe(struct blkio_cgroup *blkcg,
+ struct cfq_group *cfqg)
+{
+ struct cfq_entity *cfqe = &cfqg->cfqe;
+
+ cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+ RB_CLEAR_NODE(&cfqe->rb_node);
+ cfqe->is_group_entity = true;
+ cfqe->parent = NULL;
+}
+
+static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
+ struct cfq_group *cfqg)
{
- struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
- struct cfq_group *cfqg = NULL;
- void *key = cfqd;
int i, j;
struct cfq_rb_root *st;
- struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
unsigned int major, minor;
-
- cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
- if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfqg->blkg.dev = MKDEV(major, minor);
- goto done;
- }
- if (cfqg || !create)
- goto done;
-
- cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
- if (!cfqg)
- goto done;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
- RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
-
- cfqg->cfqe.is_group_entity = true;
/*
* Take the initial reference that will be released on destroy
@@ -1078,24 +1121,119 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
*/
atomic_set(&cfqg->ref, 1);
+ /* Add group onto cgroup list */
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+ MKDEV(major, minor));
+ /* Initialize the group entity */
+ init_cfqe(blkcg, cfqg);
+ /* Add group on cfqd list */
+ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
+
+static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+ if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
+ cfq_destroy_cfqg(cfqd, cfqg);
+}
+
+static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
+ struct cfq_group *p_cfqg)
+{
+ struct cfq_entity *cfqe, *p_cfqe;
+
+ cfqe = &cfqg->cfqe;
+
+ p_cfqe = &p_cfqg->cfqe;
+
+ cfqe->parent = p_cfqe;
+
/*
- * Add group onto cgroup list. It might happen that bdi->dev is
- * not initiliazed yet. Initialize this new group without major
- * and minor info and this info will be filled in once a new thread
- * comes for IO. See code above.
+ * Currently, just put cfq group entity on "BE:SYNC" workload
+ * service tree.
*/
- if (bdi->dev) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
- MKDEV(major, minor));
- } else
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
- 0);
+ cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
+ SYNC_WORKLOAD);
+ /* child reference */
+ atomic_inc(&p_cfqg->ref);
+}
- cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
+{
+ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+ struct blkio_cgroup *p_blkcg;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+ struct cfq_group *cfqg, *p_cfqg;
+ void *key = cfqd;
+ int ret;
- /* Add group on cfqd list */
- hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg) {
+ if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ }
+ /* chain has already been built */
+ return 0;
+ }
+
+ cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
+ if (!cfqg)
+ return -1;
+
+ init_cfqg(cfqd, blkcg, cfqg);
+
+ /* Already at the top group */
+ if (!cgroup->parent)
+ return 0;
+
+ /* Allocate CFQ groups on the chain */
+ ret = cfqg_chain_alloc(cfqd, cgroup->parent);
+ if (ret == -1) {
+ uninit_cfqg(cfqd, cfqg);
+ return -1;
+ }
+
+ p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
+ p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
+ BUG_ON(p_cfqg == NULL);
+
+ cfqg_set_parent(cfqd, cfqg, p_cfqg);
+ return 0;
+}
+
+static struct cfq_group *
+cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+{
+ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+ struct cfq_group *cfqg = NULL;
+ void *key = cfqd;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+ int ret;
+
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ goto done;
+ }
+ if (cfqg || !create)
+ goto done;
+
+ /*
+ * For hierarchical cfq group scheduling, we need to allocate
+ * the whole cfq group chain.
+ */
+ ret = cfqg_chain_alloc(cfqd, cgroup);
+ if (!ret) {
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ BUG_ON(cfqg == NULL);
+ goto done;
+ }
done:
return cfqg;
@@ -1140,12 +1278,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
{
struct cfq_rb_root *st;
int i, j;
+ struct cfq_entity *cfqe;
+ struct cfq_group *p_cfqg;
BUG_ON(atomic_read(&cfqg->ref) <= 0);
if (!atomic_dec_and_test(&cfqg->ref))
return;
for_each_cfqg_st(cfqg, i, j, st)
BUG_ON(!RB_EMPTY_ROOT(&st->rb));
+
+ cfqe = &cfqg->cfqe;
+ if (cfqe->parent) {
+ p_cfqg = cfqg_of_entity(cfqe->parent);
+ /* Drop the reference taken by children */
+ atomic_dec(&p_cfqg->ref);
+ }
+
kfree(cfqg);
}
@@ -1358,8 +1506,6 @@ insert:
/* Add cfqq onto service tree. */
cfq_entity_service_tree_add(service_tree, cfqe);
- update_min_vdisktime(service_tree);
- cfqq->reposition_time = jiffies;
if ((add_front || !new_cfqq) && !group_changed)
return;
cfq_group_service_tree_add(cfqd, cfqq->cfqg);
@@ -1802,28 +1948,30 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
return cfqq_of_entity(cfq_rb_first(service_tree));
}
-static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
+static struct cfq_entity *
+cfq_get_next_entity_forced(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_group *cfqg;
- struct cfq_entity *cfqe;
+ struct cfq_entity *entity;
int i, j;
struct cfq_rb_root *st;
if (!cfqd->rq_queued)
return NULL;
- cfqg = cfq_get_next_cfqg(cfqd);
- if (!cfqg)
- return NULL;
-
for_each_cfqg_st(cfqg, i, j, st) {
- cfqe = cfq_rb_first(st);
- if (cfqe != NULL)
- return cfqq_of_entity(cfqe);
+ entity = cfq_rb_first(st);
+
+ if (entity && !entity->is_group_entity)
+ return entity;
+ else if (entity && entity->is_group_entity) {
+ cfqg = cfqg_of_entity(entity);
+ return cfq_get_next_entity_forced(cfqd, cfqg);
+ }
}
return NULL;
}
+
/*
* Get and set a new active queue for service.
*/
@@ -2179,7 +2327,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
struct cfq_group *cfqg, enum wl_prio_t prio)
{
struct cfq_entity *cfqe;
- struct cfq_queue *cfqq;
unsigned long lowest_start_time;
int i;
bool time_valid = false;
@@ -2191,10 +2338,9 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
*/
for (i = 0; i <= SYNC_WORKLOAD; ++i) {
cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
- cfqq = cfqq_of_entity(cfqe);
if (cfqe && (!time_valid ||
- cfqq->reposition_time < lowest_start_time)) {
- lowest_start_time = cfqq->reposition_time;
+ cfqe->reposition_time < lowest_start_time)) {
+ lowest_start_time = cfqe->reposition_time;
cur_best = i;
time_valid = true;
}
@@ -2203,47 +2349,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
return cur_best;
}
-static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
+static void set_workload_expire(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
unsigned slice;
unsigned count;
struct cfq_rb_root *st;
unsigned group_slice;
- if (!cfqg) {
- cfqd->serving_prio = IDLE_WORKLOAD;
- cfqd->workload_expires = jiffies + 1;
- return;
- }
-
- /* Choose next priority. RT > BE > IDLE */
- if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
- cfqd->serving_prio = RT_WORKLOAD;
- else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
- cfqd->serving_prio = BE_WORKLOAD;
- else {
- cfqd->serving_prio = IDLE_WORKLOAD;
- cfqd->workload_expires = jiffies + 1;
- return;
- }
-
- /*
- * For RT and BE, we have to choose also the type
- * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
- * expiration time
- */
- st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
- count = st->count;
-
- /*
- * check workload expiration, and that we still have other queues ready
- */
- if (count && !time_after(jiffies, cfqd->workload_expires))
- return;
-
- /* otherwise select new workload type */
- cfqd->serving_type =
- cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
count = st->count;
@@ -2284,26 +2396,51 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
cfqd->workload_expires = jiffies + slice;
}
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
+static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
- struct cfq_group *cfqg;
- struct cfq_entity *cfqe;
+ struct cfq_rb_root *st;
+ unsigned count;
- if (RB_EMPTY_ROOT(&st->rb))
- return NULL;
- cfqe = cfq_rb_first_entity(st);
- cfqg = cfqg_of_entity(cfqe);
- BUG_ON(!cfqg);
- update_min_vdisktime(st);
- return cfqg;
+ if (!cfqg) {
+ cfqd->serving_prio = IDLE_WORKLOAD;
+ cfqd->workload_expires = jiffies + 1;
+ return;
+ }
+
+ /* Choose next priority. RT > BE > IDLE */
+ if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
+ cfqd->serving_prio = RT_WORKLOAD;
+ else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
+ cfqd->serving_prio = BE_WORKLOAD;
+ else {
+ cfqd->serving_prio = IDLE_WORKLOAD;
+ cfqd->workload_expires = jiffies + 1;
+ return;
+ }
+
+ /*
+ * For RT and BE, we have to choose also the type
+ * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
+ * expiration time
+ */
+ st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
+ count = st->count;
+
+ /*
+ * check workload expiration, and that we still have other queues ready
+ */
+ if (count && !time_after(jiffies, cfqd->workload_expires))
+ return;
+
+ /* otherwise select new workload type */
+ cfqd->serving_type =
+ cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
}
-static void cfq_choose_cfqg(struct cfq_data *cfqd)
+struct cfq_entity *choose_serving_entity(struct cfq_data *cfqd,
+ struct cfq_group *cfqg)
{
- struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
-
- cfqd->serving_group = cfqg;
+ struct cfq_rb_root *service_tree;
/* Restore the workload type data */
if (cfqg->saved_workload_slice) {
@@ -2314,8 +2451,21 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
cfqd->workload_expires = jiffies - 1;
choose_service_tree(cfqd, cfqg);
-}
+ service_tree = service_tree_for(cfqg, cfqd->serving_prio,
+ cfqd->serving_type);
+
+ if (!cfqd->rq_queued)
+ return NULL;
+
+ /* There is nothing to dispatch */
+ if (!service_tree)
+ return NULL;
+ if (RB_EMPTY_ROOT(&service_tree->rb))
+ return NULL;
+
+ return cfq_rb_first(service_tree);
+}
/*
* Select a queue for service. If we have a current active queue,
* check whether to continue servicing it, or retrieve and set a new one.
@@ -2323,6 +2473,8 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq, *new_cfqq = NULL;
+ struct cfq_group *cfqg;
+ struct cfq_entity *entity;
cfqq = cfqd->active_queue;
if (!cfqq)
@@ -2422,8 +2574,23 @@ new_queue:
* Current queue expired. Check if we have to switch to a new
* service tree
*/
- if (!new_cfqq)
- cfq_choose_cfqg(cfqd);
+ cfqg = &cfqd->root_group;
+
+ if (!new_cfqq) {
+ do {
+ entity = choose_serving_entity(cfqd, cfqg);
+ if (entity && !entity->is_group_entity) {
+ /* This is the CFQ queue that should run */
+ new_cfqq = cfqq_of_entity(entity);
+ cfqd->serving_group = cfqg;
+ set_workload_expire(cfqd, cfqg);
+ break;
+ } else if (entity && entity->is_group_entity) {
+ /* Continue to lookup in this CFQ group */
+ cfqg = cfqg_of_entity(entity);
+ }
+ } while (entity && entity->is_group_entity);
+ }
cfqq = cfq_set_active_queue(cfqd, new_cfqq);
keep_queue:
@@ -2454,10 +2621,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq;
int dispatched = 0;
+ struct cfq_entity *entity;
+ struct cfq_group *root = &cfqd->root_group;
/* Expire the timeslice of the current active queue first */
cfq_slice_expired(cfqd, 0);
- while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
+ while ((entity = cfq_get_next_entity_forced(cfqd, root)) != NULL) {
+ BUG_ON(entity->is_group_entity);
+ cfqq = cfqq_of_entity(entity);
__cfq_set_active_queue(cfqd, cfqq);
dispatched += __cfq_forced_dispatch_cfqq(cfqq);
}
@@ -3991,9 +4162,6 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->cic_index = i;
- /* Init root service tree */
- cfqd->grp_service_tree = CFQ_RB_ROOT;
-
/* Init root group */
cfqg = &cfqd->root_group;
for_each_cfqg_st(cfqg, i, j, st)
@@ -4003,6 +4171,7 @@ static void *cfq_init_queue(struct request_queue *q)
/* Give preference to root group over other groups */
cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
cfqg->cfqe.is_group_entity = true;
+ cfqg->cfqe.parent = NULL;
#ifdef CONFIG_CFQ_GROUP_IOSCHED
/*
--
1.6.5.2
^ permalink raw reply related [flat|nested] 41+ messages in thread* Re: [PATCH 5/8 v2] cfq-iosched: Introduce hierarchical scheduling with CFQ queue and group at the same level
2010-12-13 1:45 ` [PATCH 5/8 v2] cfq-iosched: Introduce hierarchical scheduling with CFQ queue and group at the same level Gui Jianfeng
@ 2010-12-14 3:49 ` Vivek Goyal
2010-12-14 6:09 ` Gui Jianfeng
2010-12-15 7:02 ` Gui Jianfeng
0 siblings, 2 replies; 41+ messages in thread
From: Vivek Goyal @ 2010-12-14 3:49 UTC (permalink / raw)
To: Gui Jianfeng
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
On Mon, Dec 13, 2010 at 09:45:01AM +0800, Gui Jianfeng wrote:
> This patch makes CFQ queues and CFQ groups schedule at the same level.
> Consider the following hierarchy:
>
> Root
> / | \
> q1 q2 G1
> / \
> q3 G2
>
> q1, q2 and q3 are CFQ queues; G1 and G2 are CFQ groups. With this patch, q1,
> q2 and G1 are scheduled on the same service tree in the Root CFQ group. q3 and
> G2 are scheduled under G1. Note, for the time being, a CFQ group is treated
> as a "BE and SYNC" workload, and is put on the "BE and SYNC" service tree. That
> means service differentiation only happens on the "BE and SYNC" service tree.
> Later, we may introduce "IO Class" for CFQ group.
>
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> ---
> block/cfq-iosched.c | 473 ++++++++++++++++++++++++++++++++++----------------
> 1 files changed, 321 insertions(+), 152 deletions(-)
>
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 6486956..d90627e 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -105,6 +105,9 @@ struct cfq_entity {
> u64 vdisktime;
> bool is_group_entity;
> unsigned int weight;
> + struct cfq_entity *parent;
> + /* Reposition time */
> + unsigned long reposition_time;
> };
>
> /*
> @@ -113,8 +116,6 @@ struct cfq_entity {
> struct cfq_queue {
> /* The schedule entity */
> struct cfq_entity cfqe;
> - /* Reposition time */
> - unsigned long reposition_time;
> /* reference count */
> atomic_t ref;
> /* various state flags, see below */
> @@ -194,6 +195,9 @@ struct cfq_group {
> /* number of cfqq currently on this group */
> int nr_cfqq;
>
> + /* number of sub cfq groups */
> + int nr_subgp;
> +
Do you really have to maintain separate counts for child queues and
child groups? Would a common count, something like nr_children, not be
sufficient?
> /*
> * Per group busy queues average. Useful for workload slice calc. We
> * create the array for each prio class but at run time it is used
> @@ -229,8 +233,6 @@ struct cfq_group {
> */
> struct cfq_data {
> struct request_queue *queue;
> - /* Root service tree for cfq_groups */
> - struct cfq_rb_root grp_service_tree;
I see that you are removing this service tree here and then adding it
back in patch 7. I think it is confusing. In fact, the title of patch 7
is "add flat mode", though flat mode is already supported and we are
just adding hierarchical mode on top of it. I think this is just a
matter of better naming and patch organization so that it is clearer.
> struct cfq_group root_group;
>
> /*
> @@ -347,8 +349,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
> return NULL;
> }
>
> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
> -
> static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
> enum wl_prio_t prio,
> enum wl_type_t type)
> @@ -638,10 +638,15 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
> static inline unsigned
> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> struct cfq_entity *cfqe = &cfqg->cfqe;
> + struct cfq_rb_root *st = cfqe->service_tree;
>
> - return cfq_target_latency * cfqe->weight / st->total_weight;
> + if (st)
> + return cfq_target_latency * cfqe->weight
> + / st->total_weight;
Is this still true in hierarchical mode? Previously, groups used to be
at the top and there used to be only one service tree for groups, so
st->total_weight represented the total weight in the system.
Now with hierarchy this will not/should not be true. So the group slice
calculation should be different?
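Something along these lines, perhaps (only a sketch; cfq_group_slice_hier()
is a made-up name, and it relies on the cfqe->parent chain this patch
introduces, with the root entity's service_tree left NULL):

static inline unsigned
cfq_group_slice_hier(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
        struct cfq_entity *cfqe = &cfqg->cfqe;
        unsigned slice = cfq_target_latency;

        /* Scale the slice by this entity's share of each level's weight */
        while (cfqe && cfqe->service_tree) {
                slice = slice * cfqe->weight /
                        cfqe->service_tree->total_weight;
                cfqe = cfqe->parent;
        }

        return slice;
}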
> + else
> + /* If this is the root group, give it a full slice. */
> + return cfq_target_latency;
> }
>
> static inline void
> @@ -804,17 +809,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
> return NULL;
> }
>
> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
> -{
> - if (!root->left)
> - root->left = rb_first(&root->rb);
> -
> - if (root->left)
> - return rb_entry_entity(root->left);
> -
> - return NULL;
> -}
> -
> static void rb_erase_init(struct rb_node *n, struct rb_root *root)
> {
> rb_erase(n, root);
> @@ -888,12 +882,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>
> rb_link_node(&cfqe->rb_node, parent, node);
> rb_insert_color(&cfqe->rb_node, &st->rb);
> +
> + update_min_vdisktime(st);
> }
>
> static void
> cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> {
> __cfq_entity_service_tree_add(st, cfqe);
> + cfqe->reposition_time = jiffies;
> st->count++;
> st->total_weight += cfqe->weight;
> }
> @@ -901,34 +898,57 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> static void
> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> struct cfq_entity *cfqe = &cfqg->cfqe;
> - struct cfq_entity *__cfqe;
> struct rb_node *n;
> + struct cfq_entity *entity;
> + struct cfq_rb_root *st;
> + struct cfq_group *__cfqg;
>
> cfqg->nr_cfqq++;
> +
> + /*
> + * Root group doesn't belong to any service tree
> + */
> + if (cfqg == &cfqd->root_group)
> + return;
Can we keep the root group on cfqd->grp_service_tree? In hierarchical mode
there will be only one group on the grp service tree, and in flat mode there
can be many.
> +
> if (!RB_EMPTY_NODE(&cfqe->rb_node))
> return;
>
> /*
> - * Currently put the group at the end. Later implement something
> - * so that groups get lesser vtime based on their weights, so that
> - * if group does not loose all if it was not continously backlogged.
> + * Enqueue this group and its ancestors onto their service tree.
> */
> - n = rb_last(&st->rb);
> - if (n) {
> - __cfqe = rb_entry_entity(n);
> - cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> - } else
> - cfqe->vdisktime = st->min_vdisktime;
> + while (cfqe && cfqe->parent) {
> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
> + return;
> +
> + /*
> + * Currently put the group at the end. Later implement
> + * something so that groups get lesser vtime based on their
> + * weights, so that a group does not lose everything if it
> + * was not continuously backlogged.
> + */
> + st = cfqe->service_tree;
> + n = rb_last(&st->rb);
> + if (n) {
> + entity = rb_entry_entity(n);
> + cfqe->vdisktime = entity->vdisktime +
> + CFQ_IDLE_DELAY;
> + } else
> + cfqe->vdisktime = st->min_vdisktime;
>
> - cfq_entity_service_tree_add(st, cfqe);
> + cfq_entity_service_tree_add(st, cfqe);
> + cfqe = cfqe->parent;
> + __cfqg = cfqg_of_entity(cfqe);
> + __cfqg->nr_subgp++;
> + }
> }
>
> static void
> __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> {
> cfq_rb_erase(&cfqe->rb_node, st);
> + update_min_vdisktime(st);
> }
>
> static void
> @@ -937,27 +957,47 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> __cfq_entity_service_tree_del(st, cfqe);
> st->total_weight -= cfqe->weight;
> - cfqe->service_tree = NULL;
> }
> }
>
> static void
> cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> struct cfq_entity *cfqe = &cfqg->cfqe;
> + struct cfq_group *__cfqg, *p_cfqg;
>
> BUG_ON(cfqg->nr_cfqq < 1);
> cfqg->nr_cfqq--;
>
> + /*
> + * Root group doesn't belong to any service tree
> + */
> + if (cfqg == &cfqd->root_group)
> + return;
> +
> /* If there are other cfq queues under this group, don't delete it */
> if (cfqg->nr_cfqq)
> return;
>
> - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
> - cfq_entity_service_tree_del(st, cfqe);
> - cfqg->saved_workload_slice = 0;
> - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
> + /* If child group exists, don't dequeue it */
> + if (cfqg->nr_subgp)
> + return;
> +
> + /*
> + * Dequeue this group and its ancestors from their service tree.
> + */
> + while (cfqe && cfqe->parent) {
> + __cfqg = cfqg_of_entity(cfqe);
> + p_cfqg = cfqg_of_entity(cfqe->parent);
> + cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
> + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
> + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
> + __cfqg->saved_workload_slice = 0;
> + cfqe = cfqe->parent;
> + p_cfqg->nr_subgp--;
> + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
> + return;
> + }
> }
I think once you merge the queue/group algorithms, you can use the same
functions for adding/deleting queue/group entities and don't have to use
separate functions for groups?
[..]
> - cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
> +{
> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> + struct blkio_cgroup *p_blkcg;
> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> + unsigned int major, minor;
> + struct cfq_group *cfqg, *p_cfqg;
> + void *key = cfqd;
> + int ret;
>
> - /* Add group on cfqd list */
> - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> + if (cfqg) {
> + if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> + cfqg->blkg.dev = MKDEV(major, minor);
> + }
> + /* chain has already been built */
> + return 0;
> + }
> +
> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
> + if (!cfqg)
> + return -1;
> +
> + init_cfqg(cfqd, blkcg, cfqg);
> +
> + /* Already at the top group */
> + if (!cgroup->parent)
> + return 0;
> +
> + /* Allocate CFQ groups on the chain */
> + ret = cfqg_chain_alloc(cfqd, cgroup->parent);
Can you avoid recursion and use for/while loops to initialize the
chain? We don't want to push multiple stack frames in case of a deep hierarchy.
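Maybe something like this (only a sketch; cfqg_lookup() and
cfqg_alloc_and_link() are made-up helpers standing in for the
blkiocg_lookup_group() call and the allocate + set-parent steps above):

        /* Allocate the topmost missing group until the chain is complete */
        while (!cfqg_lookup(cfqd, cgroup)) {
                struct cgroup *pos = cgroup;

                /* Find the highest ancestor that has no cfq_group yet */
                while (pos->parent && !cfqg_lookup(cfqd, pos->parent))
                        pos = pos->parent;

                if (cfqg_alloc_and_link(cfqd, pos))
                        return -1;
        }
        return 0;

This is O(depth^2) in lookups, but the stack usage stays constant no
matter how deep the hierarchy is.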
> + if (ret == -1) {
> + uninit_cfqg(cfqd, cfqg);
> + return -1;
> + }
> +
> + p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
> + p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
> + BUG_ON(p_cfqg == NULL);
> +
> + cfqg_set_parent(cfqd, cfqg, p_cfqg);
> + return 0;
> +}
> +
> +static struct cfq_group *
> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> +{
> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> + struct cfq_group *cfqg = NULL;
> + void *key = cfqd;
> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> + unsigned int major, minor;
> + int ret;
> +
> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> + cfqg->blkg.dev = MKDEV(major, minor);
> + goto done;
> + }
> + if (cfqg || !create)
> + goto done;
> +
> + /*
> + * For hierarchical cfq group scheduling, we need to allocate
> + * the whole cfq group chain.
> + */
> + ret = cfqg_chain_alloc(cfqd, cgroup);
> + if (!ret) {
> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> + BUG_ON(cfqg == NULL);
> + goto done;
> + }
>
> done:
> return cfqg;
> @@ -1140,12 +1278,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
> {
> struct cfq_rb_root *st;
> int i, j;
> + struct cfq_entity *cfqe;
> + struct cfq_group *p_cfqg;
>
> BUG_ON(atomic_read(&cfqg->ref) <= 0);
> if (!atomic_dec_and_test(&cfqg->ref))
> return;
> for_each_cfqg_st(cfqg, i, j, st)
> BUG_ON(!RB_EMPTY_ROOT(&st->rb));
> +
> + cfqe = &cfqg->cfqe;
> + if (cfqe->parent) {
> + p_cfqg = cfqg_of_entity(cfqe->parent);
> + /* Drop the reference taken by children */
> + atomic_dec(&p_cfqg->ref);
> + }
> +
Is this the right way to free up the whole parent chain? Just think of
a hierarchy test1->test2->test3 where somebody drops the reference to test3
and test1 and test2 don't have any other children. In that case, after
freeing up test3, we should be freeing up test2 and test1 also.
I was thinking that we can achieve this by freeing up the groups in a
loop:

do {
        cfqe = cfqg->cfqe.parent;
        if (!atomic_dec_and_test(&cfqg->ref))
                return;
        kfree(cfqg);
        cfqg = cfqg_of_entity(cfqe);
} while (cfqg);
> kfree(cfqg);
> }
>
> @@ -1358,8 +1506,6 @@ insert:
> /* Add cfqq onto service tree. */
> cfq_entity_service_tree_add(service_tree, cfqe);
>
> - update_min_vdisktime(service_tree);
> - cfqq->reposition_time = jiffies;
> if ((add_front || !new_cfqq) && !group_changed)
> return;
> cfq_group_service_tree_add(cfqd, cfqq->cfqg);
> @@ -1802,28 +1948,30 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
> return cfqq_of_entity(cfq_rb_first(service_tree));
> }
>
> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
> +static struct cfq_entity *
> +cfq_get_next_entity_forced(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_group *cfqg;
> - struct cfq_entity *cfqe;
> + struct cfq_entity *entity;
> int i, j;
> struct cfq_rb_root *st;
>
> if (!cfqd->rq_queued)
> return NULL;
>
> - cfqg = cfq_get_next_cfqg(cfqd);
> - if (!cfqg)
> - return NULL;
> -
> for_each_cfqg_st(cfqg, i, j, st) {
> - cfqe = cfq_rb_first(st);
> - if (cfqe != NULL)
> - return cfqq_of_entity(cfqe);
> + entity = cfq_rb_first(st);
> +
> + if (entity && !entity->is_group_entity)
> + return entity;
> + else if (entity && entity->is_group_entity) {
> + cfqg = cfqg_of_entity(entity);
> + return cfq_get_next_entity_forced(cfqd, cfqg);
> + }
> }
> return NULL;
> }
Can the above be simplified by just taking cfqd as a parameter? It will work
both for hierarchical and flat modes. I wanted to avoid recursion, as somebody
can create a deep cgroup hierarchy and push lots of frames on the stack.
struct cfq_entity *cfq_get_next_entity_forced(struct cfq_data *cfqd)
{
        struct cfq_rb_root *st = &cfqd->grp_service_tree;
        struct cfq_entity *cfqe;

        do {
                cfqe = cfq_rb_first(st);
                if (is_cfqe_cfqq(cfqe))
                        return cfqe;
                st = choose_service_tree_forced(cfqg_of_entity(cfqe));
        } while (st);

        return NULL;
}
And choose_service_tree_forced() can be something like.
choose_service_tree_forced(cfqg) {
        for_each_cfqg_st(cfqg, i, j, st) {
                if (st->count != 0)
                        return st;
        }
        return NULL;
}
>
> +
> /*
> * Get and set a new active queue for service.
> */
> @@ -2179,7 +2327,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> struct cfq_group *cfqg, enum wl_prio_t prio)
> {
> struct cfq_entity *cfqe;
> - struct cfq_queue *cfqq;
> unsigned long lowest_start_time;
> int i;
> bool time_valid = false;
> @@ -2191,10 +2338,9 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> */
> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
> - cfqq = cfqq_of_entity(cfqe);
> if (cfqe && (!time_valid ||
> - cfqq->reposition_time < lowest_start_time)) {
> - lowest_start_time = cfqq->reposition_time;
> + cfqe->reposition_time < lowest_start_time)) {
> + lowest_start_time = cfqe->reposition_time;
> cur_best = i;
> time_valid = true;
> }
> @@ -2203,47 +2349,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> return cur_best;
> }
>
> -static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
> +static void set_workload_expire(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> unsigned slice;
> unsigned count;
> struct cfq_rb_root *st;
> unsigned group_slice;
>
> - if (!cfqg) {
> - cfqd->serving_prio = IDLE_WORKLOAD;
> - cfqd->workload_expires = jiffies + 1;
> - return;
> - }
> -
> - /* Choose next priority. RT > BE > IDLE */
> - if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
> - cfqd->serving_prio = RT_WORKLOAD;
> - else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
> - cfqd->serving_prio = BE_WORKLOAD;
> - else {
> - cfqd->serving_prio = IDLE_WORKLOAD;
> - cfqd->workload_expires = jiffies + 1;
> - return;
> - }
> -
> - /*
> - * For RT and BE, we have to choose also the type
> - * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
> - * expiration time
> - */
> - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
> - count = st->count;
> -
> - /*
> - * check workload expiration, and that we still have other queues ready
> - */
> - if (count && !time_after(jiffies, cfqd->workload_expires))
> - return;
> -
> - /* otherwise select new workload type */
> - cfqd->serving_type =
> - cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
> st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
> count = st->count;
>
> @@ -2284,26 +2396,51 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
> cfqd->workload_expires = jiffies + slice;
> }
>
> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
> +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> - struct cfq_group *cfqg;
> - struct cfq_entity *cfqe;
> + struct cfq_rb_root *st;
> + unsigned count;
>
> - if (RB_EMPTY_ROOT(&st->rb))
> - return NULL;
> - cfqe = cfq_rb_first_entity(st);
> - cfqg = cfqg_of_entity(cfqe);
> - BUG_ON(!cfqg);
> - update_min_vdisktime(st);
> - return cfqg;
> + if (!cfqg) {
> + cfqd->serving_prio = IDLE_WORKLOAD;
> + cfqd->workload_expires = jiffies + 1;
> + return;
> + }
I am wondering where we use the above code. Do we ever call
choose_service_tree() with cfqg == NULL? I can't seem to figure it out by
looking at the code.
> +
> + /* Choose next priority. RT > BE > IDLE */
> + if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
> + cfqd->serving_prio = RT_WORKLOAD;
> + else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
> + cfqd->serving_prio = BE_WORKLOAD;
> + else {
> + cfqd->serving_prio = IDLE_WORKLOAD;
> + cfqd->workload_expires = jiffies + 1;
> + return;
> + }
> +
> + /*
> + * For RT and BE, we have to choose also the type
> + * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
> + * expiration time
> + */
> + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
> + count = st->count;
> +
> + /*
> + * check workload expiration, and that we still have other queues ready
> + */
> + if (count && !time_after(jiffies, cfqd->workload_expires))
> + return;
> +
> + /* otherwise select new workload type */
> + cfqd->serving_type =
> + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
> }
>
> -static void cfq_choose_cfqg(struct cfq_data *cfqd)
> +struct cfq_entity *choose_serving_entity(struct cfq_data *cfqd,
> + struct cfq_group *cfqg)
I think for this function you don't have to pass cfqg as a parameter. You
can just use cfqd as the parameter and then take all decisions based on the
service tree.
So at the top we can continue to have grp_service_tree in cfqd. In
hierarchical mode it will only have the root group queued there, and in
flat mode it can have multiple groups queued.
Also, I am looking forward to simplifying and organizing the CFQ code a
little better so that it is easy to read. Can the choose_serving_entity()
function be organized something like the following? This should work for
both flat and hierarchical modes. The following is only pseudo code.
struct cfq_entity *select_entity(struct cfq_data *cfqd)
{
        struct cfq_rb_root *st = &cfqd->grp_service_tree;
        struct cfq_entity *cfqe;

        do {
                cfqe = cfq_rb_first(st);
                if (is_cfqe_cfqq(cfqe))
                        /* We found the next queue to dispatch from */
                        break;
                else
                        st = choose_service_tree();
        } while (st);

        return cfqe;
}
> {
> - struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
> -
> - cfqd->serving_group = cfqg;
> + struct cfq_rb_root *service_tree;
>
> /* Restore the workload type data */
> if (cfqg->saved_workload_slice) {
> @@ -2314,8 +2451,21 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
> cfqd->workload_expires = jiffies - 1;
>
> choose_service_tree(cfqd, cfqg);
> -}
>
> + service_tree = service_tree_for(cfqg, cfqd->serving_prio,
> + cfqd->serving_type);
> +
> + if (!cfqd->rq_queued)
> + return NULL;
> +
> + /* There is nothing to dispatch */
> + if (!service_tree)
> + return NULL;
> + if (RB_EMPTY_ROOT(&service_tree->rb))
> + return NULL;
> +
> + return cfq_rb_first(service_tree);
> +}
> /*
> * Select a queue for service. If we have a current active queue,
> * check whether to continue servicing it, or retrieve and set a new one.
> @@ -2323,6 +2473,8 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
> static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
> {
> struct cfq_queue *cfqq, *new_cfqq = NULL;
> + struct cfq_group *cfqg;
> + struct cfq_entity *entity;
>
> cfqq = cfqd->active_queue;
> if (!cfqq)
> @@ -2422,8 +2574,23 @@ new_queue:
> * Current queue expired. Check if we have to switch to a new
> * service tree
> */
> - if (!new_cfqq)
> - cfq_choose_cfqg(cfqd);
> + cfqg = &cfqd->root_group;
> +
> + if (!new_cfqq) {
> + do {
> + entity = choose_serving_entity(cfqd, cfqg);
> + if (entity && !entity->is_group_entity) {
> + /* This is the CFQ queue that should run */
> + new_cfqq = cfqq_of_entity(entity);
> + cfqd->serving_group = cfqg;
> + set_workload_expire(cfqd, cfqg);
> + break;
> + } else if (entity && entity->is_group_entity) {
> + /* Continue to lookup in this CFQ group */
> + cfqg = cfqg_of_entity(entity);
> + }
> + } while (entity && entity->is_group_entity);
I think you should move the above logic into a separate function, otherwise
select_queue() is becoming complicated.
Secondly, for traversing the hierarchy you can introduce macros like
for_each_entity() or for_each_cfqe(); see the sketch below.
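(A sketch only; it assumes the parent pointer chain terminates at NULL
at the root entity.)

#define for_each_cfqe(cfqe) \
        for (; (cfqe) != NULL; (cfqe) = (cfqe)->parent)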
Thirdly, I would again say that flat mode is already supported. Build on
top of it instead of first getting rid of it and then adding it back
with the help of a separate patch. If it is too complicated, then let
it be a single patch instead of separating it out into two patches.
> + }
>
> cfqq = cfq_set_active_queue(cfqd, new_cfqq);
> keep_queue:
> @@ -2454,10 +2621,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
> {
> struct cfq_queue *cfqq;
> int dispatched = 0;
> + struct cfq_entity *entity;
> + struct cfq_group *root = &cfqd->root_group;
>
> /* Expire the timeslice of the current active queue first */
> cfq_slice_expired(cfqd, 0);
> - while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
> + while ((entity = cfq_get_next_entity_forced(cfqd, root)) != NULL) {
> + BUG_ON(entity->is_group_entity);
> + cfqq = cfqq_of_entity(entity);
> __cfq_set_active_queue(cfqd, cfqq);
> dispatched += __cfq_forced_dispatch_cfqq(cfqq);
> }
> @@ -3991,9 +4162,6 @@ static void *cfq_init_queue(struct request_queue *q)
>
> cfqd->cic_index = i;
>
> - /* Init root service tree */
> - cfqd->grp_service_tree = CFQ_RB_ROOT;
> -
> /* Init root group */
> cfqg = &cfqd->root_group;
> for_each_cfqg_st(cfqg, i, j, st)
> @@ -4003,6 +4171,7 @@ static void *cfq_init_queue(struct request_queue *q)
> /* Give preference to root group over other groups */
> cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
> cfqg->cfqe.is_group_entity = true;
> + cfqg->cfqe.parent = NULL;
>
> #ifdef CONFIG_CFQ_GROUP_IOSCHED
> /*
> --
> 1.6.5.2
>
>
^ permalink raw reply [flat|nested] 41+ messages in thread* Re: [PATCH 5/8 v2] cfq-iosched: Introduce hierarchical scheduling with CFQ queue and group at the same level
2010-12-14 3:49 ` Vivek Goyal
@ 2010-12-14 6:09 ` Gui Jianfeng
2010-12-15 7:02 ` Gui Jianfeng
1 sibling, 0 replies; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-14 6:09 UTC (permalink / raw)
To: Vivek Goyal
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Vivek Goyal wrote:
...
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
>> +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> - struct cfq_group *cfqg;
>> - struct cfq_entity *cfqe;
>> + struct cfq_rb_root *st;
>> + unsigned count;
>>
>> - if (RB_EMPTY_ROOT(&st->rb))
>> - return NULL;
>> - cfqe = cfq_rb_first_entity(st);
>> - cfqg = cfqg_of_entity(cfqe);
>> - BUG_ON(!cfqg);
>> - update_min_vdisktime(st);
>> - return cfqg;
>> + if (!cfqg) {
>> + cfqd->serving_prio = IDLE_WORKLOAD;
>> + cfqd->workload_expires = jiffies + 1;
>> + return;
>> + }
>
> I am wondering where we use the above code. Do we ever call
> choose_service_tree() with cfqg == NULL? I can't seem to figure it out by
> looking at the code.
>
Vivek,
This piece of code comes from the original CFQ code. Thinking more about it,
this piece of code seems redundant. When cfq_choose_cfqg() is called in
select_queue(), there must be at least one backlogged CFQ queue waiting for
dispatch, hence there must be at least one backlogged CFQ group on the
service tree. So we never call choose_service_tree() with cfqg == NULL.
I'd like to post a separate patch to get rid of this piece.
Thanks,
Gui
^ permalink raw reply [flat|nested] 41+ messages in thread* Re: [PATCH 5/8 v2] cfq-iosched: Introduce hierarchical scheduling with CFQ queue and group at the same level
2010-12-14 3:49 ` Vivek Goyal
2010-12-14 6:09 ` Gui Jianfeng
@ 2010-12-15 7:02 ` Gui Jianfeng
2010-12-15 22:04 ` Vivek Goyal
1 sibling, 1 reply; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-15 7:02 UTC (permalink / raw)
To: Vivek Goyal
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Vivek Goyal wrote:
> On Mon, Dec 13, 2010 at 09:45:01AM +0800, Gui Jianfeng wrote:
>> This patch makes CFQ queues and CFQ groups schedule at the same level.
>> Consider the following hierarchy:
>>
>> Root
>> / | \
>> q1 q2 G1
>> / \
>> q3 G2
>>
>> q1, q2 and q3 are CFQ queues; G1 and G2 are CFQ groups. With this patch, q1,
>> q2 and G1 are scheduled on the same service tree in the Root CFQ group. q3 and
>> G2 are scheduled under G1. Note, for the time being, a CFQ group is treated
>> as a "BE and SYNC" workload, and is put on the "BE and SYNC" service tree. That
>> means service differentiation only happens on the "BE and SYNC" service tree.
>> Later, we may introduce "IO Class" for CFQ group.
>>
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>> ---
>> block/cfq-iosched.c | 473 ++++++++++++++++++++++++++++++++++----------------
>> 1 files changed, 321 insertions(+), 152 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index 6486956..d90627e 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -105,6 +105,9 @@ struct cfq_entity {
>> u64 vdisktime;
>> bool is_group_entity;
>> unsigned int weight;
>> + struct cfq_entity *parent;
>> + /* Reposition time */
>> + unsigned long reposition_time;
>> };
>>
>> /*
>> @@ -113,8 +116,6 @@ struct cfq_entity {
>> struct cfq_queue {
>> /* The schedule entity */
>> struct cfq_entity cfqe;
>> - /* Reposition time */
>> - unsigned long reposition_time;
>> /* reference count */
>> atomic_t ref;
>> /* various state flags, see below */
>> @@ -194,6 +195,9 @@ struct cfq_group {
>> /* number of cfqq currently on this group */
>> int nr_cfqq;
>>
>> + /* number of sub cfq groups */
>> + int nr_subgp;
>> +
>
> Do you really have to maintain separate counts for child queues and
> child groups? Would a common count, something like nr_children, not be
> sufficient?
Currently, nr_subgp is only effective in hierarchical mode, but nr_cfqq works
for both hierarchical and flat modes. So for the time being, we need separate
counts for child queues and groups.
>
>> /*
>> * Per group busy queues average. Useful for workload slice calc. We
>> * create the array for each prio class but at run time it is used
>> @@ -229,8 +233,6 @@ struct cfq_group {
>> */
>> struct cfq_data {
>> struct request_queue *queue;
>> - /* Root service tree for cfq_groups */
>> - struct cfq_rb_root grp_service_tree;
>
> I see that you are removing this service tree here and then adding it
> back in patch 7. I think it is confusing. In fact, the title of patch 7
> is "add flat mode", though flat mode is already supported and we are
> just adding hierarchical mode on top of it. I think this is just a
> matter of better naming and patch organization so that it is clearer.
OK, will merge the hierarchical and flat patches together.
>
>> struct cfq_group root_group;
>>
>> /*
>> @@ -347,8 +349,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
>> return NULL;
>> }
>>
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>> -
>> static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>> enum wl_prio_t prio,
>> enum wl_type_t type)
>> @@ -638,10 +638,15 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>> static inline unsigned
>> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> + struct cfq_rb_root *st = cfqe->service_tree;
>>
>> - return cfq_target_latency * cfqe->weight / st->total_weight;
>> + if (st)
>> + return cfq_target_latency * cfqe->weight
>> + / st->total_weight;
>
> Is this still true in hierarchical mode? Previously, groups used to be
> at the top and there used to be only one service tree for groups, so
> st->total_weight represented the total weight in the system.
>
> Now with hierarchy this will not/should not be true. So the group slice
> calculation should be different?
I just kept the original group slice calculation here. I was thinking that
calculating the group slice in a hierarchical way might produce a really small
group slice, and I am not sure how that would work. So I just kept the original
calculation. Any thoughts?
>
>> + else
>> + /* If this is the root group, give it a full slice. */
>> + return cfq_target_latency;
>> }
>>
>> static inline void
>> @@ -804,17 +809,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
>> return NULL;
>> }
>>
>> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>> -{
>> - if (!root->left)
>> - root->left = rb_first(&root->rb);
>> -
>> - if (root->left)
>> - return rb_entry_entity(root->left);
>> -
>> - return NULL;
>> -}
>> -
>> static void rb_erase_init(struct rb_node *n, struct rb_root *root)
>> {
>> rb_erase(n, root);
>> @@ -888,12 +882,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>
>> rb_link_node(&cfqe->rb_node, parent, node);
>> rb_insert_color(&cfqe->rb_node, &st->rb);
>> +
>> + update_min_vdisktime(st);
>> }
>>
>> static void
>> cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> {
>> __cfq_entity_service_tree_add(st, cfqe);
>> + cfqe->reposition_time = jiffies;
>> st->count++;
>> st->total_weight += cfqe->weight;
>> }
>> @@ -901,34 +898,57 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> static void
>> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> - struct cfq_entity *__cfqe;
>> struct rb_node *n;
>> + struct cfq_entity *entity;
>> + struct cfq_rb_root *st;
>> + struct cfq_group *__cfqg;
>>
>> cfqg->nr_cfqq++;
>> +
>> + /*
>> + * Root group doesn't belong to any service tree
>> + */
>> + if (cfqg == &cfqd->root_group)
>> + return;
>
> Can we keep the root group on cfqd->grp_service_tree? In hierarchical mode
> there will be only one group on the grp service tree, and in flat mode there
> can be many.
Keeping the top service tree different for hierarchical mode and flat mode is
just fine to me. If you don't strongly object, I'd like to keep the current way. :)
>
>> +
>> if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> return;
>>
>> /*
>> - * Currently put the group at the end. Later implement something
>> - * so that groups get lesser vtime based on their weights, so that
>> - * if group does not loose all if it was not continously backlogged.
>> + * Enqueue this group and its ancestors onto their service tree.
>> */
>> - n = rb_last(&st->rb);
>> - if (n) {
>> - __cfqe = rb_entry_entity(n);
>> - cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> - } else
>> - cfqe->vdisktime = st->min_vdisktime;
>> + while (cfqe && cfqe->parent) {
>> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> + return;
>> +
>> + /*
>> + * Currently put the group at the end. Later implement
>> + * something so that groups get lesser vtime based on their
>> + * weights, so that a group does not lose everything if it
>> + * was not continuously backlogged.
>> + */
>> + st = cfqe->service_tree;
>> + n = rb_last(&st->rb);
>> + if (n) {
>> + entity = rb_entry_entity(n);
>> + cfqe->vdisktime = entity->vdisktime +
>> + CFQ_IDLE_DELAY;
>> + } else
>> + cfqe->vdisktime = st->min_vdisktime;
>>
>> - cfq_entity_service_tree_add(st, cfqe);
>> + cfq_entity_service_tree_add(st, cfqe);
>> + cfqe = cfqe->parent;
>> + __cfqg = cfqg_of_entity(cfqe);
>> + __cfqg->nr_subgp++;
>> + }
>> }
>>
>> static void
>> __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> {
>> cfq_rb_erase(&cfqe->rb_node, st);
>> + update_min_vdisktime(st);
>> }
>>
>> static void
>> @@ -937,27 +957,47 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>> __cfq_entity_service_tree_del(st, cfqe);
>> st->total_weight -= cfqe->weight;
>> - cfqe->service_tree = NULL;
>> }
>> }
>>
>> static void
>> cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> + struct cfq_group *__cfqg, *p_cfqg;
>>
>> BUG_ON(cfqg->nr_cfqq < 1);
>> cfqg->nr_cfqq--;
>>
>> + /*
>> + * Root group doesn't belong to any service tree
>> + */
>> + if (cfqg == &cfqd->root_group)
>> + return;
>> +
>> /* If there are other cfq queues under this group, don't delete it */
>> if (cfqg->nr_cfqq)
>> return;
>>
>> - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
>> - cfq_entity_service_tree_del(st, cfqe);
>> - cfqg->saved_workload_slice = 0;
>> - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
>> + /* If child group exists, don't dequeue it */
>> + if (cfqg->nr_subgp)
>> + return;
>> +
>> + /*
>> + * Dequeue this group and its ancestors from their service tree.
>> + */
>> + while (cfqe && cfqe->parent) {
>> + __cfqg = cfqg_of_entity(cfqe);
>> + p_cfqg = cfqg_of_entity(cfqe->parent);
>> + cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
>> + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
>> + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
>> + __cfqg->saved_workload_slice = 0;
>> + cfqe = cfqe->parent;
>> + p_cfqg->nr_subgp--;
>> + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
>> + return;
>> + }
>> }
>
> I think once you merge the queue/group algorithms, you can use the same
> functions for adding/deleting queue/group entities and don't have to use
> separate functions for groups?
The CFQ entity add/delete paths for queues and groups are almost the same, and
I've already extracted common functions to handle them:
cfq_entity_service_tree_add() and cfq_entity_service_tree_del().
>
> [..]
>> - cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
>> +{
>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + struct blkio_cgroup *p_blkcg;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> + unsigned int major, minor;
>> + struct cfq_group *cfqg, *p_cfqg;
>> + void *key = cfqd;
>> + int ret;
>>
>> - /* Add group on cfqd list */
>> - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + if (cfqg) {
>> + if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> + cfqg->blkg.dev = MKDEV(major, minor);
>> + }
>> + /* chain has already been built */
>> + return 0;
>> + }
>> +
>> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> + if (!cfqg)
>> + return -1;
>> +
>> + init_cfqg(cfqd, blkcg, cfqg);
>> +
>> + /* Already at the top group */
>> + if (!cgroup->parent)
>> + return 0;
>> +
>> + /* Allocate CFQ groups on the chain */
>> + ret = cfqg_chain_alloc(cfqd, cgroup->parent);
>
> Can you avoid recursion and use for/while loops to initialize the
> chain? We don't want to push multiple stack frames in case of a deep hierarchy.
>
OK, will change.
>> + if (ret == -1) {
>> + uninit_cfqg(cfqd, cfqg);
>> + return -1;
>> + }
>> +
>> + p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
>> + p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
>> + BUG_ON(p_cfqg == NULL);
>> +
>> + cfqg_set_parent(cfqd, cfqg, p_cfqg);
>> + return 0;
>> +}
>> +
>> +static struct cfq_group *
>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +{
>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + struct cfq_group *cfqg = NULL;
>> + void *key = cfqd;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> + unsigned int major, minor;
>> + int ret;
>> +
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> + cfqg->blkg.dev = MKDEV(major, minor);
>> + goto done;
>> + }
>> + if (cfqg || !create)
>> + goto done;
>> +
>> + /*
>> + * For hierarchical cfq group scheduling, we need to allocate
>> + * the whole cfq group chain.
>> + */
>> + ret = cfqg_chain_alloc(cfqd, cgroup);
>> + if (!ret) {
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + BUG_ON(cfqg == NULL);
>> + goto done;
>> + }
>>
>> done:
>> return cfqg;
>> @@ -1140,12 +1278,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>> {
>> struct cfq_rb_root *st;
>> int i, j;
>> + struct cfq_entity *cfqe;
>> + struct cfq_group *p_cfqg;
>>
>> BUG_ON(atomic_read(&cfqg->ref) <= 0);
>> if (!atomic_dec_and_test(&cfqg->ref))
>> return;
>> for_each_cfqg_st(cfqg, i, j, st)
>> BUG_ON(!RB_EMPTY_ROOT(&st->rb));
>> +
>> + cfqe = &cfqg->cfqe;
>> + if (cfqe->parent) {
>> + p_cfqg = cfqg_of_entity(cfqe->parent);
>> + /* Drop the reference taken by children */
>> + atomic_dec(&p_cfqg->ref);
>> + }
>> +
>
> Is this the right way to free up the whole parent chain? Just think of
> a hierarchy test1->test2->test3 where somebody drops the reference to test3
> and test1 and test2 don't have any other children. In that case, after
> freeing up test3, we should be freeing up test2 and test1 also.
>
> I was thinking that we can achieve this by freeing up the groups in a
> loop:
>
> do {
>         cfqe = cfqg->cfqe.parent;
>         if (!atomic_dec_and_test(&cfqg->ref))
>                 return;
>         kfree(cfqg);
>         cfqg = cfqg_of_entity(cfqe);
> } while (cfqg);
>
OK
>> kfree(cfqg);
>> }
>>
>> @@ -1358,8 +1506,6 @@ insert:
>> /* Add cfqq onto service tree. */
>> cfq_entity_service_tree_add(service_tree, cfqe);
>>
>> - update_min_vdisktime(service_tree);
>> - cfqq->reposition_time = jiffies;
>> if ((add_front || !new_cfqq) && !group_changed)
>> return;
>> cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>> @@ -1802,28 +1948,30 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
>> return cfqq_of_entity(cfq_rb_first(service_tree));
>> }
>>
>> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
>> +static struct cfq_entity *
>> +cfq_get_next_entity_forced(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_group *cfqg;
>> - struct cfq_entity *cfqe;
>> + struct cfq_entity *entity;
>> int i, j;
>> struct cfq_rb_root *st;
>>
>> if (!cfqd->rq_queued)
>> return NULL;
>>
>> - cfqg = cfq_get_next_cfqg(cfqd);
>> - if (!cfqg)
>> - return NULL;
>> -
>> for_each_cfqg_st(cfqg, i, j, st) {
>> - cfqe = cfq_rb_first(st);
>> - if (cfqe != NULL)
>> - return cfqq_of_entity(cfqe);
>> + entity = cfq_rb_first(st);
>> +
>> + if (entity && !entity->is_group_entity)
>> + return entity;
>> + else if (entity && entity->is_group_entity) {
>> + cfqg = cfqg_of_entity(entity);
>> + return cfq_get_next_entity_forced(cfqd, cfqg);
>> + }
>> }
>> return NULL;
>> }
>
> Can the above be simplified by just taking cfqd as a parameter? It will work
> both for hierarchical and flat modes. I wanted to avoid recursion, as somebody
> can create a deep cgroup hierarchy and push lots of frames on the stack.
Ok, will change.
>
> struct cfq_entity *cfq_get_next_entity_forced(struct cfq_data *cfqd)
> {
> struct service_tree *st = cfqd->grp_service_tree;
>
> do {
> cfqe = cfq_rb_first(st);
> if (is_cfqe_cfqq(cfqe))
> return cfqe;
> st = choose_service_tree_forced(cfqg);
> } while (st);
> }
>
> And choose_service_tree_forced() can be something like.
>
> choose_service_tree_forced(cfqg) {
> for_each_cfqg_st() {
> if (st->count !=0)
> return st;
> }
> }
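Filled in, the two helpers could look roughly like this (untested sketch; it assumes cfqd->grp_service_tree keeps holding the top-level entities as you suggest, and uses the is_group_entity flag from this series):

	static struct cfq_rb_root *
	choose_service_tree_forced(struct cfq_group *cfqg)
	{
		struct cfq_rb_root *st;
		int i, j;

		/* pick the first service tree with queued entities */
		for_each_cfqg_st(cfqg, i, j, st)
			if (st->count != 0)
				return st;
		return NULL;
	}

	static struct cfq_entity *
	cfq_get_next_entity_forced(struct cfq_data *cfqd)
	{
		struct cfq_rb_root *st = &cfqd->grp_service_tree;
		struct cfq_entity *cfqe;

		if (!cfqd->rq_queued)
			return NULL;

		do {
			cfqe = cfq_rb_first(st);
			if (!cfqe || !cfqe->is_group_entity)
				return cfqe;	/* a queue entity, or nothing */
			/* a group entity: descend into it and keep looking */
			st = choose_service_tree_forced(cfqg_of_entity(cfqe));
		} while (st);

		return NULL;
	}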
>
>>
>> +
>> /*
>> * Get and set a new active queue for service.
>> */
>> @@ -2179,7 +2327,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> struct cfq_group *cfqg, enum wl_prio_t prio)
>> {
>> struct cfq_entity *cfqe;
>> - struct cfq_queue *cfqq;
>> unsigned long lowest_start_time;
>> int i;
>> bool time_valid = false;
>> @@ -2191,10 +2338,9 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> */
>> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>> cfqe = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> - cfqq = cfqq_of_entity(cfqe);
>> if (cfqe && (!time_valid ||
>> - cfqq->reposition_time < lowest_start_time)) {
>> - lowest_start_time = cfqq->reposition_time;
>> + cfqe->reposition_time < lowest_start_time)) {
>> + lowest_start_time = cfqe->reposition_time;
>> cur_best = i;
>> time_valid = true;
>> }
>> @@ -2203,47 +2349,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> return cur_best;
>> }
>>
>> -static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +static void set_workload_expire(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> unsigned slice;
>> unsigned count;
>> struct cfq_rb_root *st;
>> unsigned group_slice;
>>
>> - if (!cfqg) {
>> - cfqd->serving_prio = IDLE_WORKLOAD;
>> - cfqd->workload_expires = jiffies + 1;
>> - return;
>> - }
>> -
>> - /* Choose next priority. RT > BE > IDLE */
>> - if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
>> - cfqd->serving_prio = RT_WORKLOAD;
>> - else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
>> - cfqd->serving_prio = BE_WORKLOAD;
>> - else {
>> - cfqd->serving_prio = IDLE_WORKLOAD;
>> - cfqd->workload_expires = jiffies + 1;
>> - return;
>> - }
>> -
>> - /*
>> - * For RT and BE, we have to choose also the type
>> - * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
>> - * expiration time
>> - */
>> - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
>> - count = st->count;
>> -
>> - /*
>> - * check workload expiration, and that we still have other queues ready
>> - */
>> - if (count && !time_after(jiffies, cfqd->workload_expires))
>> - return;
>> -
>> - /* otherwise select new workload type */
>> - cfqd->serving_type =
>> - cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
>> st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
>> count = st->count;
>>
>> @@ -2284,26 +2396,51 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> cfqd->workload_expires = jiffies + slice;
>> }
>>
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
>> +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> - struct cfq_group *cfqg;
>> - struct cfq_entity *cfqe;
>> + struct cfq_rb_root *st;
>> + unsigned count;
>>
>> - if (RB_EMPTY_ROOT(&st->rb))
>> - return NULL;
>> - cfqe = cfq_rb_first_entity(st);
>> - cfqg = cfqg_of_entity(cfqe);
>> - BUG_ON(!cfqg);
>> - update_min_vdisktime(st);
>> - return cfqg;
>> + if (!cfqg) {
>> + cfqd->serving_prio = IDLE_WORKLOAD;
>> + cfqd->workload_expires = jiffies + 1;
>> + return;
>> + }
>
> I am wondering where we use the above code. Do we ever call
> choose_service_tree() with cfqg == NULL? I can't seem to figure it out by
> looking at the code.
Already fixed by a separate patch. ;)
>
>> +
>> + /* Choose next priority. RT > BE > IDLE */
>> + if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
>> + cfqd->serving_prio = RT_WORKLOAD;
>> + else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
>> + cfqd->serving_prio = BE_WORKLOAD;
>> + else {
>> + cfqd->serving_prio = IDLE_WORKLOAD;
>> + cfqd->workload_expires = jiffies + 1;
>> + return;
>> + }
>> +
>> + /*
>> + * For RT and BE, we have to choose also the type
>> + * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
>> + * expiration time
>> + */
>> + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
>> + count = st->count;
>> +
>> + /*
>> + * check workload expiration, and that we still have other queues ready
>> + */
>> + if (count && !time_after(jiffies, cfqd->workload_expires))
>> + return;
>> +
>> + /* otherwise select new workload type */
>> + cfqd->serving_type =
>> + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
>> }
>>
>> -static void cfq_choose_cfqg(struct cfq_data *cfqd)
>> +struct cfq_entity *choose_serving_entity(struct cfq_data *cfqd,
>> + struct cfq_group *cfqg)
>
> I think for this function you don't have to pass cfqg as parameter. You
> can just use cfqd as parameter and then take all decisions based on
> service tree.
>
> So at top we can continue to have grp_service_tree in cfqd. In
> hierarchical mode it will only have root group queued there and in
> flat mode it can have multiple groups queued.
>
> Also I am looking forward to simplifying and organizing the CFQ code a little
> better so that it is easy to read. Can the choose_serving_entity() function
> be organized something like the following? This should work both for flat and
> hierarchical modes. The following is only pseudo code.
>
> struct cfq_entity *select_entity(struct cfq_data *cfqd)
> {
> struct cfq_rb_root *st = cfqd->grp_service_tree;
> struct cfq_entity *cfqe;
>
> do {
> cfqe = cfq_rb_first(st);
> if (is_cfqe_cfqq(cfqe))
> /* We found the next queue to dispatch from */
> break;
> else
> st = choose_service_tree();
> } while (st)
>
> return cfqe;
> }
>
Ok, I'll refine the code to make it easier to read.
>> {
>> - struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
>> -
>> - cfqd->serving_group = cfqg;
>> + struct cfq_rb_root *service_tree;
>>
>> /* Restore the workload type data */
>> if (cfqg->saved_workload_slice) {
>> @@ -2314,8 +2451,21 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
>> cfqd->workload_expires = jiffies - 1;
>>
>> choose_service_tree(cfqd, cfqg);
>> -}
>>
>> + service_tree = service_tree_for(cfqg, cfqd->serving_prio,
>> + cfqd->serving_type);
>> +
>> + if (!cfqd->rq_queued)
>> + return NULL;
>> +
>> + /* There is nothing to dispatch */
>> + if (!service_tree)
>> + return NULL;
>> + if (RB_EMPTY_ROOT(&service_tree->rb))
>> + return NULL;
>> +
>> + return cfq_rb_first(service_tree);
>> +}
>> /*
>> * Select a queue for service. If we have a current active queue,
>> * check whether to continue servicing it, or retrieve and set a new one.
>> @@ -2323,6 +2473,8 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
>> static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>> {
>> struct cfq_queue *cfqq, *new_cfqq = NULL;
>> + struct cfq_group *cfqg;
>> + struct cfq_entity *entity;
>>
>> cfqq = cfqd->active_queue;
>> if (!cfqq)
>> @@ -2422,8 +2574,23 @@ new_queue:
>> * Current queue expired. Check if we have to switch to a new
>> * service tree
>> */
>> - if (!new_cfqq)
>> - cfq_choose_cfqg(cfqd);
>> + cfqg = &cfqd->root_group;
>> +
>> + if (!new_cfqq) {
>> + do {
>> + entity = choose_serving_entity(cfqd, cfqg);
>> + if (entity && !entity->is_group_entity) {
>> + /* This is the CFQ queue that should run */
>> + new_cfqq = cfqq_of_entity(entity);
>> + cfqd->serving_group = cfqg;
>> + set_workload_expire(cfqd, cfqg);
>> + break;
>> + } else if (entity && entity->is_group_entity) {
>> + /* Continue to lookup in this CFQ group */
>> + cfqg = cfqg_of_entity(entity);
>> + }
>> + } while (entity && entity->is_group_entity);
>
> I think you should move the above logic into a separate function, otherwise
> select_queue() becomes too complicated.
>
> Secondly for traversing the hierarchy you can introduce macros like
> for_each_entity() or for_each_cfqe() etc.
>
> Thirdly, I would again say that flat mode is already supported. Build on
> top of it instead of first getting rid of it and then adding it back
> with the help of a separate patch. If that is too complicated, then let
> it be a single patch instead of separating it out into two patches.
Ok
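For the separate function plus the hierarchy-walk macro, something like this, perhaps (sketch only; the helper name is tentative):

	/* walk an entity and all its ancestors towards the root */
	#define for_each_cfqe(cfqe) \
		for (; (cfqe) != NULL; (cfqe) = (cfqe)->parent)

	/* pulled out of cfq_select_queue(): walk down to the queue to serve */
	static struct cfq_queue *cfq_choose_cfqq(struct cfq_data *cfqd)
	{
		struct cfq_group *cfqg = &cfqd->root_group;
		struct cfq_entity *entity;

		for (;;) {
			entity = choose_serving_entity(cfqd, cfqg);
			if (!entity)
				return NULL;
			if (!entity->is_group_entity) {
				/* found the CFQ queue that should run */
				cfqd->serving_group = cfqg;
				set_workload_expire(cfqd, cfqg);
				return cfqq_of_entity(entity);
			}
			/* a group entity: continue the lookup inside it */
			cfqg = cfqg_of_entity(entity);
		}
	}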
Thanks for reviewing, Vivek, will post an updated patch.
Gui
>
>> + }
>
>>
>> cfqq = cfq_set_active_queue(cfqd, new_cfqq);
>> keep_queue:
>> @@ -2454,10 +2621,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
>> {
>> struct cfq_queue *cfqq;
>> int dispatched = 0;
>> + struct cfq_entity *entity;
>> + struct cfq_group *root = &cfqd->root_group;
>>
>> /* Expire the timeslice of the current active queue first */
>> cfq_slice_expired(cfqd, 0);
>> - while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
>> + while ((entity = cfq_get_next_entity_forced(cfqd, root)) != NULL) {
>> + BUG_ON(entity->is_group_entity);
>> + cfqq = cfqq_of_entity(entity);
>> __cfq_set_active_queue(cfqd, cfqq);
>> dispatched += __cfq_forced_dispatch_cfqq(cfqq);
>> }
>> @@ -3991,9 +4162,6 @@ static void *cfq_init_queue(struct request_queue *q)
>>
>> cfqd->cic_index = i;
>>
>> - /* Init root service tree */
>> - cfqd->grp_service_tree = CFQ_RB_ROOT;
>> -
>> /* Init root group */
>> cfqg = &cfqd->root_group;
>> for_each_cfqg_st(cfqg, i, j, st)
>> @@ -4003,6 +4171,7 @@ static void *cfq_init_queue(struct request_queue *q)
>> /* Give preference to root group over other groups */
>> cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
>> cfqg->cfqe.is_group_entity = true;
>> + cfqg->cfqe.parent = NULL;
>>
>> #ifdef CONFIG_CFQ_GROUP_IOSCHED
>> /*
>> --
>> 1.6.5.2
>>
>>
>
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 5/8 v2] cfq-iosched: Introduce hierarchical scheduling with CFQ queue and group at the same level
2010-12-15 7:02 ` Gui Jianfeng
@ 2010-12-15 22:04 ` Vivek Goyal
0 siblings, 0 replies; 41+ messages in thread
From: Vivek Goyal @ 2010-12-15 22:04 UTC (permalink / raw)
To: Gui Jianfeng
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
On Wed, Dec 15, 2010 at 03:02:36PM +0800, Gui Jianfeng wrote:
[..]
> >> static inline unsigned
> >> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >> {
> >> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >> struct cfq_entity *cfqe = &cfqg->cfqe;
> >> + struct cfq_rb_root *st = cfqe->service_tree;
> >>
> >> - return cfq_target_latency * cfqe->weight / st->total_weight;
> >> + if (st)
> >> + return cfq_target_latency * cfqe->weight
> >> + / st->total_weight;
> >
> > Is this still true in hierarchical mode? Previously, groups used to be
> > at the top and there was only one service tree for groups, so
> > st->total_weight represented the total weight in the system.
> >
> > Now with hierarchy this will not/should not be true. So the group slice
> > calculation should be different?
>
> I just kept the original group slice calculation here. I was thinking that
> calculating the group slice in a hierarchical way might yield a really small
> group slice, and I'm not sure how that would work. So I just kept the original
> calculation. Any thoughts?
Corrado already had minimum per-queue limits (16ms or something), so don't
worry about it getting too small. But we have to do a hierarchical group
share calculation, otherwise what's the point of writing this code and
all the logic of trying to meet the soft latency of 300ms?
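Something like the following is what I have in mind (untested sketch; it assumes the root entity has parent == NULL and that a queued entity always has a valid service_tree pointer):

	static inline unsigned
	cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
	{
		struct cfq_entity *cfqe = &cfqg->cfqe;
		unsigned slice = cfq_target_latency;

		/*
		 * A group's share is the product of its weight fraction
		 * at each level of the hierarchy, up to the root.
		 */
		while (cfqe && cfqe->service_tree) {
			slice = slice * cfqe->weight /
				cfqe->service_tree->total_weight;
			cfqe = cfqe->parent;
		}
		return slice;
	}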
>
> >
> >> + else
> >> + /* If this is the root group, give it a full slice. */
> >> + return cfq_target_latency;
> >> }
> >>
> >> static inline void
> >> @@ -804,17 +809,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
> >> return NULL;
> >> }
> >>
> >> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
> >> -{
> >> - if (!root->left)
> >> - root->left = rb_first(&root->rb);
> >> -
> >> - if (root->left)
> >> - return rb_entry_entity(root->left);
> >> -
> >> - return NULL;
> >> -}
> >> -
> >> static void rb_erase_init(struct rb_node *n, struct rb_root *root)
> >> {
> >> rb_erase(n, root);
> >> @@ -888,12 +882,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> >>
> >> rb_link_node(&cfqe->rb_node, parent, node);
> >> rb_insert_color(&cfqe->rb_node, &st->rb);
> >> +
> >> + update_min_vdisktime(st);
> >> }
> >>
> >> static void
> >> cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> >> {
> >> __cfq_entity_service_tree_add(st, cfqe);
> >> + cfqe->reposition_time = jiffies;
> >> st->count++;
> >> st->total_weight += cfqe->weight;
> >> }
> >> @@ -901,34 +898,57 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> >> static void
> >> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >> {
> >> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >> struct cfq_entity *cfqe = &cfqg->cfqe;
> >> - struct cfq_entity *__cfqe;
> >> struct rb_node *n;
> >> + struct cfq_entity *entity;
> >> + struct cfq_rb_root *st;
> >> + struct cfq_group *__cfqg;
> >>
> >> cfqg->nr_cfqq++;
> >> +
> >> + /*
> >> + * Root group doesn't belong to any service tree
> >> + */
> >> + if (cfqg == &cfqd->root_group)
> >> + return;
> >
> > Can we keep root group on cfqd->grp_service_tree? In hierarchical mode
> > there will be only 1 group on grp service tree and in flat mode there
> > can be many.
>
> Keeping the top service tree different for hierarchical mode and flat mode is just
> fine to me. If you don't strongly object, I'd like to keep the current way. :)
I am saying keep one top tree for both hierarchical and flat modes, and
not separate trees.
For flat mode everything goes on cfqd->grp_service_tree.
grp_service_tree
/ | \
root test1 test2
For hierarchical mode it will look as follows.
grp_service_tree
|
root
/ \
test1 test2
Or it could look as follows if the user has set use_hierarchy=1 in test2 only.
grp_service_tree
| | |
root test1 test2
|
test3
Thanks
Vivek
^ permalink raw reply [flat|nested] 41+ messages in thread
* [PATCH 6/8] blkio-cgroup: "use_hierarchy" interface without any functionality.
[not found] ` <4D01C6AB.9040807@cn.fujitsu.com>
` (4 preceding siblings ...)
2010-12-13 1:45 ` [PATCH 5/8 v2] cfq-iosched: Introduce hierarchical scheduling with CFQ queue and group at the same level Gui Jianfeng
@ 2010-12-13 1:45 ` Gui Jianfeng
2010-12-15 21:26 ` Vivek Goyal
2010-12-13 1:45 ` [PATCH 7/8] cfq-iosched: Add flat mode and switch between two modes by "use_hierarchy" Gui Jianfeng
2010-12-13 1:45 ` [PATCH 8/8] blkio-cgroup: Document for blkio.use_hierarchy Gui Jianfeng
7 siblings, 1 reply; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-13 1:45 UTC (permalink / raw)
To: Jens Axboe, Vivek Goyal
Cc: Gui Jianfeng, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
This patch adds "use_hierarchy" in the root cgroup without any functionality.
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
block/blk-cgroup.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
block/blk-cgroup.h | 5 +++-
block/cfq-iosched.c | 24 +++++++++++++++++
3 files changed, 97 insertions(+), 4 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a..9747ebb 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -25,7 +25,10 @@
static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);
-struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
+struct blkio_cgroup blkio_root_cgroup = {
+ .weight = 2*BLKIO_WEIGHT_DEFAULT,
+ .use_hierarchy = 1,
+ };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);
static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
@@ -1385,10 +1388,73 @@ struct cftype blkio_files[] = {
#endif
};
+static u64 blkiocg_use_hierarchy_read(struct cgroup *cgroup,
+ struct cftype *cftype)
+{
+ struct blkio_cgroup *blkcg;
+
+ blkcg = cgroup_to_blkio_cgroup(cgroup);
+ return (u64)blkcg->use_hierarchy;
+}
+
+static int
+blkiocg_use_hierarchy_write(struct cgroup *cgroup,
+ struct cftype *cftype, u64 val)
+{
+ struct blkio_cgroup *blkcg;
+ struct blkio_group *blkg;
+ struct hlist_node *n;
+ struct blkio_policy_type *blkiop;
+
+ blkcg = cgroup_to_blkio_cgroup(cgroup);
+
+ if (val > 1 || !list_empty(&cgroup->children))
+ return -EINVAL;
+
+ if (blkcg->use_hierarchy == val)
+ return 0;
+
+ spin_lock(&blkio_list_lock);
+ blkcg->use_hierarchy = val;
+
+ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ list_for_each_entry(blkiop, &blkio_list, list) {
+ /*
+ * If this policy does not own the blkg, do not change
+ * cfq group scheduling mode.
+ */
+ if (blkiop->plid != blkg->plid)
+ continue;
+
+ if (blkiop->ops.blkio_update_use_hierarchy_fn)
+ blkiop->ops.blkio_update_use_hierarchy_fn(blkg,
+ val);
+ }
+ }
+ spin_unlock(&blkio_list_lock);
+ return 0;
+}
+
+static struct cftype blkio_use_hierarchy = {
+ .name = "use_hierarchy",
+ .read_u64 = blkiocg_use_hierarchy_read,
+ .write_u64 = blkiocg_use_hierarchy_write,
+};
+
static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
- return cgroup_add_files(cgroup, subsys, blkio_files,
- ARRAY_SIZE(blkio_files));
+ int ret;
+
+ ret = cgroup_add_files(cgroup, subsys, blkio_files,
+ ARRAY_SIZE(blkio_files));
+ if (ret)
+ return ret;
+
+ /* use_hierarchy is in root cgroup only. */
+ if (!cgroup->parent)
+ ret = cgroup_add_file(cgroup, subsys, &blkio_use_hierarchy);
+
+ return ret;
}
static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861b..c8caf4e 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -105,6 +105,7 @@ enum blkcg_file_name_throtl {
struct blkio_cgroup {
struct cgroup_subsys_state css;
unsigned int weight;
+ bool use_hierarchy;
spinlock_t lock;
struct hlist_head blkg_list;
struct list_head policy_list; /* list of blkio_policy_node */
@@ -200,7 +201,8 @@ typedef void (blkio_update_group_read_iops_fn) (void *key,
struct blkio_group *blkg, unsigned int read_iops);
typedef void (blkio_update_group_write_iops_fn) (void *key,
struct blkio_group *blkg, unsigned int write_iops);
-
+typedef void (blkio_update_use_hierarchy_fn) (struct blkio_group *blkg,
+ bool val);
struct blkio_policy_ops {
blkio_unlink_group_fn *blkio_unlink_group_fn;
blkio_update_group_weight_fn *blkio_update_group_weight_fn;
@@ -208,6 +210,7 @@ struct blkio_policy_ops {
blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
+ blkio_update_use_hierarchy_fn *blkio_update_use_hierarchy_fn;
};
struct blkio_policy_type {
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d90627e..08323f5 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -192,6 +192,9 @@ struct cfq_group {
/* cfq group sched entity */
struct cfq_entity cfqe;
+ /* parent cfq_data */
+ struct cfq_data *cfqd;
+
/* number of cfqq currently on this group */
int nr_cfqq;
@@ -235,6 +238,9 @@ struct cfq_data {
struct request_queue *queue;
struct cfq_group root_group;
+ /* cfq group schedule in flat or hierarchy manner. */
+ bool use_hierarchy;
+
/*
* The priority currently being served
*/
@@ -1091,6 +1097,15 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
cfqg_of_blkg(blkg)->cfqe.weight = weight;
}
+void
+cfq_update_blkio_use_hierarchy(struct blkio_group *blkg, bool val)
+{
+ struct cfq_group *cfqg;
+
+ cfqg = cfqg_of_blkg(blkg);
+ cfqg->cfqd->use_hierarchy = val;
+}
+
static void init_cfqe(struct blkio_cgroup *blkcg,
struct cfq_group *cfqg)
{
@@ -1121,6 +1136,9 @@ static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
*/
atomic_set(&cfqg->ref, 1);
+ /* Setup cfq data for cfq group */
+ cfqg->cfqd = cfqd;
+
/* Add group onto cgroup list */
sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
@@ -4164,6 +4182,7 @@ static void *cfq_init_queue(struct request_queue *q)
/* Init root group */
cfqg = &cfqd->root_group;
+ cfqg->cfqd = cfqd;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
@@ -4224,6 +4243,10 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->cfq_latency = 1;
cfqd->cfq_group_isolation = 0;
cfqd->hw_tag = -1;
+
+ /* hierarchical scheduling for cfq group by default */
+ cfqd->use_hierarchy = 1;
+
/*
* we optimistically start assuming sync ops weren't delayed in last
* second, in order to have larger depth for async operations.
@@ -4386,6 +4409,7 @@ static struct blkio_policy_type blkio_policy_cfq = {
.ops = {
.blkio_unlink_group_fn = cfq_unlink_blkio_group,
.blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
+ .blkio_update_use_hierarchy_fn = cfq_update_blkio_use_hierarchy,
},
.plid = BLKIO_POLICY_PROP,
};
--
1.6.5.2
^ permalink raw reply related [flat|nested] 41+ messages in thread
* Re: [PATCH 6/8] blkio-cgroup: "use_hierarchy" interface without any functionality.
2010-12-13 1:45 ` [PATCH 6/8] blkio-cgroup: "use_hierarchy" interface without any functionality Gui Jianfeng
@ 2010-12-15 21:26 ` Vivek Goyal
2010-12-16 2:42 ` Gui Jianfeng
0 siblings, 1 reply; 41+ messages in thread
From: Vivek Goyal @ 2010-12-15 21:26 UTC (permalink / raw)
To: Gui Jianfeng
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
On Mon, Dec 13, 2010 at 09:45:07AM +0800, Gui Jianfeng wrote:
> This patch adds "use_hierarchy" in the root cgroup without any functionality.
>
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> ---
> block/blk-cgroup.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
> block/blk-cgroup.h | 5 +++-
> block/cfq-iosched.c | 24 +++++++++++++++++
> 3 files changed, 97 insertions(+), 4 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 455768a..9747ebb 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -25,7 +25,10 @@
> static DEFINE_SPINLOCK(blkio_list_lock);
> static LIST_HEAD(blkio_list);
>
> -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
> +struct blkio_cgroup blkio_root_cgroup = {
> + .weight = 2*BLKIO_WEIGHT_DEFAULT,
> + .use_hierarchy = 1,
Currently flat mode is the default. Let's not change the default. So let's
start with use_hierarchy = 0.
Secondly, why don't you make it per cgroup something along the lines of
memory controller where one can start the hierarchy lower in the cgroup
chain and not necessarily at the root. This way we can avoid some
accounting overhead for all the groups which are non-hierarchical.
> + };
> EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>
> static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
> @@ -1385,10 +1388,73 @@ struct cftype blkio_files[] = {
> #endif
> };
>
> +static u64 blkiocg_use_hierarchy_read(struct cgroup *cgroup,
> + struct cftype *cftype)
> +{
> + struct blkio_cgroup *blkcg;
> +
> + blkcg = cgroup_to_blkio_cgroup(cgroup);
> + return (u64)blkcg->use_hierarchy;
> +}
> +
> +static int
> +blkiocg_use_hierarchy_write(struct cgroup *cgroup,
> + struct cftype *cftype, u64 val)
> +{
> + struct blkio_cgroup *blkcg;
> + struct blkio_group *blkg;
> + struct hlist_node *n;
> + struct blkio_policy_type *blkiop;
> +
> + blkcg = cgroup_to_blkio_cgroup(cgroup);
> +
> + if (val > 1 || !list_empty(&cgroup->children))
> + return -EINVAL;
> +
> + if (blkcg->use_hierarchy == val)
> + return 0;
> +
> + spin_lock(&blkio_list_lock);
> + blkcg->use_hierarchy = val;
> +
> + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
> + list_for_each_entry(blkiop, &blkio_list, list) {
> + /*
> + * If this policy does not own the blkg, do not change
> + * cfq group scheduling mode.
> + */
> + if (blkiop->plid != blkg->plid)
> + continue;
> +
> + if (blkiop->ops.blkio_update_use_hierarchy_fn)
> + blkiop->ops.blkio_update_use_hierarchy_fn(blkg,
> + val);
Should we really allow this? I mean allow changing hierarchy of a group
when there are already children groups. I think memory controller does
not allow this. We can design along the same lines. Keep use_hierarchy
as 0 by default. Allow changing it only if there are no children cgroups.
Otherwise we shall have to send notifications to subscribing policies
and then change their structure etc. Let's keep it simple.
I was playing with a use_hierarchy patch for throttling, and parts of it have been
copied from the memory controller. I am attaching it to this mail; see if
you can make it work.
---
block/blk-cgroup.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
block/blk-cgroup.h | 2 +
2 files changed, 60 insertions(+), 1 deletion(-)
Index: linux-2.6/block/blk-cgroup.c
===================================================================
--- linux-2.6.orig/block/blk-cgroup.c 2010-11-19 10:30:27.129704770 -0500
+++ linux-2.6/block/blk-cgroup.c 2010-11-19 10:30:29.885671705 -0500
@@ -1214,6 +1214,39 @@ static int blkio_weight_write(struct blk
return 0;
}
+static int blkio_throtl_use_hierarchy_write(struct cgroup *cgrp, u64 val)
+{
+ struct cgroup *parent = cgrp->parent;
+ struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
+ int ret = 0;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ blkcg = cgroup_to_blkio_cgroup(cgrp);
+ if (parent)
+ parent_blkcg = cgroup_to_blkio_cgroup(parent);
+
+ cgroup_lock();
+ /*
+ * If parent's use_hierarchy is set, we can't make any modifications
+ * in the child subtrees. If it is unset, then the change can
+ * occur, provided the current cgroup has no children.
+ *
+ * For the root cgroup, parent_blkcg is NULL, we allow the value to be
+ * set if there are no children.
+ */
+ if (!parent_blkcg || !parent_blkcg->throtl_use_hier) {
+ if (list_empty(&cgrp->children))
+ blkcg->throtl_use_hier = val;
+ else
+ ret = -EBUSY;
+ } else
+ ret = -EINVAL;
+ cgroup_unlock();
+ return ret;
+}
+
static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
struct blkio_cgroup *blkcg;
enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
@@ -1228,6 +1261,12 @@ static u64 blkiocg_file_read_u64 (struct
return (u64)blkcg->weight;
}
break;
+ case BLKIO_POLICY_THROTL:
+ switch(name) {
+ case BLKIO_THROTL_use_hierarchy:
+ return (u64)blkcg->throtl_use_hier;
+ }
+ break;
default:
BUG();
}
@@ -1250,6 +1289,12 @@ blkiocg_file_write_u64(struct cgroup *cg
return blkio_weight_write(blkcg, val);
}
break;
+ case BLKIO_POLICY_THROTL:
+ switch(name) {
+ case BLKIO_THROTL_use_hierarchy:
+ return blkio_throtl_use_hierarchy_write(cgrp, val);
+ }
+ break;
default:
BUG();
}
@@ -1373,6 +1418,13 @@ struct cftype blkio_files[] = {
BLKIO_THROTL_io_serviced),
.read_map = blkiocg_file_read_map,
},
+ {
+ .name = "throttle.use_hierarchy",
+ .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+ BLKIO_THROTL_use_hierarchy),
+ .read_u64 = blkiocg_file_read_u64,
+ .write_u64 = blkiocg_file_write_u64,
+ },
#endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_DEBUG_BLK_CGROUP
@@ -1470,7 +1522,7 @@ static void blkiocg_destroy(struct cgrou
static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
- struct blkio_cgroup *blkcg;
+ struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
struct cgroup *parent = cgroup->parent;
if (!parent) {
@@ -1483,11 +1535,16 @@ blkiocg_create(struct cgroup_subsys *sub
return ERR_PTR(-ENOMEM);
blkcg->weight = BLKIO_WEIGHT_DEFAULT;
+ parent_blkcg = cgroup_to_blkio_cgroup(parent);
done:
spin_lock_init(&blkcg->lock);
INIT_HLIST_HEAD(&blkcg->blkg_list);
INIT_LIST_HEAD(&blkcg->policy_list);
+ if (parent)
+ blkcg->throtl_use_hier = parent_blkcg->throtl_use_hier;
+ else
+ blkcg->throtl_use_hier = 0;
return &blkcg->css;
}
Index: linux-2.6/block/blk-cgroup.h
===================================================================
--- linux-2.6.orig/block/blk-cgroup.h 2010-11-19 10:15:56.321149940 -0500
+++ linux-2.6/block/blk-cgroup.h 2010-11-19 10:30:29.885671705 -0500
@@ -100,11 +100,13 @@ enum blkcg_file_name_throtl {
BLKIO_THROTL_write_iops_device,
BLKIO_THROTL_io_service_bytes,
BLKIO_THROTL_io_serviced,
+ BLKIO_THROTL_use_hierarchy,
};
struct blkio_cgroup {
struct cgroup_subsys_state css;
unsigned int weight;
+ bool throtl_use_hier;
spinlock_t lock;
struct hlist_head blkg_list;
/*
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 6/8] blkio-cgroup: "use_hierarchy" interface without any functionality.
2010-12-15 21:26 ` Vivek Goyal
@ 2010-12-16 2:42 ` Gui Jianfeng
2010-12-16 15:44 ` Vivek Goyal
0 siblings, 1 reply; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-16 2:42 UTC (permalink / raw)
To: Vivek Goyal
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Vivek Goyal wrote:
> On Mon, Dec 13, 2010 at 09:45:07AM +0800, Gui Jianfeng wrote:
>> This patch adds "use_hierarchy" in the root cgroup without any functionality.
>>
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>> ---
>> block/blk-cgroup.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
>> block/blk-cgroup.h | 5 +++-
>> block/cfq-iosched.c | 24 +++++++++++++++++
>> 3 files changed, 97 insertions(+), 4 deletions(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index 455768a..9747ebb 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -25,7 +25,10 @@
>> static DEFINE_SPINLOCK(blkio_list_lock);
>> static LIST_HEAD(blkio_list);
>>
>> -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
>> +struct blkio_cgroup blkio_root_cgroup = {
>> + .weight = 2*BLKIO_WEIGHT_DEFAULT,
>> + .use_hierarchy = 1,
>
> Currently flat mode is the default. Let's not change the default. So let's
> start with use_hierarchy = 0.
OK, will do.
>
> Secondly, why don't you make it per cgroup something along the lines of
> memory controller where one can start the hierarchy lower in the cgroup
> chain and not necessarily at the root. This way we can avoid some
> accounting overhead for all the groups which are non-hierarchical.
I'm not sure whether there's an actual use case that needs a per-cgroup "use_hierarchy".
So as a first step, I just provide a global "use_hierarchy" in the root group. If there are
actual requirements that need a per-cgroup "use_hierarchy", we may add the feature
later.
>
>> + };
>> EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>>
>> static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
>> @@ -1385,10 +1388,73 @@ struct cftype blkio_files[] = {
>> #endif
>> };
>>
>> +static u64 blkiocg_use_hierarchy_read(struct cgroup *cgroup,
>> + struct cftype *cftype)
>> +{
>> + struct blkio_cgroup *blkcg;
>> +
>> + blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + return (u64)blkcg->use_hierarchy;
>> +}
>> +
>> +static int
>> +blkiocg_use_hierarchy_write(struct cgroup *cgroup,
>> + struct cftype *cftype, u64 val)
>> +{
>> + struct blkio_cgroup *blkcg;
>> + struct blkio_group *blkg;
>> + struct hlist_node *n;
>> + struct blkio_policy_type *blkiop;
>> +
>> + blkcg = cgroup_to_blkio_cgroup(cgroup);
>> +
>> + if (val > 1 || !list_empty(&cgroup->children))
>> + return -EINVAL;
>> +
>> + if (blkcg->use_hierarchy == val)
>> + return 0;
>> +
>> + spin_lock(&blkio_list_lock);
>> + blkcg->use_hierarchy = val;
>> +
>> + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
>> + list_for_each_entry(blkiop, &blkio_list, list) {
>> + /*
>> + * If this policy does not own the blkg, do not change
>> + * cfq group scheduling mode.
>> + */
>> + if (blkiop->plid != blkg->plid)
>> + continue;
>> +
>> + if (blkiop->ops.blkio_update_use_hierarchy_fn)
>> + blkiop->ops.blkio_update_use_hierarchy_fn(blkg,
>> + val);
>
> Should we really allow this? I mean allow changing hierarchy of a group
> when there are already children groups. I think memory controller does
> not allow this. We can design along the same lines. Keep use_hierarchy
> as 0 by default. Allow changing it only if there are no children cgroups.
> Otherwise we shall have to send notifications to subscribing policies
> and then change their structure etc. Let's keep it simple.
Yes, I really don't allow changing use_hierarchy if there are children cgroups.
Please consider the following line in my patch.
if (val > 1 || !list_empty(&cgroup->children))
return -EINVAL;
>
> I was playing with a use_hierarchy patch for throttling, and parts of it have been
> copied from the memory controller. I am attaching it to this mail; see if
> you can make it work.
Thanks
Gui
>
> ---
> block/blk-cgroup.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> block/blk-cgroup.h | 2 +
> 2 files changed, 60 insertions(+), 1 deletion(-)
>
> Index: linux-2.6/block/blk-cgroup.c
> ===================================================================
> --- linux-2.6.orig/block/blk-cgroup.c 2010-11-19 10:30:27.129704770 -0500
> +++ linux-2.6/block/blk-cgroup.c 2010-11-19 10:30:29.885671705 -0500
> @@ -1214,6 +1214,39 @@ static int blkio_weight_write(struct blk
> return 0;
> }
>
> +static int blkio_throtl_use_hierarchy_write(struct cgroup *cgrp, u64 val)
> +{
> + struct cgroup *parent = cgrp->parent;
> + struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
> + int ret = 0;
> +
> + if (val != 0 && val != 1)
> + return -EINVAL;
> +
> + blkcg = cgroup_to_blkio_cgroup(cgrp);
> + if (parent)
> + parent_blkcg = cgroup_to_blkio_cgroup(parent);
> +
> + cgroup_lock();
> + /*
> + * If parent's use_hierarchy is set, we can't make any modifications
> + * in the child subtrees. If it is unset, then the change can
> + * occur, provided the current cgroup has no children.
> + *
> + * For the root cgroup, parent_blkcg is NULL, we allow the value to be
> + * set if there are no children.
> + */
> + if (!parent_blkcg || !parent_blkcg->throtl_use_hier) {
> + if (list_empty(&cgrp->children))
> + blkcg->throtl_use_hier = val;
> + else
> + ret = -EBUSY;
> + } else
> + ret = -EINVAL;
> + cgroup_unlock();
> + return ret;
> +}
> +
> static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
> struct blkio_cgroup *blkcg;
> enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
> @@ -1228,6 +1261,12 @@ static u64 blkiocg_file_read_u64 (struct
> return (u64)blkcg->weight;
> }
> break;
> + case BLKIO_POLICY_THROTL:
> + switch(name) {
> + case BLKIO_THROTL_use_hierarchy:
> + return (u64)blkcg->throtl_use_hier;
> + }
> + break;
> default:
> BUG();
> }
> @@ -1250,6 +1289,12 @@ blkiocg_file_write_u64(struct cgroup *cg
> return blkio_weight_write(blkcg, val);
> }
> break;
> + case BLKIO_POLICY_THROTL:
> + switch(name) {
> + case BLKIO_THROTL_use_hierarchy:
> + return blkio_throtl_use_hierarchy_write(cgrp, val);
> + }
> + break;
> default:
> BUG();
> }
> @@ -1373,6 +1418,13 @@ struct cftype blkio_files[] = {
> BLKIO_THROTL_io_serviced),
> .read_map = blkiocg_file_read_map,
> },
> + {
> + .name = "throttle.use_hierarchy",
> + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
> + BLKIO_THROTL_use_hierarchy),
> + .read_u64 = blkiocg_file_read_u64,
> + .write_u64 = blkiocg_file_write_u64,
> + },
> #endif /* CONFIG_BLK_DEV_THROTTLING */
>
> #ifdef CONFIG_DEBUG_BLK_CGROUP
> @@ -1470,7 +1522,7 @@ static void blkiocg_destroy(struct cgrou
> static struct cgroup_subsys_state *
> blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
> {
> - struct blkio_cgroup *blkcg;
> + struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
> struct cgroup *parent = cgroup->parent;
>
> if (!parent) {
> @@ -1483,11 +1535,16 @@ blkiocg_create(struct cgroup_subsys *sub
> return ERR_PTR(-ENOMEM);
>
> blkcg->weight = BLKIO_WEIGHT_DEFAULT;
> + parent_blkcg = cgroup_to_blkio_cgroup(parent);
> done:
> spin_lock_init(&blkcg->lock);
> INIT_HLIST_HEAD(&blkcg->blkg_list);
>
> INIT_LIST_HEAD(&blkcg->policy_list);
> + if (parent)
> + blkcg->throtl_use_hier = parent_blkcg->throtl_use_hier;
> + else
> + blkcg->throtl_use_hier = 0;
> return &blkcg->css;
> }
>
> Index: linux-2.6/block/blk-cgroup.h
> ===================================================================
> --- linux-2.6.orig/block/blk-cgroup.h 2010-11-19 10:15:56.321149940 -0500
> +++ linux-2.6/block/blk-cgroup.h 2010-11-19 10:30:29.885671705 -0500
> @@ -100,11 +100,13 @@ enum blkcg_file_name_throtl {
> BLKIO_THROTL_write_iops_device,
> BLKIO_THROTL_io_service_bytes,
> BLKIO_THROTL_io_serviced,
> + BLKIO_THROTL_use_hierarchy,
> };
>
> struct blkio_cgroup {
> struct cgroup_subsys_state css;
> unsigned int weight;
> + bool throtl_use_hier;
> spinlock_t lock;
> struct hlist_head blkg_list;
> /*
>
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 6/8] blkio-cgroup: "use_hierarchy" interface without any functionality.
2010-12-16 2:42 ` Gui Jianfeng
@ 2010-12-16 15:44 ` Vivek Goyal
2010-12-17 3:06 ` Gui Jianfeng
0 siblings, 1 reply; 41+ messages in thread
From: Vivek Goyal @ 2010-12-16 15:44 UTC (permalink / raw)
To: Gui Jianfeng
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
On Thu, Dec 16, 2010 at 10:42:42AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Mon, Dec 13, 2010 at 09:45:07AM +0800, Gui Jianfeng wrote:
> >> This patch adds "use_hierarchy" in the root cgroup without any functionality.
> >>
> >> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> >> ---
> >> block/blk-cgroup.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
> >> block/blk-cgroup.h | 5 +++-
> >> block/cfq-iosched.c | 24 +++++++++++++++++
> >> 3 files changed, 97 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> >> index 455768a..9747ebb 100644
> >> --- a/block/blk-cgroup.c
> >> +++ b/block/blk-cgroup.c
> >> @@ -25,7 +25,10 @@
> >> static DEFINE_SPINLOCK(blkio_list_lock);
> >> static LIST_HEAD(blkio_list);
> >>
> >> -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
> >> +struct blkio_cgroup blkio_root_cgroup = {
> >> + .weight = 2*BLKIO_WEIGHT_DEFAULT,
> >> + .use_hierarchy = 1,
> >
> > Currently flat mode is the default. Let's not change the default. So let's
> > start with use_hierarchy = 0.
>
> OK, will do.
>
> >
> > Secondly, why don't you make it per cgroup something along the lines of
> > memory controller where one can start the hierarchy lower in the cgroup
> > chain and not necessarily at the root. This way we can avoid some
> > accounting overhead for all the groups which are non-hierarchical.
>
> I'm not sure whether there's an actual use case that needs a per-cgroup "use_hierarchy".
> So as a first step, I just provide a global "use_hierarchy" in the root group. If there are
> actual requirements that need a per-cgroup "use_hierarchy", we may add the feature
> later.
>
I think there is some use case. Currently libvirt creates its own cgroups
for each VM. Depending on what cgroup libvirtd has been placed in when it
started, it starts creating cgroups from there. So depending on the distro,
one might mount the blkio controller at /cgroup/blkio by default, and then
libvirt will create its own cgroups from there.
Now as of today, the default is flat, so I am not expecting the packages
which take care of mounting the blkio controller to suddenly change the
default to hierarchical.
Now if libvirt goes on to create its own cgroups under root cgroup
(/cgroup/blkio), then libvirt can't switch it to hierarchical even if
it wants to as children cgroups have already been created under root
and anyway libvirt is not supposed to control the settings of
use_hierarchy of root group.
So if we allow that a hierarchy can be defined from a child node, then
libvirt can easily do it only for its sub hierarchy.
pivot
/ \
root libvirtd
/ \
vm1 vm2
Here root will have use_hierarchy=0 and libvirtd will have use_hierarchy=1
Secondly, I am beginning to believe that updating the stats in
all the groups of the hierarchy might have significant overhead (though I don't
have the data yet), because you will take the blkcg->stats_lock of each cgroup
in the path for each IO completion, and CFQ updates so many stats. So
there also it might make sense to let libvirtd set use_hierarchy=1
if it needs to and incur the additional overhead, while the global default
will not run with use_hierarchy=1. I think libvirtd mounts the memory controller
as of today with use_hierarchy=0.
Also I don't think it is a lot of extra code to support per-cgroup
use_hierarchy. So to me it makes sense to do it right now. I am more
concerned about getting it right now because it is part of the user interface.
If we introduce something now and then change it two releases down the line,
then we are stuck with one more convention of use_hierarchy to support,
making life even more complicated.
So I would say do think it through; it should not be a lot of extra
code to support.
> >
> >> + };
> >> EXPORT_SYMBOL_GPL(blkio_root_cgroup);
> >>
> >> static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
> >> @@ -1385,10 +1388,73 @@ struct cftype blkio_files[] = {
> >> #endif
> >> };
> >>
> >> +static u64 blkiocg_use_hierarchy_read(struct cgroup *cgroup,
> >> + struct cftype *cftype)
> >> +{
> >> + struct blkio_cgroup *blkcg;
> >> +
> >> + blkcg = cgroup_to_blkio_cgroup(cgroup);
> >> + return (u64)blkcg->use_hierarchy;
> >> +}
> >> +
> >> +static int
> >> +blkiocg_use_hierarchy_write(struct cgroup *cgroup,
> >> + struct cftype *cftype, u64 val)
> >> +{
> >> + struct blkio_cgroup *blkcg;
> >> + struct blkio_group *blkg;
> >> + struct hlist_node *n;
> >> + struct blkio_policy_type *blkiop;
> >> +
> >> + blkcg = cgroup_to_blkio_cgroup(cgroup);
> >> +
> >> + if (val > 1 || !list_empty(&cgroup->children))
> >> + return -EINVAL;
> >> +
> >> + if (blkcg->use_hierarchy == val)
> >> + return 0;
> >> +
> >> + spin_lock(&blkio_list_lock);
> >> + blkcg->use_hierarchy = val;
> >> +
> >> + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
> >> + list_for_each_entry(blkiop, &blkio_list, list) {
> >> + /*
> >> + * If this policy does not own the blkg, do not change
> >> + * cfq group scheduling mode.
> >> + */
> >> + if (blkiop->plid != blkg->plid)
> >> + continue;
> >> +
> >> + if (blkiop->ops.blkio_update_use_hierarchy_fn)
> >> + blkiop->ops.blkio_update_use_hierarchy_fn(blkg,
> >> + val);
> >
> > Should we really allow this? I mean allow changing hierarchy of a group
> > when there are already children groups. I think memory controller does
> > not allow this. We can design along the same lines. Keep use_hierarchy
> > as 0 by default. Allow changing it only if there are no children cgroups.
> > Otherwise we shall have to send notifications to subscribing policies
> > and then change their structure etc. Let's keep it simple.
>
> Yes, I really don't allow changing use_hierarchy if there are children cgroups.
> Please consider the following line in my patch.
>
> if (val > 1 || !list_empty(&cgroup->children))
> return -EINVAL;
If there are no children cgroups, then there cannot be any children blkgs,
and there is no need to send any per-blkg notification to each policy?
Thanks
Vivek
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 6/8] blkio-cgroup: "use_hierarchy" interface without any functionality.
2010-12-16 15:44 ` Vivek Goyal
@ 2010-12-17 3:06 ` Gui Jianfeng
2010-12-17 23:03 ` Vivek Goyal
0 siblings, 1 reply; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-17 3:06 UTC (permalink / raw)
To: Vivek Goyal
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Vivek Goyal wrote:
> On Thu, Dec 16, 2010 at 10:42:42AM +0800, Gui Jianfeng wrote:
>> Vivek Goyal wrote:
>>> On Mon, Dec 13, 2010 at 09:45:07AM +0800, Gui Jianfeng wrote:
>>>> This patch adds "use_hierarchy" in the root cgroup without any functionality.
>>>>
>>>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>>>> ---
>>>> block/blk-cgroup.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
>>>> block/blk-cgroup.h | 5 +++-
>>>> block/cfq-iosched.c | 24 +++++++++++++++++
>>>> 3 files changed, 97 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>>> index 455768a..9747ebb 100644
>>>> --- a/block/blk-cgroup.c
>>>> +++ b/block/blk-cgroup.c
>>>> @@ -25,7 +25,10 @@
>>>> static DEFINE_SPINLOCK(blkio_list_lock);
>>>> static LIST_HEAD(blkio_list);
>>>>
>>>> -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
>>>> +struct blkio_cgroup blkio_root_cgroup = {
>>>> + .weight = 2*BLKIO_WEIGHT_DEFAULT,
>>>> + .use_hierarchy = 1,
>>> Currently flat mode is the default. Let's not change the default. So let's
>>> start with use_hierarchy = 0.
>> OK, will do.
>>
>>> Secondly, why don't you make it per cgroup something along the lines of
>>> memory controller where one can start the hierarchy lower in the cgroup
>>> chain and not necessarily at the root. This way we can avoid some
>>> accounting overhead for all the groups which are non-hierarchical.
>> I'm not sure whether there's an actual use case that needs a per-cgroup "use_hierarchy".
>> So as a first step, I just provide a global "use_hierarchy" in the root group. If there are
>> actual requirements that need a per-cgroup "use_hierarchy", we may add the feature
>> later.
>>
>
> I think there is some use case. Currently libvirt creates its own cgroups
> for each VM. Depending on what cgroup libvirtd has been placed in when it
> started, it starts creating cgroups from there. So depending on the distro,
> one might mount the blkio controller at /cgroup/blkio by default, and then
> libvirt will create its own cgroups from there.
>
> Now as of today, the default is flat, so I am not expecting the packages
> which take care of mounting the blkio controller to suddenly change the
> default to hierarchical.
>
> Now if libvirt goes on to create its own cgroups under root cgroup
> (/cgroup/blkio), then libvirt can't switch it to hierarchical even if
> it wants to as children cgroups have already been created under root
> and anyway libvirt is not supposed to control the settings of
> use_hierarchy of root group.
>
> So if we allow that a hierarchy can be defined from a child node, then
> libvirt can easily do it only for its sub hierarchy.
>
> pivot
> / \
> root libvirtd
> / \
> vm1 vm2
>
> Here root will have use_hierarchy=0 and libvirtd will have use_hierarchy=1
>
> Secondly, I am beginning to believe that updating the stats in
> all the groups of the hierarchy might have significant overhead (though I don't
> have the data yet), because you will take the blkcg->stats_lock of each cgroup
> in the path for each IO completion, and CFQ updates so many stats. So
> there also it might make sense to let libvirtd set use_hierarchy=1
> if it needs to and incur the additional overhead, while the global default
> will not run with use_hierarchy=1. I think libvirtd mounts the memory controller
> as of today with use_hierarchy=0.
>
> Also I don't think it is a lot of extra code to support per-cgroup
> use_hierarchy. So to me it makes sense to do it right now. I am more
> concerned about getting it right now because it is part of the user interface.
> If we introduce something now and then change it two releases down the line,
> then we are stuck with one more convention of use_hierarchy to support,
> making life even more complicated.
>
> So I would say do think it through; it should not be a lot of extra
> code to support.
>
>>>> + };
>>>> EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>>>>
>>>> static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
>>>> @@ -1385,10 +1388,73 @@ struct cftype blkio_files[] = {
>>>> #endif
>>>> };
>>>>
>>>> +static u64 blkiocg_use_hierarchy_read(struct cgroup *cgroup,
>>>> + struct cftype *cftype)
>>>> +{
>>>> + struct blkio_cgroup *blkcg;
>>>> +
>>>> + blkcg = cgroup_to_blkio_cgroup(cgroup);
>>>> + return (u64)blkcg->use_hierarchy;
>>>> +}
>>>> +
>>>> +static int
>>>> +blkiocg_use_hierarchy_write(struct cgroup *cgroup,
>>>> + struct cftype *cftype, u64 val)
>>>> +{
>>>> + struct blkio_cgroup *blkcg;
>>>> + struct blkio_group *blkg;
>>>> + struct hlist_node *n;
>>>> + struct blkio_policy_type *blkiop;
>>>> +
>>>> + blkcg = cgroup_to_blkio_cgroup(cgroup);
>>>> +
>>>> + if (val > 1 || !list_empty(&cgroup->children))
>>>> + return -EINVAL;
>>>> +
>>>> + if (blkcg->use_hierarchy == val)
>>>> + return 0;
>>>> +
>>>> + spin_lock(&blkio_list_lock);
>>>> + blkcg->use_hierarchy = val;
>>>> +
>>>> + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
>>>> + list_for_each_entry(blkiop, &blkio_list, list) {
>>>> + /*
>>>> + * If this policy does not own the blkg, do not change
>>>> + * cfq group scheduling mode.
>>>> + */
>>>> + if (blkiop->plid != blkg->plid)
>>>> + continue;
>>>> +
>>>> + if (blkiop->ops.blkio_update_use_hierarchy_fn)
>>>> + blkiop->ops.blkio_update_use_hierarchy_fn(blkg,
>>>> + val);
>>> Should we really allow this? I mean allow changing hierarchy of a group
>>> when there are already children groups. I think memory controller does
>>> not allow this. We can design along the same lines. Keep use_hierarchy
>>> as 0 by default. Allow changing it only if there are no children cgroups.
>>> Otherwise we shall have to send notifications to subscribing policies
>>> and then change their structure etc. Let's keep it simple.
>> Yes, I really don't allow changing use_hierarchy if there are children cgroups.
>> Please consider the following line in my patch.
>>
>> if (val > 1 || !list_empty(&cgroup->children))
>> return -EINVAL;
>
> If there are no children cgroups, then there can not be any children blkg
> and there is no need to send any per blkg notification to each policy?
Firstly, in my patch, the per-blkg notification only happens on the root blkg.
Secondly, the root cfqg is put onto the "flat_service_tree" in flat mode,
whereas in hierarchical mode it doesn't belong to anybody. When switching, we
have to tell the root cfqg to move onto or off the "flat_service_tree".
Anyway, if we're going to put the root cfqg onto the grp_service_tree regardless of
flat or hierarchical mode, this piece of code can go away.
Thanks,
Gui
>
> Thanks
> Vivek
>
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 6/8] blkio-cgroup: "use_hierarchy" interface without any functionality.
2010-12-17 3:06 ` Gui Jianfeng
@ 2010-12-17 23:03 ` Vivek Goyal
0 siblings, 0 replies; 41+ messages in thread
From: Vivek Goyal @ 2010-12-17 23:03 UTC (permalink / raw)
To: Gui Jianfeng
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
On Fri, Dec 17, 2010 at 11:06:46AM +0800, Gui Jianfeng wrote:
[..]
> >>>> +static int
> >>>> +blkiocg_use_hierarchy_write(struct cgroup *cgroup,
> >>>> + struct cftype *cftype, u64 val)
> >>>> +{
> >>>> + struct blkio_cgroup *blkcg;
> >>>> + struct blkio_group *blkg;
> >>>> + struct hlist_node *n;
> >>>> + struct blkio_policy_type *blkiop;
> >>>> +
> >>>> + blkcg = cgroup_to_blkio_cgroup(cgroup);
> >>>> +
> >>>> + if (val > 1 || !list_empty(&cgroup->children))
> >>>> + return -EINVAL;
> >>>> +
> >>>> + if (blkcg->use_hierarchy == val)
> >>>> + return 0;
> >>>> +
> >>>> + spin_lock(&blkio_list_lock);
> >>>> + blkcg->use_hierarchy = val;
> >>>> +
> >>>> + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
> >>>> + list_for_each_entry(blkiop, &blkio_list, list) {
> >>>> + /*
> >>>> + * If this policy does not own the blkg, do not change
> >>>> + * cfq group scheduling mode.
> >>>> + */
> >>>> + if (blkiop->plid != blkg->plid)
> >>>> + continue;
> >>>> +
> >>>> + if (blkiop->ops.blkio_update_use_hierarchy_fn)
> >>>> + blkiop->ops.blkio_update_use_hierarchy_fn(blkg,
> >>>> + val);
> >>> Should we really allow this? I mean allow changing hierarchy of a group
> >>> when there are already children groups. I think memory controller does
> >>> not allow this. We can design along the same lines. Keep use_hierarchy
> >>> as 0 by default. Allow changing it only if there are no children cgroups.
> >>> Otherwise we shall have to send notifications to subscribing policies
> >>> and then change their structure etc. Let's keep it simple.
> >> Yes, I really don't allow changing use_hierarchy if there are children cgroups.
> >> Please consider the following line in my patch.
> >>
> >> if (val > 1 || !list_empty(&cgroup->children))
> >> return -EINVAL;
> >
> > If there are no children cgroups, then there cannot be any children blkgs,
> > and there is no need to send any per-blkg notification to each policy?
>
> Firstly, in my patch, the per-blkg notification only happens on the root blkg.
> Secondly, the root cfqg is put onto the "flat_service_tree" in flat mode,
> whereas in hierarchical mode it doesn't belong to anybody. When switching, we
> have to tell the root cfqg to move onto or off the "flat_service_tree".
>
> Anyway, if we're going to put the root cfqg onto the grp_service_tree regardless of
> flat or hierarchical mode, this piece of code can go away.
>
Exactly. Keeping everything on the grp_service_tree both for flat and
hierarchical modes will make sure there is no root-group moving-around business
and no notification is needed when use_hierarchy is set.
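The enqueue path then needs no special casing either. Roughly (sketch only; it elides the vdisktime placement and the nr_subgp accounting of your series, and assumes the root entity's service_tree points to cfqd->grp_service_tree with parent == NULL):

	static void
	cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
	{
		struct cfq_entity *cfqe = &cfqg->cfqe;

		cfqg->nr_cfqq++;

		/* queue this entity and any not-yet-queued ancestors */
		while (cfqe && RB_EMPTY_NODE(&cfqe->rb_node)) {
			struct cfq_rb_root *st = cfqe->service_tree;

			cfqe->vdisktime = st->min_vdisktime;
			cfq_entity_service_tree_add(st, cfqe);
			cfqe = cfqe->parent;
		}
	}

In flat mode every group entity has parent == NULL and sits directly on the grp_service_tree, so the same loop covers both modes.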
Thanks
Vivek
^ permalink raw reply [flat|nested] 41+ messages in thread
* [PATCH 7/8] cfq-iosched: Add flat mode and switch between two modes by "use_hierarchy"
[not found] ` <4D01C6AB.9040807@cn.fujitsu.com>
` (5 preceding siblings ...)
2010-12-13 1:45 ` [PATCH 6/8] blkio-cgroup: "use_hierarchy" interface without any functionality Gui Jianfeng
@ 2010-12-13 1:45 ` Gui Jianfeng
2010-12-20 19:43 ` Vivek Goyal
2010-12-13 1:45 ` [PATCH 8/8] blkio-cgroup: Document for blkio.use_hierarchy Gui Jianfeng
7 siblings, 1 reply; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-13 1:45 UTC (permalink / raw)
To: Jens Axboe, Vivek Goyal
Cc: Gui Jianfeng, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Add a flat CFQ group scheduling mode and switch between the two modes
via "use_hierarchy". Currently, it works when only the root
group is available.
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
block/blk-cgroup.c | 2 +-
block/cfq-iosched.c | 357 +++++++++++++++++++++++++++++++++++++++------------
2 files changed, 276 insertions(+), 83 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 9747ebb..baa286b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -27,7 +27,7 @@ static LIST_HEAD(blkio_list);
struct blkio_cgroup blkio_root_cgroup = {
.weight = 2*BLKIO_WEIGHT_DEFAULT,
- .use_hierarchy = 1,
+ .use_hierarchy = 0,
};
EXPORT_SYMBOL_GPL(blkio_root_cgroup);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 08323f5..cbd23f6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -241,6 +241,9 @@ struct cfq_data {
/* cfq group schedule in flat or hierarchy manner. */
bool use_hierarchy;
+ /* Service tree for cfq group flat scheduling mode. */
+ struct cfq_rb_root flat_service_tree;
+
/*
* The priority currently being served
*/
@@ -647,12 +650,16 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
struct cfq_entity *cfqe = &cfqg->cfqe;
struct cfq_rb_root *st = cfqe->service_tree;
- if (st)
- return cfq_target_latency * cfqe->weight
- / st->total_weight;
- else
- /* If this is the root group, give it a full slice. */
- return cfq_target_latency;
+ if (cfqd->use_hierarchy) {
+ if (st)
+ return cfq_target_latency * cfqe->weight
+ / st->total_weight;
+ else
+ /* If this is the root group, give it a full slice. */
+ return cfq_target_latency;
+ } else {
+ return cfq_target_latency * cfqe->weight / st->total_weight;
+ }
}
static inline void
@@ -915,24 +922,50 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
/*
* Root group doesn't belong to any service tree
*/
- if (cfqg == &cfqd->root_group)
+ if (cfqd->use_hierarchy && cfqg == &cfqd->root_group)
return;
if (!RB_EMPTY_NODE(&cfqe->rb_node))
return;
- /*
- * Enqueue this group and its ancestors onto their service tree.
- */
- while (cfqe && cfqe->parent) {
+ if (cfqd->use_hierarchy) {
+ /*
+ * Enqueue this group and its ancestors onto their service
+ * tree.
+ */
+ while (cfqe && cfqe->parent) {
+ if (!RB_EMPTY_NODE(&cfqe->rb_node))
+ return;
+
+ /*
+ * Currently put the group at the end. Later implement
+ * something so that groups get a lesser vdisktime based on
+ * their weights, so that a group does not lose everything
+ * if it was not continuously backlogged.
+ */
+ st = cfqe->service_tree;
+ n = rb_last(&st->rb);
+ if (n) {
+ entity = rb_entry_entity(n);
+ cfqe->vdisktime = entity->vdisktime +
+ CFQ_IDLE_DELAY;
+ } else
+ cfqe->vdisktime = st->min_vdisktime;
+
+ cfq_entity_service_tree_add(st, cfqe);
+ cfqe = cfqe->parent;
+ __cfqg = cfqg_of_entity(cfqe);
+ __cfqg->nr_subgp++;
+ }
+ } else {
if (!RB_EMPTY_NODE(&cfqe->rb_node))
return;
/*
* Currently put the group at the end. Later implement
- * something so that groups get lesser vtime based on their
- * weights, so that if group does not loose all if it was not
- * continously backlogged.
+ * something so that groups get a lesser vdisktime based on
+ * their weights, so that a group does not lose everything
+ * if it was not continuously backlogged.
*/
st = cfqe->service_tree;
n = rb_last(&st->rb);
@@ -943,10 +976,11 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
} else
cfqe->vdisktime = st->min_vdisktime;
- cfq_entity_service_tree_add(st, cfqe);
- cfqe = cfqe->parent;
- __cfqg = cfqg_of_entity(cfqe);
- __cfqg->nr_subgp++;
+ /*
+ * For flat mode, all cfq groups schedule on the global service
+ * tree (cfqd->flat_service_tree).
+ */
+ cfq_entity_service_tree_add(cfqe->service_tree, cfqe);
}
}
@@ -975,35 +1009,46 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
BUG_ON(cfqg->nr_cfqq < 1);
cfqg->nr_cfqq--;
- /*
- * Root group doesn't belongs to any service
- */
- if (cfqg == &cfqd->root_group)
- return;
-
/* If there are other cfq queues under this group, don't delete it */
if (cfqg->nr_cfqq)
return;
- /* If child group exists, don't dequeue it */
- if (cfqg->nr_subgp)
- return;
- /*
- * Dequeue this group and its ancestors from their service tree.
- */
- while (cfqe && cfqe->parent) {
- __cfqg = cfqg_of_entity(cfqe);
- p_cfqg = cfqg_of_entity(cfqe->parent);
- cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
- cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
- cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
- __cfqg->saved_workload_slice = 0;
- cfqe = cfqe->parent;
- p_cfqg->nr_subgp--;
- if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+ if (cfqd->use_hierarchy) {
+ /*
+ * Root group doesn't belong to any service tree.
+ */
+ if (cfqg == &cfqd->root_group)
+ return;
+
+ /* If child group exists, don't dequeue it */
+ if (cfqg->nr_subgp)
return;
+
+ /*
+ * Dequeue this group and its ancestors from their service
+ * tree.
+ */
+ while (cfqe && cfqe->parent) {
+ __cfqg = cfqg_of_entity(cfqe);
+ p_cfqg = cfqg_of_entity(cfqe->parent);
+ cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
+ cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
+ cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
+ __cfqg->saved_workload_slice = 0;
+ cfqe = cfqe->parent;
+ p_cfqg->nr_subgp--;
+ if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
+ return;
+ }
+ } else {
+ /* Dequeue from the flat service tree. */
+ cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
+ cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
+ cfqg->saved_workload_slice = 0;
+ cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
}
+
}
static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -1048,19 +1093,31 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
charge = cfqq->allocated_slice;
- /*
- * Update the vdisktime on the whole chain.
- */
- while (cfqe && cfqe->parent) {
- struct cfq_rb_root *st = cfqe->service_tree;
+ if (cfqd->use_hierarchy) {
+ /*
+ * Update the vdisktime on the whole chain.
+ */
+ while (cfqe && cfqe->parent) {
+ struct cfq_rb_root *st = cfqe->service_tree;
- /* Can't update vdisktime while group is on service tree */
- __cfq_entity_service_tree_del(st, cfqe);
+ /*
+ * Can't update vdisktime while group is on service
+ * tree.
+ */
+ __cfq_entity_service_tree_del(st, cfqe);
+ cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
+ __cfq_entity_service_tree_add(st, cfqe);
+ st->count++;
+ cfqe->reposition_time = jiffies;
+ cfqe = cfqe->parent;
+ }
+ } else {
+ /* For flat mode, just charge the group itself. */
+ __cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
- __cfq_entity_service_tree_add(st, cfqe);
- st->count++;
+ __cfq_entity_service_tree_add(cfqe->service_tree, cfqe);
+ cfqe->service_tree->count++;
cfqe->reposition_time = jiffies;
- cfqe = cfqe->parent;
}
@@ -1097,13 +1154,36 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
cfqg_of_blkg(blkg)->cfqe.weight = weight;
}
+static int cfq_forced_dispatch(struct cfq_data *cfqd);
+
void
cfq_update_blkio_use_hierarchy(struct blkio_group *blkg, bool val)
{
+ unsigned long flags;
struct cfq_group *cfqg;
+ struct cfq_data *cfqd;
+ struct cfq_entity *cfqe;
+ int nr;
+ /* Get root group here */
cfqg = cfqg_of_blkg(blkg);
+ cfqd = cfqg->cfqd;
+
+ spin_lock_irqsave(cfqd->queue->queue_lock, flags);
+
+ /* Drain all requests */
+ nr = cfq_forced_dispatch(cfqd);
+
+ cfqe = &cfqg->cfqe;
+
+ if (!val)
+ cfqe->service_tree = &cfqd->flat_service_tree;
+ else
+ cfqe->service_tree = NULL;
+
cfqg->cfqd->use_hierarchy = val;
+
+ spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}
static void init_cfqe(struct blkio_cgroup *blkcg,
@@ -1164,6 +1244,12 @@ static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
cfqe = &cfqg->cfqe;
+ if (!p_cfqg) {
+ cfqe->service_tree = &cfqd->flat_service_tree;
+ cfqe->parent = NULL;
+ return;
+ }
+
p_cfqe = &p_cfqg->cfqe;
cfqe->parent = p_cfqe;
@@ -1223,6 +1309,36 @@ int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
return 0;
}
+static struct cfq_group *cfqg_alloc(struct cfq_data *cfqd,
+ struct cgroup *cgroup)
+{
+ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+ struct cfq_group *cfqg;
+ void *key = cfqd;
+
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg) {
+ if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ }
+ return cfqg;
+ }
+
+ cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
+ if (!cfqg)
+ return NULL;
+
+ init_cfqg(cfqd, blkcg, cfqg);
+
+ cfqg_set_parent(cfqd, cfqg, NULL);
+
+ return cfqg;
+}
+
+
static struct cfq_group *
cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
{
@@ -1242,15 +1358,23 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
if (cfqg || !create)
goto done;
- /*
- * For hierarchical cfq group scheduling, we need to allocate
- * the whole cfq group chain.
- */
- ret = cfqg_chain_alloc(cfqd, cgroup);
- if (!ret) {
- cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
- BUG_ON(cfqg == NULL);
- goto done;
+ if (cfqd->use_hierarchy) {
+ /*
+ * For hierarchical cfq group scheduling, we need to allocate
+ * the whole cfq group chain.
+ */
+ ret = cfqg_chain_alloc(cfqd, cgroup);
+ if (!ret) {
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ BUG_ON(cfqg == NULL);
+ goto done;
+ }
+ } else {
+ /*
+ * For flat cfq group scheduling, we just need to allocate a
+ * single cfq group.
+ */
+ cfqg = cfqg_alloc(cfqd, cgroup);
}
done:
@@ -2484,6 +2608,24 @@ struct cfq_entity *choose_serving_entity(struct cfq_data *cfqd,
return cfq_rb_first(service_tree);
}
+
+
+static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
+{
+ struct cfq_rb_root *st = &cfqd->flat_service_tree;
+ struct cfq_group *cfqg;
+ struct cfq_entity *cfqe;
+
+ if (RB_EMPTY_ROOT(&st->rb))
+ return NULL;
+
+ cfqe = cfq_rb_first(st);
+ cfqg = cfqg_of_entity(cfqe);
+ BUG_ON(!cfqg);
+ return cfqg;
+}
+
+
/*
* Select a queue for service. If we have a current active queue,
* check whether to continue servicing it, or retrieve and set a new one.
@@ -2592,22 +2734,41 @@ new_queue:
* Current queue expired. Check if we have to switch to a new
* service tree
*/
- cfqg = &cfqd->root_group;
- if (!new_cfqq) {
- do {
- entity = choose_serving_entity(cfqd, cfqg);
- if (entity && !entity->is_group_entity) {
- /* This is the CFQ queue that should run */
- new_cfqq = cfqq_of_entity(entity);
- cfqd->serving_group = cfqg;
- set_workload_expire(cfqd, cfqg);
- break;
- } else if (entity && entity->is_group_entity) {
- /* Continue to lookup in this CFQ group */
- cfqg = cfqg_of_entity(entity);
- }
- } while (entity && entity->is_group_entity);
+ if (cfqd->use_hierarchy) {
+ cfqg = &cfqd->root_group;
+
+ if (!new_cfqq) {
+ do {
+ entity = choose_serving_entity(cfqd, cfqg);
+ if (entity && !entity->is_group_entity) {
+ /*
+ * This is the CFQ queue that should
+ * run.
+ */
+ new_cfqq = cfqq_of_entity(entity);
+ cfqd->serving_group = cfqg;
+ set_workload_expire(cfqd, cfqg);
+ break;
+ } else if (entity && entity->is_group_entity) {
+ /*
+ * Continue to lookup in this CFQ
+ * group.
+ */
+ cfqg = cfqg_of_entity(entity);
+ }
+ } while (entity && entity->is_group_entity);
+ }
+ } else {
+ /* Select a CFQ group from flat service tree. */
+ cfqg = cfq_get_next_cfqg(cfqd);
+ cfqd->serving_group = cfqg;
+ entity = choose_serving_entity(cfqd, cfqg);
+ if (entity) {
+ BUG_ON(entity->is_group_entity);
+ new_cfqq = cfqq_of_entity(entity);
+ set_workload_expire(cfqd, cfqg);
+ }
}
cfqq = cfq_set_active_queue(cfqd, new_cfqq);
@@ -2631,6 +2792,28 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
return dispatched;
}
+static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
+{
+ struct cfq_group *cfqg;
+ struct cfq_entity *cfqe;
+ int i, j;
+ struct cfq_rb_root *st;
+
+ if (!cfqd->rq_queued)
+ return NULL;
+
+ cfqg = cfq_get_next_cfqg(cfqd);
+ if (!cfqg)
+ return NULL;
+
+ for_each_cfqg_st(cfqg, i, j, st) {
+ cfqe = cfq_rb_first(st);
+ if (cfqe != NULL)
+ return cfqq_of_entity(cfqe);
+ }
+ return NULL;
+}
+
/*
* Drain our current requests. Used for barriers and when switching
* io schedulers on-the-fly.
@@ -2644,16 +2827,26 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
/* Expire the timeslice of the current active queue first */
cfq_slice_expired(cfqd, 0);
- while ((entity = cfq_get_next_entity_forced(cfqd, root)) != NULL) {
- BUG_ON(entity->is_group_entity);
- cfqq = cfqq_of_entity(entity);
- __cfq_set_active_queue(cfqd, cfqq);
- dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+
+ if (cfqd->use_hierarchy) {
+ while ((entity =
+ cfq_get_next_entity_forced(cfqd, root)) != NULL) {
+ BUG_ON(entity->is_group_entity);
+ cfqq = cfqq_of_entity(entity);
+ __cfq_set_active_queue(cfqd, cfqq);
+ dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+ }
+ } else {
+ while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
+ __cfq_set_active_queue(cfqd, cfqq);
+ dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+ }
}
BUG_ON(cfqd->busy_queues);
cfq_log(cfqd, "forced_dispatch=%d", dispatched);
+
return dispatched;
}
@@ -4190,7 +4383,7 @@ static void *cfq_init_queue(struct request_queue *q)
/* Give preference to root group over other groups */
cfqg->cfqe.weight = 2*BLKIO_WEIGHT_DEFAULT;
cfqg->cfqe.is_group_entity = true;
- cfqg->cfqe.parent = NULL;
+ cfqg_set_parent(cfqd, cfqg, NULL);
#ifdef CONFIG_CFQ_GROUP_IOSCHED
/*
@@ -4244,8 +4437,8 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->cfq_group_isolation = 0;
cfqd->hw_tag = -1;
- /* hierarchical scheduling for cfq group by default */
- cfqd->use_hierarchy = 1;
+ /* flat scheduling for cfq group by default */
+ cfqd->use_hierarchy = 0;
/*
* we optimistically start assuming sync ops weren't delayed in last
--
1.6.5.2
^ permalink raw reply related [flat|nested] 41+ messages in thread
* Re: [PATCH 7/8] cfq-iosched: Add flat mode and switch between two modes by "use_hierarchy"
2010-12-13 1:45 ` [PATCH 7/8] cfq-iosched: Add flat mode and switch between two modes by "use_hierarchy" Gui Jianfeng
@ 2010-12-20 19:43 ` Vivek Goyal
0 siblings, 0 replies; 41+ messages in thread
From: Vivek Goyal @ 2010-12-20 19:43 UTC (permalink / raw)
To: Gui Jianfeng
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
On Mon, Dec 13, 2010 at 09:45:14AM +0800, Gui Jianfeng wrote:
[..]
>
> + /* Service tree for cfq group flat scheduling mode. */
> + struct cfq_rb_root flat_service_tree;
> +
> /*
> * The priority currently being served
> */
> @@ -647,12 +650,16 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
> struct cfq_entity *cfqe = &cfqg->cfqe;
> struct cfq_rb_root *st = cfqe->service_tree;
>
> - if (st)
> - return cfq_target_latency * cfqe->weight
> - / st->total_weight;
> - else
> - /* If this is the root group, give it a full slice. */
> - return cfq_target_latency;
> + if (cfqd->use_hierarchy) {
> + if (st)
> + return cfq_target_latency * cfqe->weight
> + / st->total_weight;
> + else
> + /* If this is the root group, give it a full slice. */
> + return cfq_target_latency;
> + } else {
> + return cfq_target_latency * cfqe->weight / st->total_weight;
> + }
> }
Once you have introduced the notion of an entity and its weight, I think
you can simplify things a bit and come up with a notion of entity slice
in a hierarchy, so that we can avoid separate mechanisms for queues and
groups.
There can be multiple ways of doing this, and you will have to see which
simple way works. For queues we were keeping track of the average number
of queues and estimating the slice length that way. You could try keeping
track of the average number of entities in a group, or something like
that. But do think of everything in terms of entities now and simplify
the logic a bit.
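As a rough illustration of that direction, a hedged sketch of a single
slice helper that works purely in terms of entities (cfq_entity_slice()
is a hypothetical name, and the full-slice fallback for entities off any
tree is an assumption, not something this series implements):

/*
 * Hypothetical helper: one slice computation shared by queue and
 * group entities, based only on weight and the entity's service tree.
 */
static unsigned int cfq_entity_slice(struct cfq_data *cfqd,
				     struct cfq_entity *cfqe)
{
	struct cfq_rb_root *st = cfqe->service_tree;

	/* Entities not on any tree (e.g. the root) get a full slice. */
	if (!st || !st->total_weight)
		return cfq_target_latency;

	return cfq_target_latency * cfqe->weight / st->total_weight;
}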
Thanks
Vivek
^ permalink raw reply [flat|nested] 41+ messages in thread
* [PATCH 8/8] blkio-cgroup: Document for blkio.use_hierarchy.
[not found] ` <4D01C6AB.9040807@cn.fujitsu.com>
` (6 preceding siblings ...)
2010-12-13 1:45 ` [PATCH 7/8] cfq-iosched: Add flat mode and switch between two modes by "use_hierarchy" Gui Jianfeng
@ 2010-12-13 1:45 ` Gui Jianfeng
2010-12-13 15:10 ` Vivek Goyal
7 siblings, 1 reply; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-13 1:45 UTC (permalink / raw)
To: Jens Axboe, Vivek Goyal
Cc: Gui Jianfeng, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Document for blkio.use_hierarchy.
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
---
Documentation/cgroups/blkio-controller.txt | 58 +++++++++++++++++++---------
1 files changed, 39 insertions(+), 19 deletions(-)
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 4ed7b5c..9c6dc9e 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -91,30 +91,44 @@ Throttling/Upper Limit policy
Hierarchical Cgroups
====================
-- Currently none of the IO control policy supports hierarhical groups. But
- cgroup interface does allow creation of hierarhical cgroups and internally
- IO policies treat them as flat hierarchy.
+- The cgroup interface allows creation of hierarchical cgroups. Currently,
+ the IO policies can internally treat them either as a flat hierarchy or
+ as a true hierarchy. Both hierarchical and flat bandwidth division are
+ supported. "blkio.use_hierarchy" can be used to switch between flat mode
+ and hierarchical mode.
- So this patch will allow creation of cgroup hierarhcy but at the backend
- everything will be treated as flat. So if somebody created a hierarchy like
- as follows.
+ Consider the following CGroup hierarchy:
- root
- / \
- test1 test2
- |
- test3
+ Root
+ / | \
+ Grp1 Grp2 tsk1
+ / \
+ Grp3 tsk2
- CFQ and throttling will practically treat all groups at same level.
+ If flat mode is enabled, CFQ and throttling will practically treat all
+ groups at the same level.
- pivot
- / | \ \
- root test1 test2 test3
+ Pivot tree
+ / | | \
+ Root Grp1 Grp2 Grp3
+ / |
+ tsk1 tsk2
- Down the line we can implement hierarchical accounting/control support
- and also introduce a new cgroup file "use_hierarchy" which will control
- whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
- This is how memory controller also has implemented the things.
+ If hierarchical mode is enabled, CFQ treats groups and tasks according to
+ their actual positions in the CGroup hierarchy.
+
+ Root
+ / | \
+ Grp1 Grp2 tsk1
+ / \
+ Grp3 tsk2
+
+ Grp1, Grp2 and tsk1 are treated at the same level under Root group. Grp3 and
+ tsk2 are treated at the same level under Grp1. Below is the mapping between
+ task io priority and io weight:
+
+ prio 0 1 2 3 4 5 6 7
+ weight 1000 868 740 612 484 356 228 100
Various user visible config options
===================================
@@ -169,6 +183,12 @@ Proportional weight policy files
dev weight
8:16 300
+- blkio.use_hierarchy
+ - Switch between hierarchical mode and flat mode as stated above.
+ blkio.use_hierarchy == 1 means hierarchical mode is enabled.
+ blkio.use_hierarchy == 0 means flat mode is enabled.
+ The default mode is flat mode.
+
- blkio.time
- disk time allocated to cgroup per device in milliseconds. First
two fields specify the major and minor number of the device and
--
1.6.5.2
^ permalink raw reply related [flat|nested] 41+ messages in thread
* Re: [PATCH 8/8] blkio-cgroup: Document for blkio.use_hierarchy.
2010-12-13 1:45 ` [PATCH 8/8] blkio-cgroup: Document for blkio.use_hierarchy Gui Jianfeng
@ 2010-12-13 15:10 ` Vivek Goyal
2010-12-14 2:52 ` Gui Jianfeng
0 siblings, 1 reply; 41+ messages in thread
From: Vivek Goyal @ 2010-12-13 15:10 UTC (permalink / raw)
To: Gui Jianfeng
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
On Mon, Dec 13, 2010 at 09:45:22AM +0800, Gui Jianfeng wrote:
> Document for blkio.use_hierarchy.
>
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> ---
> Documentation/cgroups/blkio-controller.txt | 58 +++++++++++++++++++---------
> 1 files changed, 39 insertions(+), 19 deletions(-)
>
> diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
> index 4ed7b5c..9c6dc9e 100644
> --- a/Documentation/cgroups/blkio-controller.txt
> +++ b/Documentation/cgroups/blkio-controller.txt
> @@ -91,30 +91,44 @@ Throttling/Upper Limit policy
>
> Hierarchical Cgroups
> ====================
> -- Currently none of the IO control policy supports hierarhical groups. But
> - cgroup interface does allow creation of hierarhical cgroups and internally
> - IO policies treat them as flat hierarchy.
> +- The cgroup interface allows creation of hierarchical cgroups. Currently,
> + the IO policies can internally treat them either as a flat hierarchy or
> + as a true hierarchy. Both hierarchical and flat bandwidth division are
> + supported. "blkio.use_hierarchy" can be used to switch between flat mode
> + and hierarchical mode.
>
> - So this patch will allow creation of cgroup hierarhcy but at the backend
> - everything will be treated as flat. So if somebody created a hierarchy like
> - as follows.
> + Consider the following CGroup hierarchy:
>
> - root
> - / \
> - test1 test2
> - |
> - test3
> + Root
> + / | \
> + Grp1 Grp2 tsk1
> + / \
> + Grp3 tsk2
>
> - CFQ and throttling will practically treat all groups at same level.
> + If flat mode is enabled, CFQ and throttling will practically treat all
> + groups at the same level.
>
> - pivot
> - / | \ \
> - root test1 test2 test3
> + Pivot tree
> + / | | \
> + Root Grp1 Grp2 Grp3
> + / |
> + tsk1 tsk2
>
> - Down the line we can implement hierarchical accounting/control support
> - and also introduce a new cgroup file "use_hierarchy" which will control
> - whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
> - This is how memory controller also has implemented the things.
> + If hierarchical mode is enabled, CFQ treats groups and tasks according to
> + their actual positions in the CGroup hierarchy.
> +
> + Root
> + / | \
> + Grp1 Grp2 tsk1
> + / \
> + Grp3 tsk2
> +
> + Grp1, Grp2 and tsk1 are treated at the same level under Root group. Grp3 and
> + tsk2 are treated at the same level under Grp1. Below is the mapping between
> + task io priority and io weight:
> +
> + prio 0 1 2 3 4 5 6 7
> + weight 1000 868 740 612 484 356 228 100
I am curious why you chose the above mappings. The current prio to slice
mapping seems to be:
prio 0 1 2 3 4 5 6 7
slice 180 160 140 120 100 80 60 40
Now with the above weights, the difference between prio 0 and prio 7 will
be 10x, compared to 4.5x in the old scheme. Then again, there is the slice
offset logic, which tries to introduce more service differentiation.
Anyway, I am not particular about it, just curious. If it works well, then
it is fine.
>
> Various user visible config options
> ===================================
> @@ -169,6 +183,12 @@ Proportional weight policy files
> dev weight
> 8:16 300
>
> +- blkio.use_hierarchy
> + - Switch between hierarchical mode and flat mode as stated above.
> + blkio.use_hierarchy == 1 means hierarchical mode is enabled.
> + blkio.use_hierarchy == 0 means flat mode is enabled.
> + The default mode is flat mode.
> +
Can you please explicitly mention that blkio.use_hierarchy only affects
CFQ and has no impact on the "throttling" logic as of today? Throttling
will still continue to treat everything as flat.
I am working on making the throttling logic hierarchical. It has been
going kind of slowly; I expect it to be ready for 2.6.39.
Vivek
^ permalink raw reply [flat|nested] 41+ messages in thread
* Re: [PATCH 8/8] blkio-cgroup: Document for blkio.use_hierarchy.
2010-12-13 15:10 ` Vivek Goyal
@ 2010-12-14 2:52 ` Gui Jianfeng
0 siblings, 0 replies; 41+ messages in thread
From: Gui Jianfeng @ 2010-12-14 2:52 UTC (permalink / raw)
To: Vivek Goyal
Cc: Jens Axboe, Corrado Zoccolo, Chad Talbott, Nauman Rafique,
Divyesh Shah, linux kernel mailing list
Vivek Goyal wrote:
> On Mon, Dec 13, 2010 at 09:45:22AM +0800, Gui Jianfeng wrote:
>> Document for blkio.use_hierarchy.
>>
>> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
>> ---
>> Documentation/cgroups/blkio-controller.txt | 58 +++++++++++++++++++---------
>> 1 files changed, 39 insertions(+), 19 deletions(-)
>>
>> diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
>> index 4ed7b5c..9c6dc9e 100644
>> --- a/Documentation/cgroups/blkio-controller.txt
>> +++ b/Documentation/cgroups/blkio-controller.txt
>> @@ -91,30 +91,44 @@ Throttling/Upper Limit policy
>>
>> Hierarchical Cgroups
>> ====================
>> -- Currently none of the IO control policy supports hierarhical groups. But
>> - cgroup interface does allow creation of hierarhical cgroups and internally
>> - IO policies treat them as flat hierarchy.
>> +- The cgroup interface allows creation of hierarchical cgroups. Currently,
>> + the IO policies can internally treat them either as a flat hierarchy or
>> + as a true hierarchy. Both hierarchical and flat bandwidth division are
>> + supported. "blkio.use_hierarchy" can be used to switch between flat mode
>> + and hierarchical mode.
>>
>> - So this patch will allow creation of cgroup hierarhcy but at the backend
>> - everything will be treated as flat. So if somebody created a hierarchy like
>> - as follows.
>> + Consider the following CGroup hierarchy:
>>
>> - root
>> - / \
>> - test1 test2
>> - |
>> - test3
>> + Root
>> + / | \
>> + Grp1 Grp2 tsk1
>> + / \
>> + Grp3 tsk2
>>
>> - CFQ and throttling will practically treat all groups at same level.
>> + If flat mode is enabled, CFQ and throttling will practically treat all
>> + groups at the same level.
>>
>> - pivot
>> - / | \ \
>> - root test1 test2 test3
>> + Pivot tree
>> + / | | \
>> + Root Grp1 Grp2 Grp3
>> + / |
>> + tsk1 tsk2
>>
>> - Down the line we can implement hierarchical accounting/control support
>> - and also introduce a new cgroup file "use_hierarchy" which will control
>> - whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
>> - This is how memory controller also has implemented the things.
>> + If hierarchical mode is enabled, CFQ treats groups and tasks according to
>> + their actual positions in the CGroup hierarchy.
>> +
>> + Root
>> + / | \
>> + Grp1 Grp2 tsk1
>> + / \
>> + Grp3 tsk2
>> +
>> + Grp1, Grp2 and tsk1 are treated at the same level under Root group. Grp3 and
>> + tsk2 are treated at the same level under Grp1. Below is the mapping between
>> + task io priority and io weight:
>> +
>> + prio 0 1 2 3 4 5 6 7
>> + weight 1000 868 740 612 484 356 228 100
>
> I am curious why you chose the above mappings. The current prio to slice
> mapping seems to be:
>
> prio 0 1 2 3 4 5 6 7
> slice 180 160 140 120 100 80 60 40
>
> Now with the above weights, the difference between prio 0 and prio 7 will
> be 10x, compared to 4.5x in the old scheme. Then again, there is the slice
> offset logic, which tries to introduce more service differentiation.
> Anyway, I am not particular about it, just curious. If it works well, then
> it is fine.
Currently, since CFQ queues and CFQ groups are treated at the same level,
I'd like to map ioprio onto the whole range of io weights, hence this
mapping.
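For reference, the documented mapping expressed as a lookup table (purely
illustrative; the array name is hypothetical and the values are just the
ones from the documentation patch):

/* ioprio 0..7 mapped onto the full blkio weight range (100..1000). */
static const unsigned int cfq_prio_to_weight[8] = {
	1000, 868, 740, 612, 484, 356, 228, 100,
};

A queue's weight would then simply be cfq_prio_to_weight[ioprio].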
>
>>
>> Various user visible config options
>> ===================================
>> @@ -169,6 +183,12 @@ Proportional weight policy files
>> dev weight
>> 8:16 300
>>
>> +- blkio.use_hierarchy
>> + - Switch between hierarchical mode and flat mode as stated above.
>> + blkio.use_hierarchy == 1 means hierarchical mode is enabled.
>> + blkio.use_hierarchy == 0 means flat mode is enabled.
>> + The default mode is flat mode.
>> +
>
> Can you please explicitly mention that blkio.use_hierarchy only affects
> CFQ and has no impact on the "throttling" logic as of today? Throttling
> will still continue to treat everything as flat.
Sure.
Gui
>
> I am working on making the throttling logic hierarchical. It has been
> going kind of slowly; I expect it to be ready for 2.6.39.
>
> Vivek
>
^ permalink raw reply [flat|nested] 41+ messages in thread