* [PATCH 0/4] io-controller: Use names rather than major:minor
@ 2010-03-25 18:04 Chad Talbott
2010-03-25 18:04 ` [PATCH 1/4] blkio_group key change: void * -> request_queue * Chad Talbott
` (4 more replies)
0 siblings, 5 replies; 16+ messages in thread
From: Chad Talbott @ 2010-03-25 18:04 UTC (permalink / raw)
To: vgoyal, jens.axboe
Cc: mrubin, guijianfeng, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
This stack (which includes Gui's patch for per-device weights),
changes the various blkio.* stats to use the disks' names (sda, sdb,
etc.) rather than major:minor pairs when printing statistics.
Additionally setting a per-device policy can be done via the disk's
name.
This has the side effect of fixing the "root cgroup shows no stats"
problem that Ricky mentioned.
Chad
---
Chad Talbott (4):
blkio_group key change: void * -> request_queue *
Adds an RCU-protected pointer to request_queue that makes it easy to
io-controller: Add a new interface "weight_device" for IO-Controller
Use disk-names to set blkio.weight_device policy
block/blk-cgroup.c | 224 +++++++++++++++++++++++++++++++++++++++++++++---
block/blk-cgroup.h | 23 ++++-
block/blk-sysfs.c | 4 +
block/cfq-iosched.c | 27 ++----
include/linux/blkdev.h | 6 +
5 files changed, 248 insertions(+), 36 deletions(-)
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 1/4] blkio_group key change: void * -> request_queue *
2010-03-25 18:04 [PATCH 0/4] io-controller: Use names rather than major:minor Chad Talbott
@ 2010-03-25 18:04 ` Chad Talbott
2010-03-25 23:25 ` Vivek Goyal
2010-03-25 18:04 ` [PATCH 2/4] Adds an RCU-protected pointer to request_queue that makes it easy to Chad Talbott
` (3 subsequent siblings)
4 siblings, 1 reply; 16+ messages in thread
From: Chad Talbott @ 2010-03-25 18:04 UTC (permalink / raw)
To: vgoyal, jens.axboe
Cc: mrubin, guijianfeng, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
Use request_queue to find the blkio_group of a device, rather than a
cfq_data pointer. Additionally, it allows the blk-cgroup code to
depend on the queue pointer itself. This avoids doing lookups for
dev_t and facilitates looking up the device's human-readable name in
later patches.
---
block/blk-cgroup.c | 17 ++++++++---------
block/blk-cgroup.h | 13 +++++++------
block/cfq-iosched.c | 22 ++++++++++------------
3 files changed, 25 insertions(+), 27 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4b686ad..917957d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,6 +15,7 @@
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
+#include <linux/genhd.h>
#include "blk-cgroup.h"
static DEFINE_SPINLOCK(blkio_list_lock);
@@ -64,12 +65,12 @@ void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
- struct blkio_group *blkg, void *key, dev_t dev)
+ struct blkio_group *blkg, struct request_queue *queue, dev_t dev)
{
unsigned long flags;
spin_lock_irqsave(&blkcg->lock, flags);
- rcu_assign_pointer(blkg->key, key);
+ rcu_assign_pointer(blkg->queue, queue);
blkg->blkcg_id = css_id(&blkcg->css);
hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
spin_unlock_irqrestore(&blkcg->lock, flags);
@@ -117,15 +118,15 @@ out:
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
/* called under rcu_read_lock(). */
-struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
+struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, struct request_queue *queue)
{
struct blkio_group *blkg;
struct hlist_node *n;
- void *__key;
+ struct request_queue *__queue;
hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
- __key = blkg->key;
- if (__key == key)
+ __queue = blkg->queue;
+ if (__queue == queue)
return blkg;
}
@@ -243,7 +244,6 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
unsigned long flags;
struct blkio_group *blkg;
- void *key;
struct blkio_policy_type *blkiop;
rcu_read_lock();
@@ -257,7 +257,6 @@ remove_entry:
blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
blkcg_node);
- key = rcu_dereference(blkg->key);
__blkiocg_del_blkio_group(blkg);
spin_unlock_irqrestore(&blkcg->lock, flags);
@@ -272,7 +271,7 @@ remove_entry:
*/
spin_lock(&blkio_list_lock);
list_for_each_entry(blkiop, &blkio_list, list)
- blkiop->ops.blkio_unlink_group_fn(key, blkg);
+ blkiop->ops.blkio_unlink_group_fn(blkg);
spin_unlock(&blkio_list_lock);
goto remove_entry;
done:
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8ccc204..9e70c03 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -14,6 +14,7 @@
*/
#include <linux/cgroup.h>
+#include <linux/blkdev.h>
#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
@@ -32,7 +33,7 @@ struct blkio_cgroup {
struct blkio_group {
/* An rcu protected unique identifier for the group */
- void *key;
+ struct request_queue *queue;
struct hlist_node blkcg_node;
unsigned short blkcg_id;
#ifdef CONFIG_DEBUG_BLK_CGROUP
@@ -49,7 +50,7 @@ struct blkio_group {
unsigned long sectors;
};
-typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
+typedef void (blkio_unlink_group_fn) (struct blkio_group *blkg);
typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
unsigned int weight);
@@ -101,10 +102,10 @@ static inline void blkiocg_update_blkio_group_dequeue_stats(
extern struct blkio_cgroup blkio_root_cgroup;
extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
- struct blkio_group *blkg, void *key, dev_t dev);
+ struct blkio_group *blkg, struct request_queue *queue, dev_t dev);
extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
- void *key);
+ struct request_queue *queue);
void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
unsigned long time, unsigned long sectors);
#else
@@ -113,7 +114,7 @@ static inline struct blkio_cgroup *
cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
- struct blkio_group *blkg, void *key, dev_t dev)
+ struct blkio_group *blkg, struct request_queue *queue, dev_t dev)
{
}
@@ -121,7 +122,7 @@ static inline int
blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
static inline struct blkio_group *
-blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
+blkiocg_lookup_group(struct blkio_cgroup *blkcg, struct request_queue *queue) { return NULL; }
static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
unsigned long time, unsigned long sectors)
{
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index dee9d93..864b39a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -940,13 +940,13 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
{
struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
struct cfq_group *cfqg = NULL;
- void *key = cfqd;
+ struct request_queue *queue = cfqd->queue;
int i, j;
struct cfq_rb_root *st;
struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
unsigned int major, minor;
- cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, queue));
if (cfqg || !create)
goto done;
@@ -969,7 +969,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
/* Add group onto cgroup list */
sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+ blkiocg_add_blkio_group(blkcg, &cfqg->blkg, cfqd->queue,
MKDEV(major, minor));
/* Add group on cfqd list */
@@ -1021,7 +1021,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
kfree(cfqg);
}
-static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+static void cfq_destroy_cfqg(struct cfq_group *cfqg)
{
/* Something wrong if we are trying to remove same group twice */
BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
@@ -1047,7 +1047,7 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd)
* cfqg also.
*/
if (!blkiocg_del_blkio_group(&cfqg->blkg))
- cfq_destroy_cfqg(cfqd, cfqg);
+ cfq_destroy_cfqg(cfqg);
}
}
@@ -1065,14 +1065,13 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd)
* it should not be NULL as even if elevator was exiting, cgroup deltion
* path got to it first.
*/
-void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
+void cfq_unlink_blkio_group(struct blkio_group *blkg)
{
unsigned long flags;
- struct cfq_data *cfqd = key;
- spin_lock_irqsave(cfqd->queue->queue_lock, flags);
- cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
- spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
+ spin_lock_irqsave(blkg->queue->queue_lock, flags);
+ cfq_destroy_cfqg(cfqg_of_blkg(blkg));
+ spin_unlock_irqrestore(blkg->queue->queue_lock, flags);
}
#else /* GROUP_IOSCHED */
@@ -3672,8 +3671,7 @@ static void *cfq_init_queue(struct request_queue *q)
* to make sure that cfq_put_cfqg() does not try to kfree root group
*/
atomic_set(&cfqg->ref, 1);
- blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
- 0);
+ blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, q, 0);
#endif
/*
* Not strictly needed (since RB_ROOT just clears the node and we
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 2/4] Adds an RCU-protected pointer to request_queue that makes it easy to
2010-03-25 18:04 [PATCH 0/4] io-controller: Use names rather than major:minor Chad Talbott
2010-03-25 18:04 ` [PATCH 1/4] blkio_group key change: void * -> request_queue * Chad Talbott
@ 2010-03-25 18:04 ` Chad Talbott
2010-03-25 23:02 ` Vivek Goyal
2010-03-25 18:04 ` [PATCH 3/4] io-controller: Add a new interface "weight_device" for IO-Controller Chad Talbott
` (2 subsequent siblings)
4 siblings, 1 reply; 16+ messages in thread
From: Chad Talbott @ 2010-03-25 18:04 UTC (permalink / raw)
To: vgoyal, jens.axboe
Cc: mrubin, guijianfeng, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
find the gendisk that the queue manages.
---
block/blk-cgroup.c | 10 ++++++----
block/blk-sysfs.c | 4 ++++
include/linux/blkdev.h | 6 ++++++
3 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 917957d..809451f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -179,16 +179,18 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
struct blkio_cgroup *blkcg; \
struct blkio_group *blkg; \
struct hlist_node *n; \
+ struct gendisk *disk; \
\
if (!cgroup_lock_live_group(cgroup)) \
return -ENODEV; \
\
blkcg = cgroup_to_blkio_cgroup(cgroup); \
rcu_read_lock(); \
- hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
- if (blkg->dev) \
- seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \
- MINOR(blkg->dev), blkg->__VAR); \
+ hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { \
+ disk = rcu_dereference(blkg->queue->disk); \
+ if (disk) \
+ seq_printf(m, "%s %lu\n", disk->disk_name, \
+ blkg->__VAR); \
} \
rcu_read_unlock(); \
cgroup_unlock(); \
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 2ae2cb3..14cab6a 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -484,6 +484,8 @@ int blk_register_queue(struct gendisk *disk)
if (!q->request_fn)
return 0;
+ rcu_assign_pointer(q->disk, disk);
+
ret = elv_register_queue(q);
if (ret) {
kobject_uevent(&q->kobj, KOBJ_REMOVE);
@@ -505,6 +507,8 @@ void blk_unregister_queue(struct gendisk *disk)
if (q->request_fn)
elv_unregister_queue(q);
+ rcu_assign_pointer(q->disk, NULL);
+
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ebd22db..e531cf2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -441,6 +441,12 @@ struct request_queue
#if defined(CONFIG_BLK_DEV_BSG)
struct bsg_class_device bsg_dev;
#endif
+
+ /*
+ * an RCU-protected pointer that can be NULL when devices are
+ * removed
+ */
+ struct gendisk *disk;
};
#define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 3/4] io-controller: Add a new interface "weight_device" for IO-Controller
2010-03-25 18:04 [PATCH 0/4] io-controller: Use names rather than major:minor Chad Talbott
2010-03-25 18:04 ` [PATCH 1/4] blkio_group key change: void * -> request_queue * Chad Talbott
2010-03-25 18:04 ` [PATCH 2/4] Adds an RCU-protected pointer to request_queue that makes it easy to Chad Talbott
@ 2010-03-25 18:04 ` Chad Talbott
2010-03-25 18:04 ` [PATCH 4/4] Use disk-names to set blkio.weight_device policy Chad Talbott
2010-03-26 1:31 ` [PATCH 0/4] io-controller: Use names rather than major:minor Gui Jianfeng
4 siblings, 0 replies; 16+ messages in thread
From: Chad Talbott @ 2010-03-25 18:04 UTC (permalink / raw)
To: vgoyal, jens.axboe
Cc: mrubin, guijianfeng, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
This is Gui Jianfeng's original patch.
---
block/blk-cgroup.c | 240 +++++++++++++++++++++++++++++++++++++++++++++++++++
block/blk-cgroup.h | 10 ++
block/cfq-iosched.c | 2
3 files changed, 251 insertions(+), 1 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 809451f..bf77b99 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -24,6 +24,13 @@ static LIST_HEAD(blkio_list);
struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);
+static inline void policy_insert_node(struct blkio_cgroup *blkcg,
+ struct blkio_policy_node *pn);
+static inline void policy_delete_node(struct blkio_policy_node *pn);
+static struct blkio_policy_node *policy_search_node(const struct blkio_cgroup *blkcg,
+ dev_t dev);
+
+
static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
@@ -154,6 +161,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
struct blkio_group *blkg;
struct hlist_node *n;
struct blkio_policy_type *blkiop;
+ struct blkio_policy_node *pn;
if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
return -EINVAL;
@@ -162,7 +170,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
spin_lock(&blkio_list_lock);
spin_lock_irq(&blkcg->lock);
blkcg->weight = (unsigned int)val;
+
hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ pn = policy_search_node(blkcg, blkg->dev);
+
+ if (pn)
+ continue;
+
list_for_each_entry(blkiop, &blkio_list, list)
blkiop->ops.blkio_update_group_weight_fn(blkg,
blkcg->weight);
@@ -213,8 +227,227 @@ void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
#endif
+static int check_dev_num(dev_t dev)
+{
+ int part = 0;
+ struct gendisk *disk;
+
+ disk = get_gendisk(dev, &part);
+ if (!disk || part)
+ return -ENODEV;
+
+ return 0;
+}
+
+static int policy_parse_and_set(char *buf, struct blkio_policy_node *newpn)
+{
+ char *s[4], *p, *major_s = NULL, *minor_s = NULL;
+ int ret;
+ unsigned long major, minor, temp;
+ int i = 0;
+ dev_t dev;
+
+ memset(s, 0, sizeof(s));
+
+ while ((p = strsep(&buf, " ")) != NULL) {
+ if (!*p)
+ continue;
+
+ s[i++] = p;
+
+ /* Prevent from inputing too many things */
+ if (i == 3)
+ break;
+ }
+
+ if (i != 2)
+ return -EINVAL;
+
+ p = strsep(&s[0], ":");
+ if (p != NULL)
+ major_s = p;
+ else
+ return -EINVAL;
+
+ minor_s = s[0];
+ if (!minor_s)
+ return -EINVAL;
+
+ ret = strict_strtoul(major_s, 10, &major);
+ if (ret)
+ return -EINVAL;
+
+ ret = strict_strtoul(minor_s, 10, &minor);
+ if (ret)
+ return -EINVAL;
+
+ dev = MKDEV(major, minor);
+
+ ret = check_dev_num(dev);
+ if (ret)
+ return ret;
+
+ newpn->dev = dev;
+
+ if (s[1] == NULL)
+ return -EINVAL;
+
+ ret = strict_strtoul(s[1], 10, &temp);
+ if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
+ temp > BLKIO_WEIGHT_MAX)
+ return -EINVAL;
+
+ newpn->weight = temp;
+
+ return 0;
+}
+
+unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+ dev_t dev)
+{
+ struct blkio_policy_node *pn;
+
+ pn = policy_search_node(blkcg, dev);
+ if (pn)
+ return pn->weight;
+ else
+ return blkcg->weight;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_weight);
+
+static inline void policy_insert_node(struct blkio_cgroup *blkcg,
+ struct blkio_policy_node *pn)
+{
+ list_add(&pn->node, &blkcg->policy_list);
+}
+
+/* Must be called with blkcg->lock held */
+static inline void policy_delete_node(struct blkio_policy_node *pn)
+{
+ list_del(&pn->node);
+}
+
+/* Must be called with blkcg->lock held */
+static struct blkio_policy_node *policy_search_node(const struct blkio_cgroup *blkcg,
+ dev_t dev)
+{
+ struct blkio_policy_node *pn;
+
+ list_for_each_entry(pn, &blkcg->policy_list, node) {
+ if (pn->dev == dev)
+ return pn;
+ }
+
+ return NULL;
+}
+
+
+static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
+ const char *buffer)
+{
+ int ret = 0;
+ char *buf;
+ struct blkio_policy_node *newpn, *pn;
+ struct blkio_cgroup *blkcg;
+ struct blkio_group *blkg;
+ int keep_newpn = 0;
+ struct hlist_node *n;
+ struct blkio_policy_type *blkiop;
+
+ buf = kstrdup(buffer, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
+ if (!newpn) {
+ ret = -ENOMEM;
+ goto free_buf;
+ }
+
+ ret = policy_parse_and_set(buf, newpn);
+ if (ret)
+ goto free_newpn;
+
+ blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+ spin_lock_irq(&blkcg->lock);
+
+ pn = policy_search_node(blkcg, newpn->dev);
+ if (!pn) {
+ if (newpn->weight != 0) {
+ policy_insert_node(blkcg, newpn);
+ keep_newpn = 1;
+ }
+ spin_unlock_irq(&blkcg->lock);
+ goto update_io_group;
+ }
+
+ if (newpn->weight == 0) {
+ /* weight == 0 means deleteing a specific weight */
+ policy_delete_node(pn);
+ spin_unlock_irq(&blkcg->lock);
+ goto update_io_group;
+ }
+ spin_unlock_irq(&blkcg->lock);
+
+ pn->weight = newpn->weight;
+
+update_io_group:
+ /* update weight for each cfqg */
+ spin_lock(&blkio_list_lock);
+ spin_lock_irq(&blkcg->lock);
+
+ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ if (newpn->dev == blkg->dev) {
+ list_for_each_entry(blkiop, &blkio_list, list)
+ blkiop->ops.blkio_update_group_weight_fn(blkg,
+ newpn->weight ?
+ newpn->weight :
+ blkcg->weight);
+ }
+ }
+
+ spin_unlock_irq(&blkcg->lock);
+ spin_unlock(&blkio_list_lock);
+
+free_newpn:
+ if (!keep_newpn)
+ kfree(newpn);
+free_buf:
+ kfree(buf);
+ return ret;
+}
+
+static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct blkio_cgroup *blkcg;
+ struct blkio_policy_node *pn;
+
+ seq_printf(m, "dev\tweight\n");
+
+ blkcg = cgroup_to_blkio_cgroup(cgrp);
+ if (list_empty(&blkcg->policy_list))
+ goto out;
+
+ spin_lock_irq(&blkcg->lock);
+ list_for_each_entry(pn, &blkcg->policy_list, node) {
+ seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
+ MINOR(pn->dev), pn->weight);
+ }
+ spin_unlock_irq(&blkcg->lock);
+out:
+ return 0;
+}
+
struct cftype blkio_files[] = {
{
+ .name = "weight_device",
+ .read_seq_string = blkiocg_weight_device_read,
+ .write_string = blkiocg_weight_device_write,
+ .max_write_len = 256,
+ },
+ {
.name = "weight",
.read_u64 = blkiocg_weight_read,
.write_u64 = blkiocg_weight_write,
@@ -247,6 +480,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
unsigned long flags;
struct blkio_group *blkg;
struct blkio_policy_type *blkiop;
+ struct blkio_policy_node *pn, *pntmp;
rcu_read_lock();
remove_entry:
@@ -276,7 +510,12 @@ remove_entry:
blkiop->ops.blkio_unlink_group_fn(blkg);
spin_unlock(&blkio_list_lock);
goto remove_entry;
+
done:
+ list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
+ policy_delete_node(pn);
+ kfree(pn);
+ }
free_css_id(&blkio_subsys, &blkcg->css);
rcu_read_unlock();
if (blkcg != &blkio_root_cgroup)
@@ -307,6 +546,7 @@ done:
spin_lock_init(&blkcg->lock);
INIT_HLIST_HEAD(&blkcg->blkg_list);
+ INIT_LIST_HEAD(&blkcg->policy_list);
return &blkcg->css;
}
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 9e70c03..92296b8 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -29,6 +29,7 @@ struct blkio_cgroup {
unsigned int weight;
spinlock_t lock;
struct hlist_head blkg_list;
+ struct list_head policy_list; /* list of blkio_policy_node */
};
struct blkio_group {
@@ -50,6 +51,15 @@ struct blkio_group {
unsigned long sectors;
};
+struct blkio_policy_node {
+ struct list_head node;
+ dev_t dev;
+ unsigned int weight;
+};
+
+extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+ dev_t dev);
+
typedef void (blkio_unlink_group_fn) (struct blkio_group *blkg);
typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
unsigned int weight);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 864b39a..d4d57a3 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -954,7 +954,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
if (!cfqg)
goto done;
- cfqg->weight = blkcg->weight;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
RB_CLEAR_NODE(&cfqg->rb_node);
@@ -971,6 +970,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
blkiocg_add_blkio_group(blkcg, &cfqg->blkg, cfqd->queue,
MKDEV(major, minor));
+ cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
/* Add group on cfqd list */
hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 4/4] Use disk-names to set blkio.weight_device policy
2010-03-25 18:04 [PATCH 0/4] io-controller: Use names rather than major:minor Chad Talbott
` (2 preceding siblings ...)
2010-03-25 18:04 ` [PATCH 3/4] io-controller: Add a new interface "weight_device" for IO-Controller Chad Talbott
@ 2010-03-25 18:04 ` Chad Talbott
2010-03-26 1:31 ` [PATCH 0/4] io-controller: Use names rather than major:minor Gui Jianfeng
4 siblings, 0 replies; 16+ messages in thread
From: Chad Talbott @ 2010-03-25 18:04 UTC (permalink / raw)
To: vgoyal, jens.axboe
Cc: mrubin, guijianfeng, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
This patch also cleans up a few fields and arguments that are no
longer used once request_queue is the key for both blkio_groups as
well as blkio_policy_nodes.
---
block/blk-cgroup.c | 103 ++++++++++++++-------------------------------------
block/blk-cgroup.h | 6 +--
block/cfq-iosched.c | 9 +---
3 files changed, 35 insertions(+), 83 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bf77b99..086a00d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -28,7 +28,7 @@ static inline void policy_insert_node(struct blkio_cgroup *blkcg,
struct blkio_policy_node *pn);
static inline void policy_delete_node(struct blkio_policy_node *pn);
static struct blkio_policy_node *policy_search_node(const struct blkio_cgroup *blkcg,
- dev_t dev);
+ struct request_queue *queue);
static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
@@ -72,7 +72,7 @@ void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
- struct blkio_group *blkg, struct request_queue *queue, dev_t dev)
+ struct blkio_group *blkg, struct request_queue *queue)
{
unsigned long flags;
@@ -85,7 +85,6 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
/* Need to take css reference ? */
cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
#endif
- blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
@@ -172,7 +171,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
blkcg->weight = (unsigned int)val;
hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
- pn = policy_search_node(blkcg, blkg->dev);
+ pn = policy_search_node(blkcg, blkg->queue);
if (pn)
continue;
@@ -227,87 +226,39 @@ void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
#endif
-static int check_dev_num(dev_t dev)
+static int policy_parse_and_set(char *buf, struct blkio_policy_node *newpn)
{
+ char disk_name[BDEVNAME_SIZE];
+ int weight;
+ int nfields;
int part = 0;
struct gendisk *disk;
-
- disk = get_gendisk(dev, &part);
- if (!disk || part)
- return -ENODEV;
-
- return 0;
-}
-
-static int policy_parse_and_set(char *buf, struct blkio_policy_node *newpn)
-{
- char *s[4], *p, *major_s = NULL, *minor_s = NULL;
- int ret;
- unsigned long major, minor, temp;
- int i = 0;
dev_t dev;
- memset(s, 0, sizeof(s));
-
- while ((p = strsep(&buf, " ")) != NULL) {
- if (!*p)
- continue;
-
- s[i++] = p;
-
- /* Prevent from inputing too many things */
- if (i == 3)
- break;
- }
-
- if (i != 2)
+ nfields = sscanf(buf, "%s %d", disk_name, &weight);
+ if (nfields != 2)
return -EINVAL;
- p = strsep(&s[0], ":");
- if (p != NULL)
- major_s = p;
- else
- return -EINVAL;
-
- minor_s = s[0];
- if (!minor_s)
- return -EINVAL;
-
- ret = strict_strtoul(major_s, 10, &major);
- if (ret)
- return -EINVAL;
-
- ret = strict_strtoul(minor_s, 10, &minor);
- if (ret)
- return -EINVAL;
-
- dev = MKDEV(major, minor);
-
- ret = check_dev_num(dev);
- if (ret)
- return ret;
-
- newpn->dev = dev;
-
- if (s[1] == NULL)
- return -EINVAL;
+ dev = blk_lookup_devt(disk_name, 0);
+ if (!dev)
+ return -ENODEV;
- ret = strict_strtoul(s[1], 10, &temp);
- if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
- temp > BLKIO_WEIGHT_MAX)
- return -EINVAL;
+ disk = get_gendisk(dev, &part);
+ if (!disk || part)
+ return -ENODEV;
- newpn->weight = temp;
+ newpn->weight = weight;
+ newpn->queue = disk->queue;
return 0;
}
unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
- dev_t dev)
+ struct request_queue *queue)
{
struct blkio_policy_node *pn;
- pn = policy_search_node(blkcg, dev);
+ pn = policy_search_node(blkcg, queue);
if (pn)
return pn->weight;
else
@@ -329,12 +280,12 @@ static inline void policy_delete_node(struct blkio_policy_node *pn)
/* Must be called with blkcg->lock held */
static struct blkio_policy_node *policy_search_node(const struct blkio_cgroup *blkcg,
- dev_t dev)
+ struct request_queue *queue)
{
struct blkio_policy_node *pn;
list_for_each_entry(pn, &blkcg->policy_list, node) {
- if (pn->dev == dev)
+ if (pn->queue == queue)
return pn;
}
@@ -372,7 +323,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
spin_lock_irq(&blkcg->lock);
- pn = policy_search_node(blkcg, newpn->dev);
+ pn = policy_search_node(blkcg, newpn->queue);
if (!pn) {
if (newpn->weight != 0) {
policy_insert_node(blkcg, newpn);
@@ -398,7 +349,7 @@ update_io_group:
spin_lock_irq(&blkcg->lock);
hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
- if (newpn->dev == blkg->dev) {
+ if (newpn->queue == blkg->queue) {
list_for_each_entry(blkiop, &blkio_list, list)
blkiop->ops.blkio_update_group_weight_fn(blkg,
newpn->weight ?
@@ -423,6 +374,7 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
{
struct blkio_cgroup *blkcg;
struct blkio_policy_node *pn;
+ struct gendisk *disk;
seq_printf(m, "dev\tweight\n");
@@ -430,12 +382,15 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
if (list_empty(&blkcg->policy_list))
goto out;
+ rcu_read_lock();
spin_lock_irq(&blkcg->lock);
list_for_each_entry(pn, &blkcg->policy_list, node) {
- seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
- MINOR(pn->dev), pn->weight);
+ disk = rcu_dereference(pn->queue->disk);
+ if (disk != NULL)
+ seq_printf(m, "%s\t%u\n", disk->disk_name, pn->weight);
}
spin_unlock_irq(&blkcg->lock);
+ rcu_read_unlock();
out:
return 0;
}
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 92296b8..14177b5 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -53,12 +53,12 @@ struct blkio_group {
struct blkio_policy_node {
struct list_head node;
- dev_t dev;
+ struct request_queue *queue;
unsigned int weight;
};
extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
- dev_t dev);
+ struct request_queue *queue);
typedef void (blkio_unlink_group_fn) (struct blkio_group *blkg);
typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
@@ -112,7 +112,7 @@ static inline void blkiocg_update_blkio_group_dequeue_stats(
extern struct blkio_cgroup blkio_root_cgroup;
extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
- struct blkio_group *blkg, struct request_queue *queue, dev_t dev);
+ struct blkio_group *blkg, struct request_queue *queue);
extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
struct request_queue *queue);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d4d57a3..39d2048 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -944,7 +944,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
int i, j;
struct cfq_rb_root *st;
struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
- unsigned int major, minor;
cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, queue));
if (cfqg || !create)
@@ -967,10 +966,8 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
atomic_set(&cfqg->ref, 1);
/* Add group onto cgroup list */
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- blkiocg_add_blkio_group(blkcg, &cfqg->blkg, cfqd->queue,
- MKDEV(major, minor));
- cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+ blkiocg_add_blkio_group(blkcg, &cfqg->blkg, cfqd->queue);
+ cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.queue);
/* Add group on cfqd list */
hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
@@ -3671,7 +3668,7 @@ static void *cfq_init_queue(struct request_queue *q)
* to make sure that cfq_put_cfqg() does not try to kfree root group
*/
atomic_set(&cfqg->ref, 1);
- blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, q, 0);
+ blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, q);
#endif
/*
* Not strictly needed (since RB_ROOT just clears the node and we
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH 2/4] Adds an RCU-protected pointer to request_queue that makes it easy to
2010-03-25 18:04 ` [PATCH 2/4] Adds an RCU-protected pointer to request_queue that makes it easy to Chad Talbott
@ 2010-03-25 23:02 ` Vivek Goyal
0 siblings, 0 replies; 16+ messages in thread
From: Vivek Goyal @ 2010-03-25 23:02 UTC (permalink / raw)
To: Chad Talbott
Cc: jens.axboe, mrubin, guijianfeng, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
On Thu, Mar 25, 2010 at 11:04:28AM -0700, Chad Talbott wrote:
> find the gendisk that the queue manages.
> ---
> block/blk-cgroup.c | 10 ++++++----
> block/blk-sysfs.c | 4 ++++
> include/linux/blkdev.h | 6 ++++++
> 3 files changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 917957d..809451f 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -179,16 +179,18 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
> struct blkio_cgroup *blkcg; \
> struct blkio_group *blkg; \
> struct hlist_node *n; \
> + struct gendisk *disk; \
> \
> if (!cgroup_lock_live_group(cgroup)) \
> return -ENODEV; \
> \
> blkcg = cgroup_to_blkio_cgroup(cgroup); \
> rcu_read_lock(); \
> - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
> - if (blkg->dev) \
> - seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \
> - MINOR(blkg->dev), blkg->__VAR); \
> + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { \
> + disk = rcu_dereference(blkg->queue->disk); \
> + if (disk) \
> + seq_printf(m, "%s %lu\n", disk->disk_name, \
> + blkg->__VAR); \
> } \
> rcu_read_unlock(); \
> cgroup_unlock(); \
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 2ae2cb3..14cab6a 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -484,6 +484,8 @@ int blk_register_queue(struct gendisk *disk)
> if (!q->request_fn)
> return 0;
>
> + rcu_assign_pointer(q->disk, disk);
> +
> ret = elv_register_queue(q);
> if (ret) {
> kobject_uevent(&q->kobj, KOBJ_REMOVE);
> @@ -505,6 +507,8 @@ void blk_unregister_queue(struct gendisk *disk)
> if (q->request_fn)
> elv_unregister_queue(q);
>
> + rcu_assign_pointer(q->disk, NULL);
> +
> kobject_uevent(&q->kobj, KOBJ_REMOVE);
> kobject_del(&q->kobj);
> blk_trace_remove_sysfs(disk_to_dev(disk));
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index ebd22db..e531cf2 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -441,6 +441,12 @@ struct request_queue
> #if defined(CONFIG_BLK_DEV_BSG)
> struct bsg_class_device bsg_dev;
> #endif
> +
> + /*
> + * an RCU-protected pointer that can be NULL when devices are
> + * removed
> + */
> + struct gendisk *disk;
> };
Hi Chad,
Where is the code to make sure gendisk does not go away if blk-cgroup code
is referencing blkg->queue->disk under rcu lock?
Thanks
Vivek
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 1/4] blkio_group key change: void * -> request_queue *
2010-03-25 18:04 ` [PATCH 1/4] blkio_group key change: void * -> request_queue * Chad Talbott
@ 2010-03-25 23:25 ` Vivek Goyal
2010-03-26 0:17 ` Chad Talbott
0 siblings, 1 reply; 16+ messages in thread
From: Vivek Goyal @ 2010-03-25 23:25 UTC (permalink / raw)
To: Chad Talbott
Cc: jens.axboe, mrubin, guijianfeng, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
On Thu, Mar 25, 2010 at 11:04:23AM -0700, Chad Talbott wrote:
> Use request_queue to find the blkio_group of a device, rather than a
> cfq_data pointer. Additionally, it allows the blk-cgroup code to
> depend on the queue pointer itself. This avoids doing lookups for
> dev_t and facilitates looking up the device's human-readable name in
> later patches.
Hi Chad,
human-readable disk names probably are more convenient. I got few general
concerns.
- Can we change the sysfs interface now. In 2.6.33 kernel we released the
code to export blkio.time and blkio.sectors using device numbers. We
shall have to change those interfaces also to reflect device stats in
terms of device names.
- I had kept void* as the key in blkg object so that it does not make any
assumption about the key. This allowed that any xyz code in kernel can
register with blkio code and implement some kind of IO control policy
and it does not have to instanciate a request queue.
But I am not sure if there will be any subsystems which will really do
that. I am assuming that all the device mapper and md code do
instanciate the request queue? (In case we implement max bw policy
there).
- How do you make sure that request queue does not go away and while
somebody is accessing blkg->queue under rcu read lock? This can happen
while we call unlink code.
blkio_unlink_group_fn().
Because we store the pointer to cfqd as key, we make sure cfqd does not
get deleted before one rcu grace period. (call_rcu). But this gurantees
only that cfqd object is around and not cfqd->queue. So this is a
problem with even today's code.
One solution was to replace call_rcu(&cfqd->rcu) with synchronize_rcu()
but that had made booting very slow on Jens's Nehalem box as some driver
was creating and destroying hundredes of request queue during device
detection.
Another solution was to keep track of number of cfq_groups created and
if there are undestroyed groups then call syncronize_rcu(). That would
make sure that boot up will not slow down and during unlink group path,
request queue will also not go away.
- How do you make sure that gendisk does not go away while q->disk is
being accessed under rcu lock. (Already asked in other thread too.)
These are some quick thoughs. More later...
Thanks
Vivek
> ---
> block/blk-cgroup.c | 17 ++++++++---------
> block/blk-cgroup.h | 13 +++++++------
> block/cfq-iosched.c | 22 ++++++++++------------
> 3 files changed, 25 insertions(+), 27 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 4b686ad..917957d 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -15,6 +15,7 @@
> #include <linux/kdev_t.h>
> #include <linux/module.h>
> #include <linux/err.h>
> +#include <linux/genhd.h>
> #include "blk-cgroup.h"
>
> static DEFINE_SPINLOCK(blkio_list_lock);
> @@ -64,12 +65,12 @@ void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
> EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
>
> void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
> - struct blkio_group *blkg, void *key, dev_t dev)
> + struct blkio_group *blkg, struct request_queue *queue, dev_t dev)
> {
> unsigned long flags;
>
> spin_lock_irqsave(&blkcg->lock, flags);
> - rcu_assign_pointer(blkg->key, key);
> + rcu_assign_pointer(blkg->queue, queue);
> blkg->blkcg_id = css_id(&blkcg->css);
> hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
> spin_unlock_irqrestore(&blkcg->lock, flags);
> @@ -117,15 +118,15 @@ out:
> EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
>
> /* called under rcu_read_lock(). */
> -struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
> +struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, struct request_queue *queue)
> {
> struct blkio_group *blkg;
> struct hlist_node *n;
> - void *__key;
> + struct request_queue *__queue;
>
> hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
> - __key = blkg->key;
> - if (__key == key)
> + __queue = blkg->queue;
> + if (__queue == queue)
> return blkg;
> }
>
> @@ -243,7 +244,6 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
> struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> unsigned long flags;
> struct blkio_group *blkg;
> - void *key;
> struct blkio_policy_type *blkiop;
>
> rcu_read_lock();
> @@ -257,7 +257,6 @@ remove_entry:
>
> blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
> blkcg_node);
> - key = rcu_dereference(blkg->key);
> __blkiocg_del_blkio_group(blkg);
>
> spin_unlock_irqrestore(&blkcg->lock, flags);
> @@ -272,7 +271,7 @@ remove_entry:
> */
> spin_lock(&blkio_list_lock);
> list_for_each_entry(blkiop, &blkio_list, list)
> - blkiop->ops.blkio_unlink_group_fn(key, blkg);
> + blkiop->ops.blkio_unlink_group_fn(blkg);
> spin_unlock(&blkio_list_lock);
> goto remove_entry;
> done:
> diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
> index 8ccc204..9e70c03 100644
> --- a/block/blk-cgroup.h
> +++ b/block/blk-cgroup.h
> @@ -14,6 +14,7 @@
> */
>
> #include <linux/cgroup.h>
> +#include <linux/blkdev.h>
>
> #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
>
> @@ -32,7 +33,7 @@ struct blkio_cgroup {
>
> struct blkio_group {
> /* An rcu protected unique identifier for the group */
> - void *key;
> + struct request_queue *queue;
> struct hlist_node blkcg_node;
> unsigned short blkcg_id;
> #ifdef CONFIG_DEBUG_BLK_CGROUP
> @@ -49,7 +50,7 @@ struct blkio_group {
> unsigned long sectors;
> };
>
> -typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
> +typedef void (blkio_unlink_group_fn) (struct blkio_group *blkg);
> typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
> unsigned int weight);
>
> @@ -101,10 +102,10 @@ static inline void blkiocg_update_blkio_group_dequeue_stats(
> extern struct blkio_cgroup blkio_root_cgroup;
> extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
> extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
> - struct blkio_group *blkg, void *key, dev_t dev);
> + struct blkio_group *blkg, struct request_queue *queue, dev_t dev);
> extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
> extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
> - void *key);
> + struct request_queue *queue);
> void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
> unsigned long time, unsigned long sectors);
> #else
> @@ -113,7 +114,7 @@ static inline struct blkio_cgroup *
> cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
>
> static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
> - struct blkio_group *blkg, void *key, dev_t dev)
> + struct blkio_group *blkg, struct request_queue *queue, dev_t dev)
> {
> }
>
> @@ -121,7 +122,7 @@ static inline int
> blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
>
> static inline struct blkio_group *
> -blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
> +blkiocg_lookup_group(struct blkio_cgroup *blkcg, struct request_queue *queue) { return NULL; }
> static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
> unsigned long time, unsigned long sectors)
> {
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index dee9d93..864b39a 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -940,13 +940,13 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> {
> struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> struct cfq_group *cfqg = NULL;
> - void *key = cfqd;
> + struct request_queue *queue = cfqd->queue;
> int i, j;
> struct cfq_rb_root *st;
> struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> unsigned int major, minor;
>
> - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, queue));
> if (cfqg || !create)
> goto done;
>
> @@ -969,7 +969,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>
> /* Add group onto cgroup list */
> sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> - blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> + blkiocg_add_blkio_group(blkcg, &cfqg->blkg, cfqd->queue,
> MKDEV(major, minor));
>
> /* Add group on cfqd list */
> @@ -1021,7 +1021,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
> kfree(cfqg);
> }
>
> -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
> +static void cfq_destroy_cfqg(struct cfq_group *cfqg)
> {
> /* Something wrong if we are trying to remove same group twice */
> BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
> @@ -1047,7 +1047,7 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd)
> * cfqg also.
> */
> if (!blkiocg_del_blkio_group(&cfqg->blkg))
> - cfq_destroy_cfqg(cfqd, cfqg);
> + cfq_destroy_cfqg(cfqg);
> }
> }
>
> @@ -1065,14 +1065,13 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd)
> * it should not be NULL as even if elevator was exiting, cgroup deltion
> * path got to it first.
> */
> -void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
> +void cfq_unlink_blkio_group(struct blkio_group *blkg)
> {
> unsigned long flags;
> - struct cfq_data *cfqd = key;
>
> - spin_lock_irqsave(cfqd->queue->queue_lock, flags);
> - cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
> - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
> + spin_lock_irqsave(blkg->queue->queue_lock, flags);
> + cfq_destroy_cfqg(cfqg_of_blkg(blkg));
> + spin_unlock_irqrestore(blkg->queue->queue_lock, flags);
> }
>
> #else /* GROUP_IOSCHED */
> @@ -3672,8 +3671,7 @@ static void *cfq_init_queue(struct request_queue *q)
> * to make sure that cfq_put_cfqg() does not try to kfree root group
> */
> atomic_set(&cfqg->ref, 1);
> - blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
> - 0);
> + blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, q, 0);
> #endif
> /*
> * Not strictly needed (since RB_ROOT just clears the node and we
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 1/4] blkio_group key change: void * -> request_queue *
2010-03-25 23:25 ` Vivek Goyal
@ 2010-03-26 0:17 ` Chad Talbott
0 siblings, 0 replies; 16+ messages in thread
From: Chad Talbott @ 2010-03-26 0:17 UTC (permalink / raw)
To: Vivek Goyal
Cc: jens.axboe, mrubin, guijianfeng, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
On Thu, Mar 25, 2010 at 4:25 PM, Vivek Goyal <vgoyal@redhat.com> wrote:
> human-readable disk names probably are more convenient. I got few general
> concerns.
Thanks for the quick and detailed review.
> - Can we change the sysfs interface now. In 2.6.33 kernel we released the
> code to export blkio.time and blkio.sectors using device numbers. We
> shall have to change those interfaces also to reflect device stats in
> terms of device names.
I wasn't aware of those interface, but I'm all for consistency. I'll
look into it.
> - I had kept void* as the key in blkg object so that it does not make any
> assumption about the key. This allowed that any xyz code in kernel can
> register with blkio code and implement some kind of IO control policy
> and it does not have to instanciate a request queue.
>
> But I am not sure if there will be any subsystems which will really do
> that. I am assuming that all the device mapper and md code do
> instanciate the request queue? (In case we implement max bw policy
> there).
I understand your preference to keep the key generic, but I wonder if
there is *any* device that won't have a request_queue? The penalty of
keeping it very generic might be the complexity of another auxiliary
structure to lookup gendisks or the next thing. I wish the device
mapper folks would speak up and be involved in these discussions.
> - How do you make sure that request queue does not go away and while
> somebody is accessing blkg->queue under rcu read lock? This can happen
> while we call unlink code.
Here I'll admit to being very new to RCU, and I'll defer to your
worries. More homework needed on my part.
> blkio_unlink_group_fn().
>
> Because we store the pointer to cfqd as key, we make sure cfqd does not
> get deleted before one rcu grace period. (call_rcu). But this gurantees
> only that cfqd object is around and not cfqd->queue. So this is a
> problem with even today's code.
>
> ...
> - How do you make sure that gendisk does not go away while q->disk is
> being accessed under rcu lock. (Already asked in other thread too.)
On locking between gendisk deletion and stats printing: I suppose that
you've already considered and discarded using the queue_lock to
protect a gendisk pointer in request_queue?
> These are some quick thoughs. More later...
Thanks again!
Chad
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/4] io-controller: Use names rather than major:minor
2010-03-25 18:04 [PATCH 0/4] io-controller: Use names rather than major:minor Chad Talbott
` (3 preceding siblings ...)
2010-03-25 18:04 ` [PATCH 4/4] Use disk-names to set blkio.weight_device policy Chad Talbott
@ 2010-03-26 1:31 ` Gui Jianfeng
2010-03-26 15:20 ` Vivek Goyal
4 siblings, 1 reply; 16+ messages in thread
From: Gui Jianfeng @ 2010-03-26 1:31 UTC (permalink / raw)
To: Chad Talbott, vgoyal, jens.axboe
Cc: mrubin, Li Zefan, linux-kernel, dpshah, Nauman Rafique
Chad Talbott wrote:
> This stack (which includes Gui's patch for per-device weights),
> changes the various blkio.* stats to use the disks' names (sda, sdb,
> etc.) rather than major:minor pairs when printing statistics.
Hi Chad,
I have the same concern with Vivek. We'v already exported device number
pair to user since 2.6.33, It's better to keep the original ABI.
> Additionally setting a per-device policy can be done via the disk's
> name.
To keep things simple, can we add new API in block layer to lookup
device name by device number as following.
+int blk_lookup_devname(dev_t devt, char *name)
+{
+ struct class_dev_iter iter;
+ struct device *dev;
+ struct gendisk *disk;
+
+ class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+ while ((dev = class_dev_iter_next(&iter))) {
+ if (dev->devt != devt)
+ continue;
+
+ disk = dev_to_disk(dev);
+
+ disk_name(disk, 0, name);
+
+ return 0;
+ }
+ class_dev_iter_exit(&iter);
+
+ return 1;
+}
+EXPORT_SYMBOL(blk_lookup_devname);
+
So we can keep dev_t in blkio layer, and export to user a device name by calling
this function. Also, we retrive device number by calling blk_lookup_devt().
This change might keep things much simple. Jens, do you have any thoughts?
Thanks,
Gui
>
> This has the side effect of fixing the "root cgroup shows no stats"
> problem that Ricky mentioned.
>
> Chad
>
> ---
>
> Chad Talbott (4):
> blkio_group key change: void * -> request_queue *
> Adds an RCU-protected pointer to request_queue that makes it easy to
> io-controller: Add a new interface "weight_device" for IO-Controller
> Use disk-names to set blkio.weight_device policy
>
>
> block/blk-cgroup.c | 224 +++++++++++++++++++++++++++++++++++++++++++++---
> block/blk-cgroup.h | 23 ++++-
> block/blk-sysfs.c | 4 +
> block/cfq-iosched.c | 27 ++----
> include/linux/blkdev.h | 6 +
> 5 files changed, 248 insertions(+), 36 deletions(-)
>
>
>
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/4] io-controller: Use names rather than major:minor
2010-03-26 1:31 ` [PATCH 0/4] io-controller: Use names rather than major:minor Gui Jianfeng
@ 2010-03-26 15:20 ` Vivek Goyal
2010-03-26 22:54 ` Chad Talbott
0 siblings, 1 reply; 16+ messages in thread
From: Vivek Goyal @ 2010-03-26 15:20 UTC (permalink / raw)
To: Gui Jianfeng
Cc: Chad Talbott, jens.axboe, mrubin, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
On Fri, Mar 26, 2010 at 09:31:41AM +0800, Gui Jianfeng wrote:
> Chad Talbott wrote:
> > This stack (which includes Gui's patch for per-device weights),
> > changes the various blkio.* stats to use the disks' names (sda, sdb,
> > etc.) rather than major:minor pairs when printing statistics.
>
> Hi Chad,
>
> I have the same concern with Vivek. We'v already exported device number
> pair to user since 2.6.33, It's better to keep the original ABI.
>
> > Additionally setting a per-device policy can be done via the disk's
> > name.
>
> To keep things simple, can we add new API in block layer to lookup
> device name by device number as following.
>
> +int blk_lookup_devname(dev_t devt, char *name)
> +{
> + struct class_dev_iter iter;
> + struct device *dev;
> + struct gendisk *disk;
> +
> + class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
> + while ((dev = class_dev_iter_next(&iter))) {
> + if (dev->devt != devt)
> + continue;
> +
> + disk = dev_to_disk(dev);
> +
> + disk_name(disk, 0, name);
> +
> + return 0;
> + }
> + class_dev_iter_exit(&iter);
> +
> + return 1;
> +}
> +EXPORT_SYMBOL(blk_lookup_devname);
> +
>
> So we can keep dev_t in blkio layer, and export to user a device name by calling
> this function. Also, we retrive device number by calling blk_lookup_devt().
> This change might keep things much simple. Jens, do you have any thoughts?
>
Hi Chad,
I agree with Gui that lets keep the dev_t the core in blkio layer. Keeping
a pointer to gendisk in request queue is becoming little messy.
To me device major and minor numbers are just fine. Looking it up with is
really easy. If you want to put a policy on /dev/sda or just use "stat -c
%t:%T /dev/sda". Or just do cat /sys/block/sda/dev.
But if that does not work for you, then I would also like to keep things
simple and translate dev_t to diskname during read routine. Similiarly,
while somebody is putting policy, use blk_lookup_devt().
But this will lead to issue of how do you now display both device number
and disk name in the output. May be following.
major:minor diskname data
I am not sure if people are fond of multiple values in a single file. At
the same time for setting the rules or deleting the rules, it will make
syntax complicated/confusing. Also will require breaking ABI for existing
blkio.time, blkio.sectors, blkio.dequeue files.
So I would prefer to keep the major/minor number based interface for
follwing reasons.
- Chaning it now breaks ABI.
- Other cgroup controller "device" is also using major/minor number based
interface for device access policy. So it is consistent with other
controller.
- Displaying both device major/minor and diskname is an option but that
makes the file format syntax little complicated and new rule setting
or removoal confusing.
Thanks
Vivek
> Thanks,
> Gui
>
> >
> > This has the side effect of fixing the "root cgroup shows no stats"
> > problem that Ricky mentioned.
> >
> > Chad
> >
> > ---
> >
> > Chad Talbott (4):
> > blkio_group key change: void * -> request_queue *
> > Adds an RCU-protected pointer to request_queue that makes it easy to
> > io-controller: Add a new interface "weight_device" for IO-Controller
> > Use disk-names to set blkio.weight_device policy
> >
> >
> > block/blk-cgroup.c | 224 +++++++++++++++++++++++++++++++++++++++++++++---
> > block/blk-cgroup.h | 23 ++++-
> > block/blk-sysfs.c | 4 +
> > block/cfq-iosched.c | 27 ++----
> > include/linux/blkdev.h | 6 +
> > 5 files changed, 248 insertions(+), 36 deletions(-)
> >
> >
> >
> >
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/4] io-controller: Use names rather than major:minor
2010-03-26 15:20 ` Vivek Goyal
@ 2010-03-26 22:54 ` Chad Talbott
2010-03-26 23:21 ` Divyesh Shah
2010-03-27 0:20 ` Vivek Goyal
0 siblings, 2 replies; 16+ messages in thread
From: Chad Talbott @ 2010-03-26 22:54 UTC (permalink / raw)
To: Vivek Goyal
Cc: Gui Jianfeng, jens.axboe, mrubin, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
On Fri, Mar 26, 2010 at 8:20 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
> On Fri, Mar 26, 2010 at 09:31:41AM +0800, Gui Jianfeng wrote:
>> +int blk_lookup_devname(dev_t devt, char *name)
>> +{
[ snip... loop through all block devices for devt ...snip ]
>> So we can keep dev_t in blkio layer, and export to user a device name by calling
>> this function. Also, we retrive device number by calling blk_lookup_devt().
>> This change might keep things much simple. Jens, do you have any thoughts?
>>
> I agree with Gui that lets keep the dev_t the core in blkio layer. Keeping
> a pointer to gendisk in request queue is becoming little messy.
Agreed on leaving gendisk pointer out of request_queue. In doing
further investigation, I've found that it's up to the driver to
maintain the association between gendisk and request_queue, and some
drivers put multiple gendisk behind a single request_queue, so the
back pointer would be ill-specified.
> But if that does not work for you, then I would also like to keep things
> simple and translate dev_t to diskname during read routine. Similiarly,
> while somebody is putting policy, use blk_lookup_devt().
I like the simplicity of blk_lookup_devt(), but I don't like the idea
of iterating through all block devices on every lookup of the name.
Perhaps we could cache the name somewhere?
Actually, the name is the name of the *queue* (or the key in
blk-cgroup), because as I mentioned above there can be a many to one
relationship between disks and queues in general.
The more I think about it, the more it seems to make sense to extend
blkio_policy_ops to include a function to get the name of the key.
blk-cgroup makes no current use of the dev, except to invent a name
for the request_queue whose policy is being set or printed. It could
be argued that the thing being scheduled has a better idea of the name
of that thing.
> But this will lead to issue of how do you now display both device number
> and disk name in the output. May be following.
>
> major:minor diskname data
>
> I am not sure if people are fond of multiple values in a single file. At
> the same time for setting the rules or deleting the rules, it will make
> syntax complicated/confusing. Also will require breaking ABI for existing
> blkio.time, blkio.sectors, blkio.dequeue files.
I don't like this, either. It breaks ABI and is more confusing for users.
> So I would prefer to keep the major/minor number based interface for
> follwing reasons.
>
> - Chaning it now breaks ABI.
> - Other cgroup controller "device" is also using major/minor number based
> interface for device access policy. So it is consistent with other
> controller.
Which controllers are these?
> - Displaying both device major/minor and diskname is an option but that
> makes the file format syntax little complicated and new rule setting
> or removoal confusing.
A few messages back you mentioned that you preferred device names
because they would be better for users of the system. If there was a
simple implementation, would you still be behind a new name-based
interface? We could go that direction and maintain ABI by deprecating
current interface and making a new interface with names.
If you can't tell, I'm a big fan of using the name! :) It's *much*
more consistent with the interfaces in /sys.
Chad
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/4] io-controller: Use names rather than major:minor
2010-03-26 22:54 ` Chad Talbott
@ 2010-03-26 23:21 ` Divyesh Shah
2010-03-27 0:28 ` Vivek Goyal
2010-03-27 0:20 ` Vivek Goyal
1 sibling, 1 reply; 16+ messages in thread
From: Divyesh Shah @ 2010-03-26 23:21 UTC (permalink / raw)
To: Chad Talbott
Cc: Vivek Goyal, Gui Jianfeng, jens.axboe, mrubin, Li Zefan,
linux-kernel, Nauman Rafique
On Fri, Mar 26, 2010 at 3:54 PM, Chad Talbott <ctalbott@google.com> wrote:
> On Fri, Mar 26, 2010 at 8:20 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
>> On Fri, Mar 26, 2010 at 09:31:41AM +0800, Gui Jianfeng wrote:
>>> +int blk_lookup_devname(dev_t devt, char *name)
>>> +{
>
> [ snip... loop through all block devices for devt ...snip ]
>
>>> So we can keep dev_t in blkio layer, and export to user a device name by calling
>>> this function. Also, we retrive device number by calling blk_lookup_devt().
>>> This change might keep things much simple. Jens, do you have any thoughts?
>>>
>> I agree with Gui that lets keep the dev_t the core in blkio layer. Keeping
>> a pointer to gendisk in request queue is becoming little messy.
>
> Agreed on leaving gendisk pointer out of request_queue. In doing
> further investigation, I've found that it's up to the driver to
> maintain the association between gendisk and request_queue, and some
> drivers put multiple gendisk behind a single request_queue, so the
> back pointer would be ill-specified.
>
>> But if that does not work for you, then I would also like to keep things
>> simple and translate dev_t to diskname during read routine. Similiarly,
>> while somebody is putting policy, use blk_lookup_devt().
>
> I like the simplicity of blk_lookup_devt(), but I don't like the idea
> of iterating through all block devices on every lookup of the name.
> Perhaps we could cache the name somewhere?
>
> Actually, the name is the name of the *queue* (or the key in
> blk-cgroup), because as I mentioned above there can be a many to one
> relationship between disks and queues in general.
>
> The more I think about it, the more it seems to make sense to extend
> blkio_policy_ops to include a function to get the name of the key.
> blk-cgroup makes no current use of the dev, except to invent a name
> for the request_queue whose policy is being set or printed. It could
> be argued that the thing being scheduled has a better idea of the name
> of that thing.
>
>> But this will lead to issue of how do you now display both device number
>> and disk name in the output. May be following.
>>
>> major:minor diskname data
>>
>> I am not sure if people are fond of multiple values in a single file. At
>> the same time for setting the rules or deleting the rules, it will make
>> syntax complicated/confusing. Also will require breaking ABI for existing
>> blkio.time, blkio.sectors, blkio.dequeue files.
>
> I don't like this, either. It breaks ABI and is more confusing for users.
>
>> So I would prefer to keep the major/minor number based interface for
>> follwing reasons.
>>
>> - Chaning it now breaks ABI.
>> - Other cgroup controller "device" is also using major/minor number based
>> interface for device access policy. So it is consistent with other
>> controller.
>
> Which controllers are these?
>
>> - Displaying both device major/minor and diskname is an option but that
>> makes the file format syntax little complicated and new rule setting
>> or removoal confusing.
>
> A few messages back you mentioned that you preferred device names
> because they would be better for users of the system. If there was a
> simple implementation, would you still be behind a new name-based
> interface? We could go that direction and maintain ABI by deprecating
> current interface and making a new interface with names.
>
> If you can't tell, I'm a big fan of using the name! :) It's *much*
> more consistent with the interfaces in /sys.
I agree with Chad here. The major/minor number interface to me seems
like a departure from convention as /proc/diskstat, /sys/block all use
the device names at the kernel-user interface. About deprecating the
current ABI, we could do that but do we expect a lot of user tools to
be built around this interface since the 2.6.33 release already?
-Divyesh
>
> Chad
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/4] io-controller: Use names rather than major:minor
2010-03-26 22:54 ` Chad Talbott
2010-03-26 23:21 ` Divyesh Shah
@ 2010-03-27 0:20 ` Vivek Goyal
2010-03-27 0:24 ` Vivek Goyal
1 sibling, 1 reply; 16+ messages in thread
From: Vivek Goyal @ 2010-03-27 0:20 UTC (permalink / raw)
To: Chad Talbott
Cc: Gui Jianfeng, jens.axboe, mrubin, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
On Fri, Mar 26, 2010 at 03:54:40PM -0700, Chad Talbott wrote:
> On Fri, Mar 26, 2010 at 8:20 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
> > On Fri, Mar 26, 2010 at 09:31:41AM +0800, Gui Jianfeng wrote:
> >> +int blk_lookup_devname(dev_t devt, char *name)
> >> +{
>
> [ snip... loop through all block devices for devt ...snip ]
>
> >> So we can keep dev_t in blkio layer, and export to user a device name by calling
> >> this function. Also, we retrive device number by calling blk_lookup_devt().
> >> This change might keep things much simple. Jens, do you have any thoughts?
> >>
> > I agree with Gui that lets keep the dev_t the core in blkio layer. Keeping
> > a pointer to gendisk in request queue is becoming little messy.
>
> Agreed on leaving gendisk pointer out of request_queue. In doing
> further investigation, I've found that it's up to the driver to
> maintain the association between gendisk and request_queue, and some
> drivers put multiple gendisk behind a single request_queue, so the
> back pointer would be ill-specified.
>
> > But if that does not work for you, then I would also like to keep things
> > simple and translate dev_t to diskname during read routine. Similiarly,
> > while somebody is putting policy, use blk_lookup_devt().
>
> I like the simplicity of blk_lookup_devt(), but I don't like the idea
> of iterating through all block devices on every lookup of the name.
> Perhaps we could cache the name somewhere?
Setting up the policies is not a fast path. So iterating through
devices probably is not a bad idea. Caching infact will make it more
complicated.
It is no different than than /proc/diskstat which is also iterating
through all the devices.
In fact blk_lookup_devt() is also doing the same thing, iterating
through all the block_class devices.
>
> Actually, the name is the name of the *queue* (or the key in
> blk-cgroup), because as I mentioned above there can be a many to one
> relationship between disks and queues in general.
>
What is the significance of one queue serving to multiple gendisk
structures. How this relationship is handled in /sys/block?
> The more I think about it, the more it seems to make sense to extend
> blkio_policy_ops to include a function to get the name of the key.
> blk-cgroup makes no current use of the dev, except to invent a name
> for the request_queue whose policy is being set or printed. It could
> be argued that the thing being scheduled has a better idea of the name
> of that thing.
request_queue is something internal to block layer. To a user, block
deivce will make sense (device file, major:minor, and sysfs disk name).
So keeping a name for request queue also and implmeneting a function
in blkio_policy_ops to retrieve that name, does not make much sense
to me.
>
> > But this will lead to issue of how do you now display both device number
> > and disk name in the output. May be following.
> >
> > major:minor diskname data
> >
> > I am not sure if people are fond of multiple values in a single file. At
> > the same time for setting the rules or deleting the rules, it will make
> > syntax complicated/confusing. Also will require breaking ABI for existing
> > blkio.time, blkio.sectors, blkio.dequeue files.
>
> I don't like this, either. It breaks ABI and is more confusing for users.
>
> > So I would prefer to keep the major/minor number based interface for
> > follwing reasons.
> >
> > - Chaning it now breaks ABI.
> > - Other cgroup controller "device" is also using major/minor number based
> > interface for device access policy. So it is consistent with other
> > controller.
>
> Which controllers are these?
>
linux-2.6/Documentation/cgroups/devices.txt
> > - Displaying both device major/minor and diskname is an option but that
> > makes the file format syntax little complicated and new rule setting
> > or removoal confusing.
>
> A few messages back you mentioned that you preferred device names
> because they would be better for users of the system. If there was a
> simple implementation, would you still be behind a new name-based
> interface? We could go that direction and maintain ABI by deprecating
> current interface and making a new interface with names.
>
> If you can't tell, I'm a big fan of using the name! :) It's *much*
> more consistent with the interfaces in /sys.
/sys provides facility to access device both through device number
(/sys/dev/block/<major:minor>) and disk name (/sys/block/<diskname>). So
I don't know why do you think it is more consistent with /sys if we
use diskname.
In general user space seems to be accessing devices using device files.
For example, blockdev utility. These device file inodes in turn store
major/minor number of device and then get to respective gendisk object.
I think one advantage of major:minor number scheme is that it is simple
and there is no confusion. For example, looks like device mapper layer
can create multiple device files for same disk. On my system, multipath target
has created /dev/mapper/mpathe and also /dev/dm-3 which is referring to
same device. /sys/block has dm-3 created under it.
brw-rw---- 1 root disk 253, 3 2010-03-25 22:57 /dev/mapper/mpathe
brw-rw---- 1 root disk 253, 3 2010-03-25 22:57 /dev/dm-3
Now lets somebody is accessing the device using /dev/mapper/mpathe, he
can easily extract the major:minor of this device and see if there is
any rule for that device. If we keep rules in disk name, then application
shall have to go through the all the /sys/block/diskname entries and
first figure out diskname and then grep for that disk name in rules.
So in case of multiple device files for same disk, I think retrieving
major/minor and then doing lookup in rules file is easier. Keeping rules
in terms of disk names is more intutive in the case of single device
file and diskname being same as device file (/dev/sda).
So because major:minor number scheme is simple, does not break ABI, is
consistent with "devices" cgroup controller, works well in case of
multiple device files for same disk, I kind of like it and wouldn't
mind continuing with it.
Thanks
Vivek
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/4] io-controller: Use names rather than major:minor
2010-03-27 0:20 ` Vivek Goyal
@ 2010-03-27 0:24 ` Vivek Goyal
2010-03-27 0:30 ` Vivek Goyal
0 siblings, 1 reply; 16+ messages in thread
From: Vivek Goyal @ 2010-03-27 0:24 UTC (permalink / raw)
To: Chad Talbott
Cc: Gui Jianfeng, jens.axboe, mrubin, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
On Fri, Mar 26, 2010 at 08:20:56PM -0400, Vivek Goyal wrote:
[..]
> > Which controllers are these?
> >
>
> linux-2.6/Documentation/cgroups/devices.txt
>
> > > - Displaying both device major/minor and diskname is an option but that
> > > makes the file format syntax little complicated and new rule setting
> > > or removoal confusing.
> >
> > A few messages back you mentioned that you preferred device names
> > because they would be better for users of the system. If there was a
> > simple implementation, would you still be behind a new name-based
> > interface? We could go that direction and maintain ABI by deprecating
> > current interface and making a new interface with names.
> >
> > If you can't tell, I'm a big fan of using the name! :) It's *much*
> > more consistent with the interfaces in /sys.
>
> /sys provides facility to access device both through device number
> (/sys/dev/block/<major:minor>) and disk name (/sys/block/<diskname>). So
> I don't know why do you think it is more consistent with /sys if we
> use diskname.
>
> In general user space seems to be accessing devices using device files.
> For example, blockdev utility.
One more example is "blktrace".
Thanks
Vivek
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/4] io-controller: Use names rather than major:minor
2010-03-26 23:21 ` Divyesh Shah
@ 2010-03-27 0:28 ` Vivek Goyal
0 siblings, 0 replies; 16+ messages in thread
From: Vivek Goyal @ 2010-03-27 0:28 UTC (permalink / raw)
To: Divyesh Shah
Cc: Chad Talbott, Gui Jianfeng, jens.axboe, mrubin, Li Zefan,
linux-kernel, Nauman Rafique
On Fri, Mar 26, 2010 at 04:21:41PM -0700, Divyesh Shah wrote:
> On Fri, Mar 26, 2010 at 3:54 PM, Chad Talbott <ctalbott@google.com> wrote:
> > On Fri, Mar 26, 2010 at 8:20 AM, Vivek Goyal <vgoyal@redhat.com> wrote:
> >> On Fri, Mar 26, 2010 at 09:31:41AM +0800, Gui Jianfeng wrote:
> >>> +int blk_lookup_devname(dev_t devt, char *name)
> >>> +{
> >
> > [ snip... loop through all block devices for devt ...snip ]
> >
> >>> So we can keep dev_t in blkio layer, and export to user a device name by calling
> >>> this function. Also, we retrive device number by calling blk_lookup_devt().
> >>> This change might keep things much simple. Jens, do you have any thoughts?
> >>>
> >> I agree with Gui that lets keep the dev_t the core in blkio layer. Keeping
> >> a pointer to gendisk in request queue is becoming little messy.
> >
> > Agreed on leaving gendisk pointer out of request_queue. In doing
> > further investigation, I've found that it's up to the driver to
> > maintain the association between gendisk and request_queue, and some
> > drivers put multiple gendisk behind a single request_queue, so the
> > back pointer would be ill-specified.
> >
> >> But if that does not work for you, then I would also like to keep things
> >> simple and translate dev_t to diskname during read routine. Similiarly,
> >> while somebody is putting policy, use blk_lookup_devt().
> >
> > I like the simplicity of blk_lookup_devt(), but I don't like the idea
> > of iterating through all block devices on every lookup of the name.
> > Perhaps we could cache the name somewhere?
> >
> > Actually, the name is the name of the *queue* (or the key in
> > blk-cgroup), because as I mentioned above there can be a many to one
> > relationship between disks and queues in general.
> >
> > The more I think about it, the more it seems to make sense to extend
> > blkio_policy_ops to include a function to get the name of the key.
> > blk-cgroup makes no current use of the dev, except to invent a name
> > for the request_queue whose policy is being set or printed. It could
> > be argued that the thing being scheduled has a better idea of the name
> > of that thing.
> >
> >> But this will lead to issue of how do you now display both device number
> >> and disk name in the output. May be following.
> >>
> >> major:minor diskname data
> >>
> >> I am not sure if people are fond of multiple values in a single file. At
> >> the same time for setting the rules or deleting the rules, it will make
> >> syntax complicated/confusing. Also will require breaking ABI for existing
> >> blkio.time, blkio.sectors, blkio.dequeue files.
> >
> > I don't like this, either. It breaks ABI and is more confusing for users.
> >
> >> So I would prefer to keep the major/minor number based interface for
> >> follwing reasons.
> >>
> >> - Chaning it now breaks ABI.
> >> - Other cgroup controller "device" is also using major/minor number based
> >> interface for device access policy. So it is consistent with other
> >> controller.
> >
> > Which controllers are these?
> >
> >> - Displaying both device major/minor and diskname is an option but that
> >> makes the file format syntax little complicated and new rule setting
> >> or removoal confusing.
> >
> > A few messages back you mentioned that you preferred device names
> > because they would be better for users of the system. If there was a
> > simple implementation, would you still be behind a new name-based
> > interface? We could go that direction and maintain ABI by deprecating
> > current interface and making a new interface with names.
> >
> > If you can't tell, I'm a big fan of using the name! :) It's *much*
> > more consistent with the interfaces in /sys.
>
> I agree with Chad here. The major/minor number interface to me seems
> like a departure from convention as /proc/diskstat,
Both /proc/diskstats and /proc/partitions list first major/minor and
then diskname. So why do you think it is departuture from convention?
> /sys/block all use
> the device names at the kernel-user interface.
/sys provides multiple ways to access samve device. Both using disknames
as well as major:minor number (/sys/dev/block/major:minor).
Vivek
>About deprecating the
> current ABI, we could do that but do we expect a lot of user tools to
> be built around this interface since the 2.6.33 release already?
>
> -Divyesh
>
> >
> > Chad
> >
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/4] io-controller: Use names rather than major:minor
2010-03-27 0:24 ` Vivek Goyal
@ 2010-03-27 0:30 ` Vivek Goyal
0 siblings, 0 replies; 16+ messages in thread
From: Vivek Goyal @ 2010-03-27 0:30 UTC (permalink / raw)
To: Chad Talbott
Cc: Gui Jianfeng, jens.axboe, mrubin, Li Zefan, linux-kernel, dpshah,
Nauman Rafique
On Fri, Mar 26, 2010 at 08:24:04PM -0400, Vivek Goyal wrote:
> On Fri, Mar 26, 2010 at 08:20:56PM -0400, Vivek Goyal wrote:
>
> [..]
> > > Which controllers are these?
> > >
> >
> > linux-2.6/Documentation/cgroups/devices.txt
> >
> > > > - Displaying both device major/minor and diskname is an option but that
> > > > makes the file format syntax little complicated and new rule setting
> > > > or removoal confusing.
> > >
> > > A few messages back you mentioned that you preferred device names
> > > because they would be better for users of the system. If there was a
> > > simple implementation, would you still be behind a new name-based
> > > interface? We could go that direction and maintain ABI by deprecating
> > > current interface and making a new interface with names.
> > >
> > > If you can't tell, I'm a big fan of using the name! :) It's *much*
> > > more consistent with the interfaces in /sys.
> >
> > /sys provides facility to access device both through device number
> > (/sys/dev/block/<major:minor>) and disk name (/sys/block/<diskname>). So
> > I don't know why do you think it is more consistent with /sys if we
> > use diskname.
> >
> > In general user space seems to be accessing devices using device files.
> > For example, blockdev utility.
>
> One more example is "blktrace".
Another example is "hdparm"
Vivek
^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2010-03-27 1:03 UTC | newest]
Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-03-25 18:04 [PATCH 0/4] io-controller: Use names rather than major:minor Chad Talbott
2010-03-25 18:04 ` [PATCH 1/4] blkio_group key change: void * -> request_queue * Chad Talbott
2010-03-25 23:25 ` Vivek Goyal
2010-03-26 0:17 ` Chad Talbott
2010-03-25 18:04 ` [PATCH 2/4] Adds an RCU-protected pointer to request_queue that makes it easy to Chad Talbott
2010-03-25 23:02 ` Vivek Goyal
2010-03-25 18:04 ` [PATCH 3/4] io-controller: Add a new interface "weight_device" for IO-Controller Chad Talbott
2010-03-25 18:04 ` [PATCH 4/4] Use disk-names to set blkio.weight_device policy Chad Talbott
2010-03-26 1:31 ` [PATCH 0/4] io-controller: Use names rather than major:minor Gui Jianfeng
2010-03-26 15:20 ` Vivek Goyal
2010-03-26 22:54 ` Chad Talbott
2010-03-26 23:21 ` Divyesh Shah
2010-03-27 0:28 ` Vivek Goyal
2010-03-27 0:20 ` Vivek Goyal
2010-03-27 0:24 ` Vivek Goyal
2010-03-27 0:30 ` Vivek Goyal
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox