From: Balbir Singh <balbir@linux.vnet.ibm.com>
To: Andrea Righi <righi.andrea@gmail.com>
Cc: Paul Menage <menage@google.com>,
randy.dunlap@oracle.com, Carl Henrik Lunde <chlunde@ping.uio.no>,
eric.rannaud@gmail.com, fernando@oss.ntt.co.jp,
dradford@bluehost.com, agk@sourceware.org,
subrata@linux.vnet.ibm.com, axboe@kernel.dk,
akpm@linux-foundation.org, containers@lists.linux-foundation.org,
linux-kernel@vger.kernel.org, dave@linux.vnet.ibm.com,
matt@bluehost.com, roberto@unbit.it, ngupta@google.com
Subject: Re: [PATCH 3/9] bio-cgroup controller
Date: Fri, 17 Apr 2009 15:52:14 +0530 [thread overview]
Message-ID: <20090417102214.GC3896@balbir.in.ibm.com> (raw)
In-Reply-To: <1239740480-28125-4-git-send-email-righi.andrea@gmail.com>
* Andrea Righi <righi.andrea@gmail.com> [2009-04-14 22:21:14]:
> From: Ryo Tsuruta <ryov@valinux.co.jp>
>
> From: Ryo Tsuruta <ryov@valinux.co.jp>
>
> With writeback IO processed asynchronously by kernel threads (pdflush)
> the real writes to the underlying block devices can occur in a different
> IO context respect to the task that originally generated the dirty
> pages involved in the IO operation.
>
> The controller bio-cgroup is used by io-throttle to track writeback IO
> and for properly apply throttling.
>
> Also apply a patch by Gui Jianfeng to announce tasks moving in
> bio-cgroup groups.
>
> See also: http://people.valinux.co.jp/~ryov/bio-cgroup
>
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> Signed-off-by: Ryo Tsuruta <ryov@valinux.co.jp>
> Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>
> ---
> block/blk-ioc.c | 30 ++--
> fs/buffer.c | 2 +
> fs/direct-io.c | 2 +
> include/linux/biotrack.h | 95 +++++++++++
> include/linux/cgroup_subsys.h | 6 +
> include/linux/iocontext.h | 1 +
> include/linux/memcontrol.h | 6 +
> include/linux/mmzone.h | 4 +-
> include/linux/page_cgroup.h | 13 ++-
> init/Kconfig | 15 ++
> mm/Makefile | 4 +-
> mm/biotrack.c | 349 +++++++++++++++++++++++++++++++++++++++++
> mm/bounce.c | 2 +
> mm/filemap.c | 2 +
> mm/memcontrol.c | 5 +
> mm/memory.c | 5 +
> mm/page-writeback.c | 2 +
> mm/page_cgroup.c | 17 ++-
> mm/swap_state.c | 2 +
> 19 files changed, 536 insertions(+), 26 deletions(-)
> create mode 100644 include/linux/biotrack.h
> create mode 100644 mm/biotrack.c
>
> diff --git a/block/blk-ioc.c b/block/blk-ioc.c
> index 012f065..ef8cac0 100644
> --- a/block/blk-ioc.c
> +++ b/block/blk-ioc.c
> @@ -84,24 +84,28 @@ void exit_io_context(void)
> }
> }
>
> +void init_io_context(struct io_context *ioc)
> +{
> + atomic_set(&ioc->refcount, 1);
> + atomic_set(&ioc->nr_tasks, 1);
> + spin_lock_init(&ioc->lock);
> + ioc->ioprio_changed = 0;
> + ioc->ioprio = 0;
> + ioc->last_waited = jiffies; /* doesn't matter... */
> + ioc->nr_batch_requests = 0; /* because this is 0 */
> + ioc->aic = NULL;
> + INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
> + INIT_HLIST_HEAD(&ioc->cic_list);
> + ioc->ioc_data = NULL;
> +}
> +
> struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
> {
> struct io_context *ret;
>
> ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
> - if (ret) {
> - atomic_set(&ret->refcount, 1);
> - atomic_set(&ret->nr_tasks, 1);
> - spin_lock_init(&ret->lock);
> - ret->ioprio_changed = 0;
> - ret->ioprio = 0;
> - ret->last_waited = jiffies; /* doesn't matter... */
> - ret->nr_batch_requests = 0; /* because this is 0 */
> - ret->aic = NULL;
> - INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
> - INIT_HLIST_HEAD(&ret->cic_list);
> - ret->ioc_data = NULL;
> - }
> + if (ret)
> + init_io_context(ret);
>
Can you split this part of the patch out as a refactoring patch?
> return ret;
> }
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 13edf7a..bc72150 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -36,6 +36,7 @@
> #include <linux/buffer_head.h>
> #include <linux/task_io_accounting_ops.h>
> #include <linux/bio.h>
> +#include <linux/biotrack.h>
> #include <linux/notifier.h>
> #include <linux/cpu.h>
> #include <linux/bitops.h>
> @@ -655,6 +656,7 @@ static void __set_page_dirty(struct page *page,
> if (page->mapping) { /* Race with truncate? */
> WARN_ON_ONCE(warn && !PageUptodate(page));
> account_page_dirtied(page, mapping);
> + bio_cgroup_reset_owner_pagedirty(page, current->mm);
> radix_tree_tag_set(&mapping->page_tree,
> page_index(page), PAGECACHE_TAG_DIRTY);
> }
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index da258e7..ec42362 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -33,6 +33,7 @@
> #include <linux/err.h>
> #include <linux/blkdev.h>
> #include <linux/buffer_head.h>
> +#include <linux/biotrack.h>
> #include <linux/rwsem.h>
> #include <linux/uio.h>
> #include <asm/atomic.h>
> @@ -799,6 +800,7 @@ static int do_direct_IO(struct dio *dio)
> ret = PTR_ERR(page);
> goto out;
> }
> + bio_cgroup_reset_owner(page, current->mm);
>
> while (block_in_page < blocks_per_page) {
> unsigned offset_in_page = block_in_page << blkbits;
> diff --git a/include/linux/biotrack.h b/include/linux/biotrack.h
> new file mode 100644
> index 0000000..25b8810
> --- /dev/null
> +++ b/include/linux/biotrack.h
> @@ -0,0 +1,95 @@
> +#include <linux/cgroup.h>
> +#include <linux/mm.h>
> +#include <linux/page_cgroup.h>
> +
> +#ifndef _LINUX_BIOTRACK_H
> +#define _LINUX_BIOTRACK_H
> +
> +#ifdef CONFIG_CGROUP_BIO
> +
> +struct tsk_move_msg {
> + int old_id;
> + int new_id;
> + struct task_struct *tsk;
> +};
> +
> +extern int register_biocgroup_notifier(struct notifier_block *nb);
> +extern int unregister_biocgroup_notifier(struct notifier_block *nb);
> +
> +struct io_context;
> +struct block_device;
> +
> +struct bio_cgroup {
> + struct cgroup_subsys_state css;
> + int id;
Can't css_id be used here?
> + struct io_context *io_context; /* default io_context */
> +/* struct radix_tree_root io_context_root; per device io_context */
Commented out code? Do you want to remove this.
> +};
> +
> +static inline void __init_bio_page_cgroup(struct page_cgroup *pc)
> +{
> + pc->bio_cgroup_id = 0;
> +}
> +
> +extern struct cgroup *get_cgroup_from_page(struct page *page);
> +extern void put_cgroup_from_page(struct page *page);
> +extern struct cgroup *bio_id_to_cgroup(int id);
> +
> +static inline int bio_cgroup_disabled(void)
> +{
> + return bio_cgroup_subsys.disabled;
> +}
> +
> +extern void bio_cgroup_set_owner(struct page *page, struct mm_struct *mm);
> +extern void bio_cgroup_reset_owner(struct page *page, struct mm_struct *mm);
> +extern void bio_cgroup_reset_owner_pagedirty(struct page *page,
> + struct mm_struct *mm);
> +extern void bio_cgroup_copy_owner(struct page *page, struct page *opage);
> +
> +extern struct io_context *get_bio_cgroup_iocontext(struct bio *bio);
> +extern int get_bio_cgroup_id(struct bio *bio);
> +
> +#else /* CONFIG_CGROUP_BIO */
> +
> +struct bio_cgroup;
> +
Comments? Docbook style would be nice for the functions below.
> +static inline void __init_bio_page_cgroup(struct page_cgroup *pc)
> +{
> +}
> +
> +static inline int bio_cgroup_disabled(void)
> +{
> + return 1;
> +}
> +
> +static inline void bio_cgroup_set_owner(struct page *page, struct mm_struct *mm)
> +{
> +}
> +
> +static inline void bio_cgroup_reset_owner(struct page *page,
> + struct mm_struct *mm)
> +{
> +}
> +
> +static inline void bio_cgroup_reset_owner_pagedirty(struct page *page,
> + struct mm_struct *mm)
> +{
> +}
> +
> +static inline void bio_cgroup_copy_owner(struct page *page, struct page *opage)
> +{
> +}
> +
> +static inline struct io_context *get_bio_cgroup_iocontext(struct bio *bio)
> +{
> + return NULL;
> +}
> +
> +static inline int get_bio_cgroup_id(struct bio *bio)
> +{
> + return 0;
> +}
> +
> +#endif /* CONFIG_CGROUP_BIO */
> +
> +#endif /* _LINUX_BIOTRACK_H */
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index 9c8d31b..5df23f8 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -43,6 +43,12 @@ SUBSYS(mem_cgroup)
>
> /* */
>
> +#ifdef CONFIG_CGROUP_BIO
> +SUBSYS(bio_cgroup)
> +#endif
> +
> +/* */
> +
> #ifdef CONFIG_CGROUP_DEVICE
> SUBSYS(devices)
> #endif
> diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
> index 08b987b..be37c27 100644
> --- a/include/linux/iocontext.h
> +++ b/include/linux/iocontext.h
> @@ -104,6 +104,7 @@ int put_io_context(struct io_context *ioc);
> void exit_io_context(void);
> struct io_context *get_io_context(gfp_t gfp_flags, int node);
> struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
> +void init_io_context(struct io_context *ioc);
> void copy_io_context(struct io_context **pdst, struct io_context **psrc);
> #else
> static inline void exit_io_context(void)
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 18146c9..f3e0e64 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -37,6 +37,8 @@ struct mm_struct;
> * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
> */
>
> +extern void __init_mem_page_cgroup(struct page_cgroup *pc);
> +
> extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
> gfp_t gfp_mask);
> /* for swap handling */
> @@ -120,6 +122,10 @@ extern bool mem_cgroup_oom_called(struct task_struct *task);
> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
> struct mem_cgroup;
>
> +static inline void __init_mem_page_cgroup(struct page_cgroup *pc)
> +{
> +}
> +
> static inline int mem_cgroup_newpage_charge(struct page *page,
> struct mm_struct *mm, gfp_t gfp_mask)
> {
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 186ec6a..47a6f55 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -607,7 +607,7 @@ typedef struct pglist_data {
> int nr_zones;
> #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
> struct page *node_mem_map;
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_CGROUP_PAGE
> struct page_cgroup *node_page_cgroup;
> #endif
> #endif
> @@ -958,7 +958,7 @@ struct mem_section {
>
> /* See declaration of similar field in struct zone */
> unsigned long *pageblock_flags;
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_CGROUP_PAGE
> /*
> * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
> * section. (see memcontrol.h/page_cgroup.h about this.)
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 7339c7b..a7249bb 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -1,7 +1,7 @@
> #ifndef __LINUX_PAGE_CGROUP_H
> #define __LINUX_PAGE_CGROUP_H
>
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_CGROUP_PAGE
> #include <linux/bit_spinlock.h>
> /*
> * Page Cgroup can be considered as an extended mem_map.
> @@ -12,9 +12,16 @@
> */
> struct page_cgroup {
> unsigned long flags;
> - struct mem_cgroup *mem_cgroup;
> struct page *page;
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> + struct mem_cgroup *mem_cgroup;
> +#endif
> +#ifdef CONFIG_CGROUP_BIO
> + int bio_cgroup_id;
> +#endif
> +#if defined(CONFIG_CGROUP_MEM_RES_CTLR) || defined(CONFIG_CGROUP_BIO)
> struct list_head lru; /* per cgroup LRU list */
Do we need the #if defined clause? Anyone using page_cgroup, but not
list_head LRU needs to be explicitly covered when they come up.
> +#endif
> };
>
> void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
> @@ -71,7 +78,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
> bit_spin_unlock(PCG_LOCK, &pc->flags);
> }
>
> -#else /* CONFIG_CGROUP_MEM_RES_CTLR */
> +#else /* CONFIG_CGROUP_PAGE */
> struct page_cgroup;
>
> static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
> diff --git a/init/Kconfig b/init/Kconfig
> index 7be4d38..8f7b23c 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -606,8 +606,23 @@ config CGROUP_MEM_RES_CTLR_SWAP
> Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
> size is 4096bytes, 512k per 1Gbytes of swap.
>
> +config CGROUP_BIO
> + bool "Block I/O cgroup subsystem"
> + depends on CGROUPS && BLOCK
> + select MM_OWNER
> + help
> + Provides a Resource Controller which enables to track the onwner
> + of every Block I/O requests.
> + The information this subsystem provides can be used from any
> + kind of module such as dm-ioband device mapper modules or
> + the cfq-scheduler.
> +
> endif # CGROUPS
>
> +config CGROUP_PAGE
> + def_bool y
> + depends on CGROUP_MEM_RES_CTLR || CGROUP_BIO
> +
> config MM_OWNER
> bool
>
> diff --git a/mm/Makefile b/mm/Makefile
> index ec73c68..a78a437 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -37,4 +37,6 @@ else
> obj-$(CONFIG_SMP) += allocpercpu.o
> endif
> obj-$(CONFIG_QUICKLIST) += quicklist.o
> -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
> +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
> +obj-$(CONFIG_CGROUP_PAGE) += page_cgroup.o
> +obj-$(CONFIG_CGROUP_BIO) += biotrack.o
> diff --git a/mm/biotrack.c b/mm/biotrack.c
> new file mode 100644
> index 0000000..d3a35f1
> --- /dev/null
> +++ b/mm/biotrack.c
> @@ -0,0 +1,349 @@
> +/* biotrack.c - Block I/O Tracking
> + *
> + * Copyright (C) VA Linux Systems Japan, 2008
> + * Developed by Hirokazu Takahashi <taka@valinux.co.jp>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/smp.h>
> +#include <linux/bit_spinlock.h>
> +#include <linux/idr.h>
> +#include <linux/blkdev.h>
> +#include <linux/biotrack.h>
> +
> +#define MOVETASK 0
> +static BLOCKING_NOTIFIER_HEAD(biocgroup_chain);
> +
> +int register_biocgroup_notifier(struct notifier_block *nb)
> +{
> + return blocking_notifier_chain_register(&biocgroup_chain, nb);
> +}
> +EXPORT_SYMBOL(register_biocgroup_notifier);
> +
> +int unregister_biocgroup_notifier(struct notifier_block *nb)
> +{
> + return blocking_notifier_chain_unregister(&biocgroup_chain, nb);
> +}
> +EXPORT_SYMBOL(unregister_biocgroup_notifier);
> +
> +/*
> + * The block I/O tracking mechanism is implemented on the cgroup memory
> + * controller framework. It helps to find the the owner of an I/O request
> + * because every I/O request has a target page and the owner of the page
> + * can be easily determined on the framework.
> + */
> +
> +/* Return the bio_cgroup that associates with a cgroup. */
> +static inline struct bio_cgroup *cgroup_bio(struct cgroup *cgrp)
> +{
> + return container_of(cgroup_subsys_state(cgrp, bio_cgroup_subsys_id),
> + struct bio_cgroup, css);
> +}
> +
> +/* Return the bio_cgroup that associates with a process. */
> +static inline struct bio_cgroup *bio_cgroup_from_task(struct task_struct *p)
> +{
> + return container_of(task_subsys_state(p, bio_cgroup_subsys_id),
> + struct bio_cgroup, css);
> +}
> +
> +static struct idr bio_cgroup_id;
> +static DEFINE_SPINLOCK(bio_cgroup_idr_lock);
> +static struct io_context default_bio_io_context;
> +static struct bio_cgroup default_bio_cgroup = {
> + .id = 0,
> + .io_context = &default_bio_io_context,
> +};
> +
> +/*
> + * This function is used to make a given page have the bio-cgroup id of
> + * the owner of this page.
> + */
> +void bio_cgroup_set_owner(struct page *page, struct mm_struct *mm)
> +{
> + struct bio_cgroup *biog;
> + struct page_cgroup *pc;
> +
> + if (bio_cgroup_disabled())
> + return;
> + pc = lookup_page_cgroup(page);
> + if (unlikely(!pc))
> + return;
> +
Is this routine called with lock_page_cgroup() taken? Otherwise
what protects pc->bio_cgroup_id?
> + pc->bio_cgroup_id = 0; /* 0: default bio_cgroup id */
> + if (!mm)
> + return;
> + /*
> + * Locking "pc" isn't necessary here since the current process is
> + * the only one that can access the members related to bio_cgroup.
> + */
> + rcu_read_lock();
> + biog = bio_cgroup_from_task(rcu_dereference(mm->owner));
> + if (unlikely(!biog))
> + goto out;
> + /*
> + * css_get(&bio->css) isn't called to increment the reference
> + * count of this bio_cgroup "biog" so pc->bio_cgroup_id might turn
> + * invalid even if this page is still active.
> + * This approach is chosen to minimize the overhead.
> + */
> + pc->bio_cgroup_id = biog->id;
What happens without ref count increase we delete the cgroup or css?
> +out:
> + rcu_read_unlock();
> +}
> +
> +/*
> + * Change the owner of a given page if necessary.
> + */
> +void bio_cgroup_reset_owner(struct page *page, struct mm_struct *mm)
> +{
> + /*
> + * A little trick:
> + * Just call bio_cgroup_set_owner() for pages which are already
> + * active since the bio_cgroup_id member of page_cgroup can be
> + * updated without any locks. This is because an integer type of
> + * variable can be set a new value at once on modern cpus.
> + */
> + bio_cgroup_set_owner(page, mm);
> +}
> +
> +/*
> + * Change the owner of a given page. This function is only effective for
> + * pages in the pagecache.
Could you clarify pagecache? mapped/unmapped or both?
> + */
> +void bio_cgroup_reset_owner_pagedirty(struct page *page, struct mm_struct *mm)
> +{
> + if (PageSwapCache(page) || PageAnon(page))
> + return;
Look at page_is_file_cache() depending on the answer above
> + if (current->flags & PF_MEMALLOC)
> + return;
> +
> + bio_cgroup_reset_owner(page, mm);
> +}
> +
> +/*
> + * Assign "page" the same owner as "opage."
> + */
> +void bio_cgroup_copy_owner(struct page *npage, struct page *opage)
> +{
> + struct page_cgroup *npc, *opc;
> +
> + if (bio_cgroup_disabled())
> + return;
> + npc = lookup_page_cgroup(npage);
> + if (unlikely(!npc))
> + return;
> + opc = lookup_page_cgroup(opage);
> + if (unlikely(!opc))
> + return;
> +
> + /*
> + * Do this without any locks. The reason is the same as
> + * bio_cgroup_reset_owner().
> + */
> + npc->bio_cgroup_id = opc->bio_cgroup_id;
What protects npc and opc?
> +}
> +
> +/* Create a new bio-cgroup. */
> +static struct cgroup_subsys_state *
> +bio_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
> +{
> + struct bio_cgroup *biog;
> + struct io_context *ioc;
> + int ret;
> +
> + if (!cgrp->parent) {
> + biog = &default_bio_cgroup;
> + init_io_context(biog->io_context);
> + /* Increment the referrence count not to be released ever. */
> + atomic_inc(&biog->io_context->refcount);
> + idr_init(&bio_cgroup_id);
> + return &biog->css;
> + }
> +
> + biog = kzalloc(sizeof(*biog), GFP_KERNEL);
> + ioc = alloc_io_context(GFP_KERNEL, -1);
> + if (!ioc || !biog) {
> + ret = -ENOMEM;
> + goto out_err;
> + }
> + biog->io_context = ioc;
> +retry:
> + if (!idr_pre_get(&bio_cgroup_id, GFP_KERNEL)) {
> + ret = -EAGAIN;
> + goto out_err;
> + }
> + spin_lock_irq(&bio_cgroup_idr_lock);
> + ret = idr_get_new_above(&bio_cgroup_id, (void *)biog, 1, &biog->id);
> + spin_unlock_irq(&bio_cgroup_idr_lock);
> + if (ret == -EAGAIN)
> + goto retry;
> + else if (ret)
> + goto out_err;
> +
> + return &biog->css;
> +out_err:
> + kfree(biog);
> + if (ioc)
> + put_io_context(ioc);
> + return ERR_PTR(ret);
> +}
> +
> +/* Delete the bio-cgroup. */
> +static void bio_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
> +{
> + struct bio_cgroup *biog = cgroup_bio(cgrp);
> +
> + put_io_context(biog->io_context);
> +
> + spin_lock_irq(&bio_cgroup_idr_lock);
> + idr_remove(&bio_cgroup_id, biog->id);
> + spin_unlock_irq(&bio_cgroup_idr_lock);
> +
> + kfree(biog);
> +}
> +
> +static struct bio_cgroup *find_bio_cgroup(int id)
> +{
> + struct bio_cgroup *biog;
> + spin_lock_irq(&bio_cgroup_idr_lock);
> + /*
> + * It might fail to find A bio-group associated with "id" since it
> + * is allowed to remove the bio-cgroup even when some of I/O requests
> + * this group issued haven't completed yet.
> + */
> + biog = (struct bio_cgroup *)idr_find(&bio_cgroup_id, id);
> + spin_unlock_irq(&bio_cgroup_idr_lock);
> + return biog;
> +}
> +
> +struct cgroup *bio_id_to_cgroup(int id)
> +{
> + struct bio_cgroup *biog;
> +
> + biog = find_bio_cgroup(id);
> + if (biog)
> + return biog->css.cgroup;
> +
> + return NULL;
> +}
> +
> +struct cgroup *get_cgroup_from_page(struct page *page)
> +{
> + struct page_cgroup *pc;
> + struct bio_cgroup *biog;
> + struct cgroup *cgrp = NULL;
> +
> + pc = lookup_page_cgroup(page);
> + if (!pc)
> + return NULL;
> + lock_page_cgroup(pc);
> + biog = find_bio_cgroup(pc->bio_cgroup_id);
> + if (biog) {
> + css_get(&biog->css);
> + cgrp = biog->css.cgroup;
> + }
> + unlock_page_cgroup(pc);
> + return cgrp;
> +}
> +
> +void put_cgroup_from_page(struct page *page)
> +{
> + struct bio_cgroup *biog;
> + struct page_cgroup *pc;
> +
> + pc = lookup_page_cgroup(page);
> + if (!pc)
> + return;
> + lock_page_cgroup(pc);
> + biog = find_bio_cgroup(pc->bio_cgroup_id);
> + if (biog)
> + css_put(&biog->css);
> + unlock_page_cgroup(pc);
> +}
> +
> +/* Determine the bio-cgroup id of a given bio. */
> +int get_bio_cgroup_id(struct bio *bio)
> +{
> + struct page_cgroup *pc;
> + struct page *page = bio_iovec_idx(bio, 0)->bv_page;
> + int id = 0;
> +
> + pc = lookup_page_cgroup(page);
> + if (pc)
> + id = pc->bio_cgroup_id;
> + return id;
> +}
> +EXPORT_SYMBOL(get_bio_cgroup_id);
> +
> +/* Determine the iocontext of the bio-cgroup that issued a given bio. */
> +struct io_context *get_bio_cgroup_iocontext(struct bio *bio)
> +{
> + struct bio_cgroup *biog = NULL;
> + struct io_context *ioc;
> + int id = 0;
> +
> + id = get_bio_cgroup_id(bio);
> + if (id)
> + biog = find_bio_cgroup(id);
> + if (!biog)
> + biog = &default_bio_cgroup;
> + ioc = biog->io_context; /* default io_context for this cgroup */
> + atomic_inc(&ioc->refcount);
> + return ioc;
> +}
> +EXPORT_SYMBOL(get_bio_cgroup_iocontext);
> +
> +static u64 bio_id_read(struct cgroup *cgrp, struct cftype *cft)
> +{
> + struct bio_cgroup *biog = cgroup_bio(cgrp);
> + return (u64) biog->id;
> +}
> +
> +
> +static struct cftype bio_files[] = {
> + {
> + .name = "id",
> + .read_u64 = bio_id_read,
> + },
> +};
> +
> +static int bio_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> +{
> + return cgroup_add_files(cgrp, ss, bio_files, ARRAY_SIZE(bio_files));
> +}
> +
> +static void bio_cgroup_attach(struct cgroup_subsys *ss,
> + struct cgroup *cont, struct cgroup *oldcont,
> + struct task_struct *tsk)
> +{
> + struct tsk_move_msg tmm;
> + struct bio_cgroup *old_biog, *new_biog;
> +
> + old_biog = cgroup_bio(oldcont);
> + new_biog = cgroup_bio(cont);
> + tmm.old_id = old_biog->id;
> + tmm.new_id = new_biog->id;
> + tmm.tsk = tsk;
> + blocking_notifier_call_chain(&biocgroup_chain, MOVETASK, &tmm);
> +}
> +
> +struct cgroup_subsys bio_cgroup_subsys = {
> + .name = "bio",
> + .create = bio_cgroup_create,
> + .destroy = bio_cgroup_destroy,
> + .populate = bio_cgroup_populate,
> + .attach = bio_cgroup_attach,
> + .subsys_id = bio_cgroup_subsys_id,
> +};
> +
> diff --git a/mm/bounce.c b/mm/bounce.c
> index e590272..1a01905 100644
> --- a/mm/bounce.c
> +++ b/mm/bounce.c
> @@ -14,6 +14,7 @@
> #include <linux/hash.h>
> #include <linux/highmem.h>
> #include <linux/blktrace_api.h>
> +#include <linux/biotrack.h>
> #include <trace/block.h>
> #include <asm/tlbflush.h>
>
> @@ -212,6 +213,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
> to->bv_len = from->bv_len;
> to->bv_offset = from->bv_offset;
> inc_zone_page_state(to->bv_page, NR_BOUNCE);
> + bio_cgroup_copy_owner(to->bv_page, page);
>
> if (rw == WRITE) {
> char *vto, *vfrom;
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 8bd4980..1ab32a2 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -33,6 +33,7 @@
> #include <linux/cpuset.h>
> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
> #include <linux/memcontrol.h>
> +#include <linux/biotrack.h>
> #include <linux/mm_inline.h> /* for page_is_file_cache() */
> #include "internal.h"
>
> @@ -463,6 +464,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
> gfp_mask & GFP_RECLAIM_MASK);
> if (error)
> goto out;
> + bio_cgroup_set_owner(page, current->mm);
>
> error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
> if (error == 0) {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e44fb0f..c25eb63 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2524,6 +2524,11 @@ struct cgroup_subsys mem_cgroup_subsys = {
> .use_id = 1,
> };
>
> +void __meminit __init_mem_page_cgroup(struct page_cgroup *pc)
> +{
> + pc->mem_cgroup = NULL;
> +}
> +
> #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
>
> static int __init disable_swap_account(char *s)
> diff --git a/mm/memory.c b/mm/memory.c
> index cf6873e..7779e12 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -51,6 +51,7 @@
> #include <linux/init.h>
> #include <linux/writeback.h>
> #include <linux/memcontrol.h>
> +#include <linux/biotrack.h>
> #include <linux/mmu_notifier.h>
> #include <linux/kallsyms.h>
> #include <linux/swapops.h>
> @@ -2052,6 +2053,7 @@ gotten:
> * thread doing COW.
> */
> ptep_clear_flush_notify(vma, address, page_table);
> + bio_cgroup_set_owner(new_page, mm);
> page_add_new_anon_rmap(new_page, vma, address);
> set_pte_at(mm, address, page_table, entry);
> update_mmu_cache(vma, address, entry);
> @@ -2497,6 +2499,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
> flush_icache_page(vma, page);
> set_pte_at(mm, address, page_table, pte);
> page_add_anon_rmap(page, vma, address);
> + bio_cgroup_reset_owner(page, mm);
> /* It's better to call commit-charge after rmap is established */
> mem_cgroup_commit_charge_swapin(page, ptr);
>
> @@ -2559,6 +2562,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
> if (!pte_none(*page_table))
> goto release;
> inc_mm_counter(mm, anon_rss);
> + bio_cgroup_set_owner(page, mm);
> page_add_new_anon_rmap(page, vma, address);
> set_pte_at(mm, address, page_table, entry);
>
> @@ -2711,6 +2715,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> if (anon) {
> inc_mm_counter(mm, anon_rss);
> + bio_cgroup_set_owner(page, mm);
> page_add_new_anon_rmap(page, vma, address);
> } else {
> inc_mm_counter(mm, file_rss);
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 30351f0..1379eb0 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -26,6 +26,7 @@
> #include <linux/blkdev.h>
> #include <linux/mpage.h>
> #include <linux/rmap.h>
> +#include <linux/biotrack.h>
> #include <linux/percpu.h>
> #include <linux/notifier.h>
> #include <linux/smp.h>
> @@ -1243,6 +1244,7 @@ int __set_page_dirty_nobuffers(struct page *page)
> BUG_ON(mapping2 != mapping);
> WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
> account_page_dirtied(page, mapping);
> + bio_cgroup_reset_owner_pagedirty(page, current->mm);
> radix_tree_tag_set(&mapping->page_tree,
> page_index(page), PAGECACHE_TAG_DIRTY);
> }
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 791905c..f692ee2 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -9,13 +9,16 @@
> #include <linux/vmalloc.h>
> #include <linux/cgroup.h>
> #include <linux/swapops.h>
> +#include <linux/memcontrol.h>
> +#include <linux/biotrack.h>
>
> static void __meminit
> __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
> {
> pc->flags = 0;
> - pc->mem_cgroup = NULL;
> pc->page = pfn_to_page(pfn);
> + __init_mem_page_cgroup(pc);
> + __init_bio_page_cgroup(pc);
> INIT_LIST_HEAD(&pc->lru);
> }
> static unsigned long total_usage;
> @@ -74,7 +77,7 @@ void __init page_cgroup_init(void)
>
> int nid, fail;
>
> - if (mem_cgroup_disabled())
> + if (mem_cgroup_disabled() && bio_cgroup_disabled())
> return;
>
> for_each_online_node(nid) {
> @@ -83,12 +86,12 @@ void __init page_cgroup_init(void)
> goto fail;
> }
> printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> - printk(KERN_INFO "please try cgroup_disable=memory option if you"
> + printk(KERN_INFO "please try cgroup_disable=memory,bio option if you"
> " don't want\n");
> return;
> fail:
> printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
> - printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
> + printk(KERN_CRIT "please try cgroup_disable=memory,bio boot options\n");
> panic("Out of memory");
> }
>
> @@ -248,7 +251,7 @@ void __init page_cgroup_init(void)
> unsigned long pfn;
> int fail = 0;
>
> - if (mem_cgroup_disabled())
> + if (mem_cgroup_disabled() && bio_cgroup_disabled())
> return;
>
> for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
> @@ -263,8 +266,8 @@ void __init page_cgroup_init(void)
> hotplug_memory_notifier(page_cgroup_callback, 0);
> }
> printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> - printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
> - " want\n");
> + printk(KERN_INFO
> + "try cgroup_disable=memory,bio option if you don't want\n");
> }
>
> void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 3ecea98..c7ad256 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -17,6 +17,7 @@
> #include <linux/backing-dev.h>
> #include <linux/pagevec.h>
> #include <linux/migrate.h>
> +#include <linux/biotrack.h>
> #include <linux/page_cgroup.h>
>
> #include <asm/pgtable.h>
> @@ -308,6 +309,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> */
> __set_page_locked(new_page);
> SetPageSwapBacked(new_page);
> + bio_cgroup_set_owner(new_page, current->mm);
> err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
> if (likely(!err)) {
> /*
> --
> 1.5.6.3
>
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
>
--
Balbir
next prev parent reply other threads:[~2009-04-17 10:31 UTC|newest]
Thread overview: 104+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-04-14 20:21 [PATCH 0/9] cgroup: io-throttle controller (v13) Andrea Righi
2009-04-14 20:21 ` [PATCH 1/9] io-throttle documentation Andrea Righi
2009-04-17 1:24 ` KAMEZAWA Hiroyuki
2009-04-17 1:56 ` Li Zefan
2009-04-17 10:25 ` Andrea Righi
2009-04-17 10:41 ` Andrea Righi
2009-04-17 11:35 ` Fernando Luis Vázquez Cao
2009-04-20 9:38 ` Ryo Tsuruta
2009-04-20 15:00 ` Andrea Righi
2009-04-27 10:45 ` Ryo Tsuruta
2009-04-27 12:15 ` Ryo Tsuruta
2009-04-27 21:56 ` Andrea Righi
2009-04-17 7:34 ` Gui Jianfeng
2009-04-17 7:43 ` KAMEZAWA Hiroyuki
2009-04-17 9:29 ` Gui Jianfeng
2009-04-17 9:55 ` Andrea Righi
2009-04-17 17:39 ` Vivek Goyal
2009-04-17 23:12 ` Andrea Righi
2009-04-19 13:42 ` Vivek Goyal
2009-04-19 15:47 ` Andrea Righi
2009-04-20 21:28 ` Vivek Goyal
2009-04-20 22:05 ` Andrea Righi
2009-04-21 1:08 ` Vivek Goyal
2009-04-21 8:37 ` Andrea Righi
2009-04-21 14:23 ` Vivek Goyal
2009-04-21 18:29 ` Vivek Goyal
2009-04-21 21:36 ` Andrea Righi
2009-04-21 21:28 ` Andrea Righi
2009-04-19 13:54 ` Vivek Goyal
2009-04-14 20:21 ` [PATCH 2/9] res_counter: introduce ratelimiting attributes Andrea Righi
2009-04-14 20:21 ` [PATCH 3/9] bio-cgroup controller Andrea Righi
2009-04-15 2:15 ` KAMEZAWA Hiroyuki
2009-04-15 9:37 ` Andrea Righi
2009-04-15 12:38 ` Ryo Tsuruta
2009-04-15 13:23 ` Andrea Righi
2009-04-15 23:58 ` KAMEZAWA Hiroyuki
2009-04-16 10:42 ` Andrea Righi
2009-04-16 12:00 ` Ryo Tsuruta
2009-04-17 0:04 ` KAMEZAWA Hiroyuki
2009-04-17 9:44 ` Andrea Righi
2009-04-15 13:07 ` Andrea Righi
2009-04-16 22:29 ` Andrew Morton
2009-04-17 0:20 ` KAMEZAWA Hiroyuki
2009-04-17 0:44 ` Andrew Morton
2009-04-17 1:44 ` Ryo Tsuruta
2009-04-17 4:15 ` Andrew Morton
2009-04-17 7:48 ` Ryo Tsuruta
2009-04-17 1:50 ` Balbir Singh
2009-04-17 9:40 ` Andrea Righi
2009-04-17 1:49 ` Takuya Yoshikawa
2009-04-17 2:24 ` KAMEZAWA Hiroyuki
2009-04-17 7:22 ` Ryo Tsuruta
2009-04-17 8:00 ` KAMEZAWA Hiroyuki
2009-04-17 8:48 ` KAMEZAWA Hiroyuki
2009-04-17 8:51 ` KAMEZAWA Hiroyuki
2009-04-17 11:27 ` Block I/O tracking (was Re: [PATCH 3/9] bio-cgroup controller) Fernando Luis Vázquez Cao
2009-04-17 22:09 ` Andrea Righi
2009-04-17 7:32 ` [PATCH 3/9] bio-cgroup controller Ryo Tsuruta
2009-04-17 10:22 ` Balbir Singh [this message]
2009-04-20 11:35 ` Ryo Tsuruta
2009-04-20 14:56 ` Andrea Righi
2009-04-21 11:39 ` Ryo Tsuruta
2009-04-21 15:31 ` Balbir Singh
2009-04-14 20:21 ` [PATCH 4/9] support checking of cgroup subsystem dependencies Andrea Righi
2009-04-14 20:21 ` [PATCH 5/9] io-throttle controller infrastructure Andrea Righi
2009-04-14 20:21 ` [PATCH 6/9] kiothrottled: throttle buffered (writeback) IO Andrea Righi
2009-04-14 20:21 ` [PATCH 7/9] io-throttle instrumentation Andrea Righi
2009-04-14 20:21 ` [PATCH 8/9] export per-task io-throttle statistics to userspace Andrea Righi
2009-04-14 20:21 ` [PATCH 9/9] ext3: do not throttle metadata and journal IO Andrea Righi
2009-04-17 12:38 ` Theodore Tso
2009-04-17 12:50 ` Jens Axboe
2009-04-17 14:39 ` Andrea Righi
2009-04-21 0:18 ` Theodore Tso
2009-04-21 8:30 ` Andrea Righi
2009-04-21 14:06 ` Theodore Tso
2009-04-21 14:31 ` Andrea Righi
2009-04-21 16:35 ` Theodore Tso
2009-04-21 17:23 ` Balbir Singh
2009-04-21 17:46 ` Theodore Tso
2009-04-21 18:14 ` Balbir Singh
2009-04-21 19:14 ` Theodore Tso
2009-04-21 20:49 ` Andrea Righi
2009-04-22 0:33 ` KAMEZAWA Hiroyuki
2009-04-22 1:21 ` KAMEZAWA Hiroyuki
2009-04-22 10:22 ` Andrea Righi
2009-04-23 0:05 ` KAMEZAWA Hiroyuki
2009-04-23 1:22 ` Theodore Tso
2009-04-23 2:54 ` KAMEZAWA Hiroyuki
2009-04-23 4:35 ` Theodore Tso
2009-04-23 4:58 ` Andrew Morton
2009-04-23 5:37 ` KAMEZAWA Hiroyuki
2009-04-23 9:44 ` Andrea Righi
2009-04-23 12:17 ` Theodore Tso
2009-04-23 12:27 ` Theodore Tso
2009-04-23 21:13 ` Andrea Righi
2009-04-24 0:26 ` KAMEZAWA Hiroyuki
2009-04-24 5:14 ` Balbir Singh
2009-04-23 10:03 ` Andrea Righi
2009-04-22 3:30 ` Balbir Singh
2009-04-24 15:10 ` Balbir Singh
2009-04-16 22:24 ` [PATCH 0/9] cgroup: io-throttle controller (v13) Andrew Morton
2009-04-17 9:37 ` Andrea Righi
2009-04-30 13:20 ` Alan D. Brunelle
2009-05-01 11:11 ` Andrea Righi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20090417102214.GC3896@balbir.in.ibm.com \
--to=balbir@linux.vnet.ibm.com \
--cc=agk@sourceware.org \
--cc=akpm@linux-foundation.org \
--cc=axboe@kernel.dk \
--cc=chlunde@ping.uio.no \
--cc=containers@lists.linux-foundation.org \
--cc=dave@linux.vnet.ibm.com \
--cc=dradford@bluehost.com \
--cc=eric.rannaud@gmail.com \
--cc=fernando@oss.ntt.co.jp \
--cc=linux-kernel@vger.kernel.org \
--cc=matt@bluehost.com \
--cc=menage@google.com \
--cc=ngupta@google.com \
--cc=randy.dunlap@oracle.com \
--cc=righi.andrea@gmail.com \
--cc=roberto@unbit.it \
--cc=subrata@linux.vnet.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).