From: Andrew Morton <akpm@linux-foundation.org>
To: Andrea Righi <righi.andrea@gmail.com>
Cc: balbir@linux.vnet.ibm.com, menage@google.com,
chlunde@ping.uio.no, axboe@kernel.dk, matt@bluehost.com,
roberto@unbit.it, randy.dunlap@oracle.com, dpshah@google.com,
containers@lists.linux-foundation.org,
linux-kernel@vger.kernel.org, righi.andrea@gmail.com
Subject: Re: [PATCH 3/3] i/o accounting and control
Date: Wed, 25 Jun 2008 17:41:08 -0700 [thread overview]
Message-ID: <20080625174108.66a34327.akpm@linux-foundation.org> (raw)
In-Reply-To: <1213956335-29866-4-git-send-email-righi.andrea@gmail.com>
On Fri, 20 Jun 2008 12:05:35 +0200
Andrea Righi <righi.andrea@gmail.com> wrote:
> Apply the io-throttle controller to the opportune kernel functions. Both
> accounting and throttling functionalities are performed by
> cgroup_io_throttle().
>
> Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
> ---
> block/blk-core.c | 2 ++
> fs/buffer.c | 10 ++++++++++
> fs/direct-io.c | 3 +++
> mm/page-writeback.c | 9 +++++++++
> mm/readahead.c | 5 +++++
> 5 files changed, 29 insertions(+), 0 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 1905aab..8eddef5 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -26,6 +26,7 @@
> #include <linux/swap.h>
> #include <linux/writeback.h>
> #include <linux/task_io_accounting_ops.h>
> +#include <linux/blk-io-throttle.h>
> #include <linux/interrupt.h>
> #include <linux/cpu.h>
> #include <linux/blktrace_api.h>
> @@ -1482,6 +1483,7 @@ void submit_bio(int rw, struct bio *bio)
> count_vm_events(PGPGOUT, count);
> } else {
> task_io_account_read(bio->bi_size);
> + cgroup_io_throttle(bio->bi_bdev, bio->bi_size);
We do this on every submit_bio(READ)? Ow.
I guess it's not _hugely_ expensive, but the lengthy pointer hop in
task_subsys_state() is going to cost.
All this could be made cheaper if we were to reduce the sampling rate:
only call cgroup_io_throttle() on each megabyte of IO (for example).
current->amount_of_io += bio->bi_size;
if (current->amount_of_io > 1024*1024) {
cgroup_io_throttle(bio->bi_bdev, bio->bi_size);
current->amount_of_io -= 1024 * 1024;
}
but perhaps that has the potential to fail to throttle correctly when
accessing multiple devices, dunno.
Bear in mind that tasks such as pdflush and kswapd will do reads when
performing writeout (indirect blocks).
> count_vm_events(PGPGIN, count);
> }
>
> diff --git a/fs/buffer.c b/fs/buffer.c
> index a073f3f..c63dfe7 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -35,6 +35,7 @@
> #include <linux/suspend.h>
> #include <linux/buffer_head.h>
> #include <linux/task_io_accounting_ops.h>
> +#include <linux/blk-io-throttle.h>
> #include <linux/bio.h>
> #include <linux/notifier.h>
> #include <linux/cpu.h>
> @@ -700,6 +701,9 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
> static int __set_page_dirty(struct page *page,
> struct address_space *mapping, int warn)
> {
> + struct block_device *bdev = NULL;
> + size_t cgroup_io_acct = 0;
> +
> if (unlikely(!mapping))
> return !TestSetPageDirty(page);
>
> @@ -711,16 +715,22 @@ static int __set_page_dirty(struct page *page,
> WARN_ON_ONCE(warn && !PageUptodate(page));
>
> if (mapping_cap_account_dirty(mapping)) {
> + bdev = (mapping->host &&
> + mapping->host->i_sb->s_bdev) ?
> + mapping->host->i_sb->s_bdev : NULL;
> +
> __inc_zone_page_state(page, NR_FILE_DIRTY);
> __inc_bdi_stat(mapping->backing_dev_info,
> BDI_RECLAIMABLE);
> task_io_account_write(PAGE_CACHE_SIZE);
> + cgroup_io_acct = PAGE_CACHE_SIZE;
> }
> radix_tree_tag_set(&mapping->page_tree,
> page_index(page), PAGECACHE_TAG_DIRTY);
> }
> write_unlock_irq(&mapping->tree_lock);
> __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> + cgroup_io_throttle(bdev, cgroup_io_acct);
Ah, and here we see the reason for the __cant_sleep() hack.
It doesn't work, sorry. in_atomic() will return false when called
under spinlock when CONFIG_PREEMPT=n. Your code will call
schedule_timeout_killable() under spinlock and is deadlockable.
We need to think of something smarter here. This problem has already
been largely solved at the balance_dirty_pages() level. Perhaps that
is where the io-throttling should be cutting in?
> return 1;
> }
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 9e81add..fe991ac 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -35,6 +35,7 @@
> #include <linux/buffer_head.h>
> #include <linux/rwsem.h>
> #include <linux/uio.h>
> +#include <linux/blk-io-throttle.h>
> #include <asm/atomic.h>
>
> /*
> @@ -666,6 +667,8 @@ submit_page_section(struct dio *dio, struct page *page,
> /*
> * Read accounting is performed in submit_bio()
> */
> + struct block_device *bdev = dio->bio ? dio->bio->bi_bdev : NULL;
> + cgroup_io_throttle(bdev, len);
> task_io_account_write(len);
> }
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 789b6ad..a2b820d 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -23,6 +23,7 @@
> #include <linux/init.h>
> #include <linux/backing-dev.h>
> #include <linux/task_io_accounting_ops.h>
> +#include <linux/blk-io-throttle.h>
> #include <linux/blkdev.h>
> #include <linux/mpage.h>
> #include <linux/rmap.h>
> @@ -1077,6 +1078,8 @@ int __set_page_dirty_nobuffers(struct page *page)
> if (!TestSetPageDirty(page)) {
> struct address_space *mapping = page_mapping(page);
> struct address_space *mapping2;
> + struct block_device *bdev = NULL;
> + size_t cgroup_io_acct = 0;
>
> if (!mapping)
> return 1;
> @@ -1087,10 +1090,15 @@ int __set_page_dirty_nobuffers(struct page *page)
> BUG_ON(mapping2 != mapping);
> WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
> if (mapping_cap_account_dirty(mapping)) {
> + bdev = (mapping->host &&
> + mapping->host->i_sb->s_bdev) ?
> + mapping->host->i_sb->s_bdev : NULL;
> +
> __inc_zone_page_state(page, NR_FILE_DIRTY);
> __inc_bdi_stat(mapping->backing_dev_info,
> BDI_RECLAIMABLE);
> task_io_account_write(PAGE_CACHE_SIZE);
> + cgroup_io_acct = PAGE_CACHE_SIZE;
> }
> radix_tree_tag_set(&mapping->page_tree,
> page_index(page), PAGECACHE_TAG_DIRTY);
> @@ -1100,6 +1108,7 @@ int __set_page_dirty_nobuffers(struct page *page)
> /* !PageAnon && !swapper_space */
> __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> }
> + cgroup_io_throttle(bdev, cgroup_io_acct);
> return 1;
> }
> return 0;
> diff --git a/mm/readahead.c b/mm/readahead.c
> index d8723a5..dff6b02 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -14,6 +14,7 @@
> #include <linux/blkdev.h>
> #include <linux/backing-dev.h>
> #include <linux/task_io_accounting_ops.h>
> +#include <linux/blk-io-throttle.h>
> #include <linux/pagevec.h>
> #include <linux/pagemap.h>
>
> @@ -58,6 +59,9 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
> int (*filler)(void *, struct page *), void *data)
> {
> struct page *page;
> + struct block_device *bdev =
> + (mapping->host && mapping->host->i_sb->s_bdev) ?
> + mapping->host->i_sb->s_bdev : NULL;
> int ret = 0;
>
> while (!list_empty(pages)) {
> @@ -76,6 +80,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
> break;
> }
> task_io_account_read(PAGE_CACHE_SIZE);
> + cgroup_io_throttle(bdev, PAGE_CACHE_SIZE);
> }
> return ret;
> }
next prev parent reply other threads:[~2008-06-26 0:41 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-06-20 10:05 [PATCH 3/3] i/o accounting and control Andrea Righi
2008-06-26 0:41 ` Andrew Morton [this message]
2008-06-26 22:37 ` Andrea Righi
2008-06-26 23:15 ` Andrew Morton
-- strict thread matches above, loose matches on Subject: below --
2008-07-04 13:58 Andrea Righi
2008-06-06 22:27 Andrea Righi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080625174108.66a34327.akpm@linux-foundation.org \
--to=akpm@linux-foundation.org \
--cc=axboe@kernel.dk \
--cc=balbir@linux.vnet.ibm.com \
--cc=chlunde@ping.uio.no \
--cc=containers@lists.linux-foundation.org \
--cc=dpshah@google.com \
--cc=linux-kernel@vger.kernel.org \
--cc=matt@bluehost.com \
--cc=menage@google.com \
--cc=randy.dunlap@oracle.com \
--cc=righi.andrea@gmail.com \
--cc=roberto@unbit.it \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox