From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755521AbYFZAl7 (ORCPT ); Wed, 25 Jun 2008 20:41:59 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1753468AbYFZAlt (ORCPT ); Wed, 25 Jun 2008 20:41:49 -0400 Received: from smtp1.linux-foundation.org ([140.211.169.13]:36244 "EHLO smtp1.linux-foundation.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751768AbYFZAls (ORCPT ); Wed, 25 Jun 2008 20:41:48 -0400 Date: Wed, 25 Jun 2008 17:41:08 -0700 From: Andrew Morton To: Andrea Righi Cc: balbir@linux.vnet.ibm.com, menage@google.com, chlunde@ping.uio.no, axboe@kernel.dk, matt@bluehost.com, roberto@unbit.it, randy.dunlap@oracle.com, dpshah@google.com, containers@lists.linux-foundation.org, linux-kernel@vger.kernel.org, righi.andrea@gmail.com Subject: Re: [PATCH 3/3] i/o accounting and control Message-Id: <20080625174108.66a34327.akpm@linux-foundation.org> In-Reply-To: <1213956335-29866-4-git-send-email-righi.andrea@gmail.com> References: <1213956335-29866-4-git-send-email-righi.andrea@gmail.com> X-Mailer: Sylpheed version 2.2.4 (GTK+ 2.8.20; i486-pc-linux-gnu) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Fri, 20 Jun 2008 12:05:35 +0200 Andrea Righi wrote: > Apply the io-throttle controller to the opportune kernel functions. Both > accounting and throttling functionalities are performed by > cgroup_io_throttle(). > > Signed-off-by: Andrea Righi > --- > block/blk-core.c | 2 ++ > fs/buffer.c | 10 ++++++++++ > fs/direct-io.c | 3 +++ > mm/page-writeback.c | 9 +++++++++ > mm/readahead.c | 5 +++++ > 5 files changed, 29 insertions(+), 0 deletions(-) > > diff --git a/block/blk-core.c b/block/blk-core.c > index 1905aab..8eddef5 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -26,6 +26,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -1482,6 +1483,7 @@ void submit_bio(int rw, struct bio *bio) > count_vm_events(PGPGOUT, count); > } else { > task_io_account_read(bio->bi_size); > + cgroup_io_throttle(bio->bi_bdev, bio->bi_size); We do this on every submit_bio(READ)? Ow. I guess it's not _hugely_ expensive, but the lengthy pointer hop in task_subsys_state() is going to cost. All this could be made cheaper if we were to reduce the sampling rate: only call cgroup_io_throttle() on each megabyte of IO (for example). current->amount_of_io += bio->bi_size; if (current->amount_of_io > 1024*1024) { cgroup_io_throttle(bio->bi_bdev, bio->bi_size); current->amount_of_io -= 1024 * 1024; } but perhaps that has the potential to fail to throttle correctly when accessing multiple devices, dunno. Bear in mind that tasks such as pdflush and kswapd will do reads when performing writeout (indirect blocks). > count_vm_events(PGPGIN, count); > } > > diff --git a/fs/buffer.c b/fs/buffer.c > index a073f3f..c63dfe7 100644 > --- a/fs/buffer.c > +++ b/fs/buffer.c > @@ -35,6 +35,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -700,6 +701,9 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); > static int __set_page_dirty(struct page *page, > struct address_space *mapping, int warn) > { > + struct block_device *bdev = NULL; > + size_t cgroup_io_acct = 0; > + > if (unlikely(!mapping)) > return !TestSetPageDirty(page); > > @@ -711,16 +715,22 @@ static int __set_page_dirty(struct page *page, > WARN_ON_ONCE(warn && !PageUptodate(page)); > > if (mapping_cap_account_dirty(mapping)) { > + bdev = (mapping->host && > + mapping->host->i_sb->s_bdev) ? > + mapping->host->i_sb->s_bdev : NULL; > + > __inc_zone_page_state(page, NR_FILE_DIRTY); > __inc_bdi_stat(mapping->backing_dev_info, > BDI_RECLAIMABLE); > task_io_account_write(PAGE_CACHE_SIZE); > + cgroup_io_acct = PAGE_CACHE_SIZE; > } > radix_tree_tag_set(&mapping->page_tree, > page_index(page), PAGECACHE_TAG_DIRTY); > } > write_unlock_irq(&mapping->tree_lock); > __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); > + cgroup_io_throttle(bdev, cgroup_io_acct); Ah, and here we see the reason for the __cant_sleep() hack. It doesn't work, sorry. in_atomic() will return false when called under spinlock when CONFIG_PREEMPT=n. Your code will call schedule_timeout_killable() under spinlock and is deadlockable. We need to think of something smarter here. This problem has already been largely solved at the balance_dirty_pages() level. Perhaps that is where the io-throttling should be cutting in? > return 1; > } > diff --git a/fs/direct-io.c b/fs/direct-io.c > index 9e81add..fe991ac 100644 > --- a/fs/direct-io.c > +++ b/fs/direct-io.c > @@ -35,6 +35,7 @@ > #include > #include > #include > +#include > #include > > /* > @@ -666,6 +667,8 @@ submit_page_section(struct dio *dio, struct page *page, > /* > * Read accounting is performed in submit_bio() > */ > + struct block_device *bdev = dio->bio ? dio->bio->bi_bdev : NULL; > + cgroup_io_throttle(bdev, len); > task_io_account_write(len); > } > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > index 789b6ad..a2b820d 100644 > --- a/mm/page-writeback.c > +++ b/mm/page-writeback.c > @@ -23,6 +23,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -1077,6 +1078,8 @@ int __set_page_dirty_nobuffers(struct page *page) > if (!TestSetPageDirty(page)) { > struct address_space *mapping = page_mapping(page); > struct address_space *mapping2; > + struct block_device *bdev = NULL; > + size_t cgroup_io_acct = 0; > > if (!mapping) > return 1; > @@ -1087,10 +1090,15 @@ int __set_page_dirty_nobuffers(struct page *page) > BUG_ON(mapping2 != mapping); > WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); > if (mapping_cap_account_dirty(mapping)) { > + bdev = (mapping->host && > + mapping->host->i_sb->s_bdev) ? > + mapping->host->i_sb->s_bdev : NULL; > + > __inc_zone_page_state(page, NR_FILE_DIRTY); > __inc_bdi_stat(mapping->backing_dev_info, > BDI_RECLAIMABLE); > task_io_account_write(PAGE_CACHE_SIZE); > + cgroup_io_acct = PAGE_CACHE_SIZE; > } > radix_tree_tag_set(&mapping->page_tree, > page_index(page), PAGECACHE_TAG_DIRTY); > @@ -1100,6 +1108,7 @@ int __set_page_dirty_nobuffers(struct page *page) > /* !PageAnon && !swapper_space */ > __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); > } > + cgroup_io_throttle(bdev, cgroup_io_acct); > return 1; > } > return 0; > diff --git a/mm/readahead.c b/mm/readahead.c > index d8723a5..dff6b02 100644 > --- a/mm/readahead.c > +++ b/mm/readahead.c > @@ -14,6 +14,7 @@ > #include > #include > #include > +#include > #include > #include > > @@ -58,6 +59,9 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, > int (*filler)(void *, struct page *), void *data) > { > struct page *page; > + struct block_device *bdev = > + (mapping->host && mapping->host->i_sb->s_bdev) ? > + mapping->host->i_sb->s_bdev : NULL; > int ret = 0; > > while (!list_empty(pages)) { > @@ -76,6 +80,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, > break; > } > task_io_account_read(PAGE_CACHE_SIZE); > + cgroup_io_throttle(bdev, PAGE_CACHE_SIZE); > } > return ret; > }