From: Wu Fengguang <fengguang.wu@intel.com>
To: Trond Myklebust <Trond.Myklebust@netapp.com>, linux-nfs@vger.kernel.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
"linux-fsdevel@vger.kernel.org" <linux-fsdevel@vger.kernel.org>,
Andrew Morton <akpm@linux-foundation.org>,
Jan Kara <jack@suse.cz>, Christoph Hellwig <hch@lst.de>,
Dave Chinner <david@fromorbit.com>,
Greg Thelen <gthelen@google.com>,
Minchan Kim <minchan.kim@gmail.com>,
Vivek Goyal <vgoyal@redhat.com>,
Andrea Righi <arighi@develer.com>, linux-mm <linux-mm@kvack.org>,
LKML <linux-kernel@vger.kernel.org>
Subject: [RFC][PATCH 2/2] nfs: scale writeback threshold proportional to dirty threshold
Date: Mon, 10 Oct 2011 21:11:54 +0800 [thread overview]
Message-ID: <20111010131154.GB16847@localhost> (raw)
In-Reply-To: <20111010131051.GA16847@localhost>
nfs_congestion_kb controls the maximum allowed number of writeback and
in-commit pages. It's not reasonable for them to outnumber the dirty and
to-commit pages, so each of them should take no more than 1/4 of the dirty threshold.
Considering that nfs_init_writepagecache() is called on fresh boot,
at which time dirty_thresh is much higher than the real dirty limit will be
after lots of user space memory consumption, use 1/8 instead.
We might update nfs_congestion_kb when the global dirty limit is changed
at runtime, but let's keep it simple first.
CC: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
fs/nfs/write.c | 52 ++++++++++++++++++++++++++++--------------
mm/page-writeback.c | 6 ++++
2 files changed, 41 insertions(+), 17 deletions(-)
--- linux-next.orig/fs/nfs/write.c 2011-10-09 21:36:22.000000000 +0800
+++ linux-next/fs/nfs/write.c 2011-10-10 21:05:07.000000000 +0800
@@ -1775,61 +1775,79 @@ int nfs_migrate_page(struct address_spac
set_page_private(newpage, (unsigned long)req);
ClearPagePrivate(page);
set_page_private(page, 0);
spin_unlock(&mapping->host->i_lock);
page_cache_release(page);
out_unlock:
nfs_clear_page_tag_locked(req);
out:
return ret;
}
#endif
-int __init nfs_init_writepagecache(void)
+void nfs_update_congestion_thresh(void)
{
- nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
- sizeof(struct nfs_write_data),
- 0, SLAB_HWCACHE_ALIGN,
- NULL);
- if (nfs_wdata_cachep == NULL)
- return -ENOMEM;
-
- nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
- nfs_wdata_cachep);
- if (nfs_wdata_mempool == NULL)
- return -ENOMEM;
-
- nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
- nfs_wdata_cachep);
- if (nfs_commit_mempool == NULL)
- return -ENOMEM;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
/*
* NFS congestion size, scale with available memory.
*
* 64MB: 8192k
* 128MB: 11585k
* 256MB: 16384k
* 512MB: 23170k
* 1GB: 32768k
* 2GB: 46340k
* 4GB: 65536k
* 8GB: 92681k
* 16GB: 131072k
*
* This allows larger machines to have larger/more transfers.
* Limit the default to 256M
*/
nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
if (nfs_congestion_kb > 256*1024)
nfs_congestion_kb = 256*1024;
+ /*
+ * Limit to 1/8 of the dirty threshold, so that writeback+in_commit pages
+ * won't outnumber dirty+to_commit pages.
+ */
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ dirty_thresh <<= PAGE_SHIFT - 10;
+
+ if (nfs_congestion_kb > dirty_thresh / 8)
+ nfs_congestion_kb = dirty_thresh / 8;
+}
+
+int __init nfs_init_writepagecache(void)
+{
+ nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
+ sizeof(struct nfs_write_data),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (nfs_wdata_cachep == NULL)
+ return -ENOMEM;
+
+ nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
+ nfs_wdata_cachep);
+ if (nfs_wdata_mempool == NULL)
+ return -ENOMEM;
+
+ nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
+ nfs_wdata_cachep);
+ if (nfs_commit_mempool == NULL)
+ return -ENOMEM;
+
+ nfs_update_congestion_thresh();
+
return 0;
}
void nfs_destroy_writepagecache(void)
{
mempool_destroy(nfs_commit_mempool);
mempool_destroy(nfs_wdata_mempool);
kmem_cache_destroy(nfs_wdata_cachep);
}
--- linux-next.orig/mm/page-writeback.c 2011-10-09 21:36:06.000000000 +0800
+++ linux-next/mm/page-writeback.c 2011-10-10 21:05:07.000000000 +0800
@@ -138,34 +138,39 @@ static struct prop_descriptor vm_dirties
static int calc_period_shift(void)
{
unsigned long dirty_total;
if (vm_dirty_bytes)
dirty_total = vm_dirty_bytes / PAGE_SIZE;
else
dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
100;
return 2 + ilog2(dirty_total - 1);
}
+void __weak nfs_update_congestion_thresh(void)
+{
+}
+
/*
* update the period when the dirty threshold changes.
*/
static void update_completion_period(void)
{
int shift = calc_period_shift();
prop_change_shift(&vm_completions, shift);
prop_change_shift(&vm_dirties, shift);
writeback_set_ratelimit();
+ nfs_update_congestion_thresh();
}
int dirty_background_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
dirty_background_bytes = 0;
return ret;
@@ -438,24 +443,25 @@ unsigned long bdi_dirty_limit(struct bac
bdi_writeout_fraction(bdi, &numerator, &denominator);
bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
bdi_dirty *= numerator;
do_div(bdi_dirty, denominator);
bdi_dirty += (dirty * bdi->min_ratio) / 100;
if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
bdi_dirty = dirty * bdi->max_ratio / 100;
return bdi_dirty;
}
+EXPORT_SYMBOL_GPL(global_dirty_limits);
/*
* Dirty position control.
*
* (o) global/bdi setpoints
*
* We want the dirty pages be balanced around the global/bdi setpoints.
* When the number of dirty pages is higher/lower than the setpoint, the
* dirty position control ratio (and hence task dirty ratelimit) will be
* decreased/increased to bring the dirty pages back to the setpoint.
*
* pos_ratio = 1 << RATELIMIT_CALC_SHIFT
WARNING: multiple messages have this Message-ID (diff)
From: Wu Fengguang <fengguang.wu@intel.com>
To: Trond Myklebust <Trond.Myklebust@netapp.com>, linux-nfs@vger.kernel.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
"linux-fsdevel@vger.kernel.org" <linux-fsdevel@vger.kernel.org>,
Andrew Morton <akpm@linux-foundation.org>,
Jan Kara <jack@suse.cz>, Christoph Hellwig <hch@lst.de>,
Dave Chinner <david@fromorbit.com>,
Greg Thelen <gthelen@google.com>,
Minchan Kim <minchan.kim@gmail.com>,
Vivek Goyal <vgoyal@redhat.com>,
Andrea Righi <arighi@develer.com>, linux-mm <linux-mm@kvack.org>,
LKML <linux-kernel@vger.kernel.org>
Subject: [RFC][PATCH 2/2] nfs: scale writeback threshold proportional to dirty threshold
Date: Mon, 10 Oct 2011 21:11:54 +0800 [thread overview]
Message-ID: <20111010131154.GB16847@localhost> (raw)
In-Reply-To: <20111010131051.GA16847@localhost>
nfs_congestion_kb controls the maximum allowed number of writeback and
in-commit pages. It's not reasonable for them to outnumber the dirty and
to-commit pages, so each of them should take no more than 1/4 of the dirty threshold.
Considering that nfs_init_writepagecache() is called on fresh boot,
at which time dirty_thresh is much higher than the real dirty limit will be
after lots of user space memory consumption, use 1/8 instead.
We might update nfs_congestion_kb when the global dirty limit is changed
at runtime, but let's keep it simple first.
CC: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
fs/nfs/write.c | 52 ++++++++++++++++++++++++++++--------------
mm/page-writeback.c | 6 ++++
2 files changed, 41 insertions(+), 17 deletions(-)
--- linux-next.orig/fs/nfs/write.c 2011-10-09 21:36:22.000000000 +0800
+++ linux-next/fs/nfs/write.c 2011-10-10 21:05:07.000000000 +0800
@@ -1775,61 +1775,79 @@ int nfs_migrate_page(struct address_spac
set_page_private(newpage, (unsigned long)req);
ClearPagePrivate(page);
set_page_private(page, 0);
spin_unlock(&mapping->host->i_lock);
page_cache_release(page);
out_unlock:
nfs_clear_page_tag_locked(req);
out:
return ret;
}
#endif
-int __init nfs_init_writepagecache(void)
+void nfs_update_congestion_thresh(void)
{
- nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
- sizeof(struct nfs_write_data),
- 0, SLAB_HWCACHE_ALIGN,
- NULL);
- if (nfs_wdata_cachep == NULL)
- return -ENOMEM;
-
- nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
- nfs_wdata_cachep);
- if (nfs_wdata_mempool == NULL)
- return -ENOMEM;
-
- nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
- nfs_wdata_cachep);
- if (nfs_commit_mempool == NULL)
- return -ENOMEM;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
/*
* NFS congestion size, scale with available memory.
*
* 64MB: 8192k
* 128MB: 11585k
* 256MB: 16384k
* 512MB: 23170k
* 1GB: 32768k
* 2GB: 46340k
* 4GB: 65536k
* 8GB: 92681k
* 16GB: 131072k
*
* This allows larger machines to have larger/more transfers.
* Limit the default to 256M
*/
nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
if (nfs_congestion_kb > 256*1024)
nfs_congestion_kb = 256*1024;
+ /*
+ * Limit to 1/8 of the dirty threshold, so that writeback+in_commit pages
+ * won't outnumber dirty+to_commit pages.
+ */
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ dirty_thresh <<= PAGE_SHIFT - 10;
+
+ if (nfs_congestion_kb > dirty_thresh / 8)
+ nfs_congestion_kb = dirty_thresh / 8;
+}
+
+int __init nfs_init_writepagecache(void)
+{
+ nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
+ sizeof(struct nfs_write_data),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (nfs_wdata_cachep == NULL)
+ return -ENOMEM;
+
+ nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
+ nfs_wdata_cachep);
+ if (nfs_wdata_mempool == NULL)
+ return -ENOMEM;
+
+ nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
+ nfs_wdata_cachep);
+ if (nfs_commit_mempool == NULL)
+ return -ENOMEM;
+
+ nfs_update_congestion_thresh();
+
return 0;
}
void nfs_destroy_writepagecache(void)
{
mempool_destroy(nfs_commit_mempool);
mempool_destroy(nfs_wdata_mempool);
kmem_cache_destroy(nfs_wdata_cachep);
}
--- linux-next.orig/mm/page-writeback.c 2011-10-09 21:36:06.000000000 +0800
+++ linux-next/mm/page-writeback.c 2011-10-10 21:05:07.000000000 +0800
@@ -138,34 +138,39 @@ static struct prop_descriptor vm_dirties
static int calc_period_shift(void)
{
unsigned long dirty_total;
if (vm_dirty_bytes)
dirty_total = vm_dirty_bytes / PAGE_SIZE;
else
dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
100;
return 2 + ilog2(dirty_total - 1);
}
+void __weak nfs_update_congestion_thresh(void)
+{
+}
+
/*
* update the period when the dirty threshold changes.
*/
static void update_completion_period(void)
{
int shift = calc_period_shift();
prop_change_shift(&vm_completions, shift);
prop_change_shift(&vm_dirties, shift);
writeback_set_ratelimit();
+ nfs_update_congestion_thresh();
}
int dirty_background_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
dirty_background_bytes = 0;
return ret;
@@ -438,24 +443,25 @@ unsigned long bdi_dirty_limit(struct bac
bdi_writeout_fraction(bdi, &numerator, &denominator);
bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
bdi_dirty *= numerator;
do_div(bdi_dirty, denominator);
bdi_dirty += (dirty * bdi->min_ratio) / 100;
if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
bdi_dirty = dirty * bdi->max_ratio / 100;
return bdi_dirty;
}
+EXPORT_SYMBOL_GPL(global_dirty_limits);
/*
* Dirty position control.
*
* (o) global/bdi setpoints
*
* We want the dirty pages be balanced around the global/bdi setpoints.
* When the number of dirty pages is higher/lower than the setpoint, the
* dirty position control ratio (and hence task dirty ratelimit) will be
* decreased/increased to bring the dirty pages back to the setpoint.
*
* pos_ratio = 1 << RATELIMIT_CALC_SHIFT
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2011-10-10 13:12 UTC|newest]
Thread overview: 72+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-10-03 13:42 [PATCH 00/11] IO-less dirty throttling v12 Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` [PATCH 01/11] writeback: account per-bdi accumulated dirtied pages Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` [PATCH 02/11] writeback: dirty position control Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` [PATCH 03/11] writeback: add bg_threshold parameter to __bdi_update_bandwidth() Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` [PATCH 04/11] writeback: dirty rate control Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` [PATCH 05/11] writeback: stabilize bdi->dirty_ratelimit Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` [PATCH 06/11] writeback: per task dirty rate limit Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` [PATCH 07/11] writeback: IO-less balance_dirty_pages() Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` [PATCH 08/11] writeback: limit max dirty pause time Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` [PATCH 09/11] writeback: control " Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` [PATCH 10/11] writeback: dirty position control - bdi reserve area Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` [PATCH 11/11] writeback: per-bdi background threshold Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:42 ` Wu Fengguang
2011-10-03 13:59 ` [PATCH 00/11] IO-less dirty throttling v12 Wu Fengguang
2011-10-03 13:59 ` Wu Fengguang
2011-10-05 1:42 ` Wu Fengguang
2011-10-05 1:42 ` Wu Fengguang
2011-10-04 19:52 ` Vivek Goyal
2011-10-04 19:52 ` Vivek Goyal
2011-10-05 13:56 ` Wu Fengguang
2011-10-05 13:56 ` Wu Fengguang
2011-10-05 15:16 ` Andi Kleen
2011-10-05 15:16 ` Andi Kleen
2011-10-10 12:14 ` Peter Zijlstra
2011-10-10 12:14 ` Peter Zijlstra
2011-10-10 13:07 ` Wu Fengguang
2011-10-10 13:07 ` Wu Fengguang
2011-10-10 13:10 ` [RFC][PATCH 1/2] nfs: writeback pages wait queue Wu Fengguang
2011-10-10 13:10 ` Wu Fengguang
2011-10-10 13:11 ` Wu Fengguang [this message]
2011-10-10 13:11 ` [RFC][PATCH 2/2] nfs: scale writeback threshold proportional to dirty threshold Wu Fengguang
2011-10-18 8:53 ` Wu Fengguang
2011-10-18 8:53 ` Wu Fengguang
2011-10-18 8:53 ` Wu Fengguang
2011-10-18 8:59 ` Wu Fengguang
2011-10-18 8:59 ` Wu Fengguang
2011-10-18 8:59 ` Wu Fengguang
2011-10-20 2:49 ` Wu Fengguang
2011-10-20 2:49 ` Wu Fengguang
2011-10-18 8:51 ` [RFC][PATCH 1/2] nfs: writeback pages wait queue Wu Fengguang
2011-10-18 8:51 ` Wu Fengguang
2011-10-20 3:59 ` Wu Fengguang
2011-10-20 3:59 ` Wu Fengguang
2011-10-10 14:28 ` [PATCH 00/11] IO-less dirty throttling v12 Wu Fengguang
2011-10-10 14:28 ` Wu Fengguang
2011-10-17 3:03 ` Wu Fengguang
2011-10-17 3:03 ` Wu Fengguang
2011-10-20 3:39 ` Wu Fengguang
2011-10-20 3:39 ` Wu Fengguang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20111010131154.GB16847@localhost \
--to=fengguang.wu@intel.com \
--cc=Trond.Myklebust@netapp.com \
--cc=a.p.zijlstra@chello.nl \
--cc=akpm@linux-foundation.org \
--cc=arighi@develer.com \
--cc=david@fromorbit.com \
--cc=gthelen@google.com \
--cc=hch@lst.de \
--cc=jack@suse.cz \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-nfs@vger.kernel.org \
--cc=minchan.kim@gmail.com \
--cc=vgoyal@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.