* [PATCH] improve the performance of large sequential write NFS workloads
@ 2009-12-17 2:03 Steve Rago
2009-12-17 8:17 ` Peter Zijlstra
0 siblings, 1 reply; 96+ messages in thread
From: Steve Rago @ 2009-12-17 2:03 UTC (permalink / raw)
To: linux-nfs; +Cc: linux-kernel, Trond.Myklebust
Eager Writeback for NFS Clients
-------------------------------
Prevent applications that write large sequential streams of data (like backup, for example)
from entering a memory-pressure state, which degrades performance by falling back to
synchronous operations (both synchronous writes and additional commits). This is accomplished
by preventing the client application from dirtying pages faster than they can be written to
the server: clients write pages eagerly instead of lazily.
Eager writeback is controlled by a sysctl, fs.nfs.nfs_max_woutstanding: setting it to 0
disables the feature; otherwise it holds the maximum number of outstanding NFS writes that
can be in flight for a given file. When that limit is reached, the application is blocked
from dirtying more pages until the writes complete.
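For example, assuming the sysctl registration from the patch below (the default is 16),
the feature can be tuned or disabled at run time:

    sysctl -w fs.nfs.nfs_max_woutstanding=0     # disable eager writeback
    sysctl -w fs.nfs.nfs_max_woutstanding=32    # allow up to 32 writes in flight per file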
This patch is based heavily (okay, almost entirely) on a prior patch by Peter Staubach. For
the original patch, see http://article.gmane.org/gmane.linux.nfs/24323.
The patch below applies to linux-2.6.32-rc7, but it should apply cleanly to vanilla linux-2.6.32.
Performance data and tuning notes can be found on my web site (http://www.nec-labs.com/~sar).
With iozone, I see about 50% improvement for large sequential write workloads over a 1Gb Ethernet.
With an in-house micro-benchmark, I see 80% improvement for large, single-stream, sequential
workloads (where "large" is defined to be greater than the memory size on the client).
Signed-off-by: Steve Rago <sar@nec-labs.com>
---
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/fs/fs-writeback.c linux-2.6.32-rc7/fs/fs-writeback.c
--- linux-2.6.32-rc7-orig/fs/fs-writeback.c 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/fs/fs-writeback.c 2009-11-30 15:36:30.735453450 -0500
@@ -771,6 +771,8 @@ static long wb_writeback(struct bdi_writ
wbc.range_start = 0;
wbc.range_end = LLONG_MAX;
}
+ if (args->for_background || wbc.for_kupdate)
+ wbc.nonblocking = 1;
for (;;) {
/*
@@ -859,6 +861,8 @@ static long wb_check_old_data_flush(stru
unsigned long expired;
long nr_pages;
+ if (dirty_writeback_interval == 0)
+ return 0;
expired = wb->last_old_flush +
msecs_to_jiffies(dirty_writeback_interval * 10);
if (time_before(jiffies, expired))
@@ -954,7 +958,11 @@ int bdi_writeback_task(struct bdi_writeb
break;
}
- wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+ if (dirty_writeback_interval == 0)
+ wait_jiffies = msecs_to_jiffies(5000); /* default */
+ else
+ wait_jiffies =
+ msecs_to_jiffies(dirty_writeback_interval * 10);
schedule_timeout_interruptible(wait_jiffies);
try_to_freeze();
}
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/fs/nfs/file.c linux-2.6.32-rc7/fs/nfs/file.c
--- linux-2.6.32-rc7-orig/fs/nfs/file.c 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/fs/nfs/file.c 2009-11-30 15:21:22.635101295 -0500
@@ -589,11 +589,17 @@ static int nfs_need_sync_write(struct fi
return 0;
}
+static int nfs_is_seqwrite(struct inode *inode, loff_t pos)
+{
+ return NFS_I(inode)->wrpos == pos;
+}
+
static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct dentry * dentry = iocb->ki_filp->f_path.dentry;
struct inode * inode = dentry->d_inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
ssize_t result;
size_t count = iov_length(iov, nr_segs);
@@ -607,6 +613,12 @@ static ssize_t nfs_file_write(struct kio
result = -EBUSY;
if (IS_SWAPFILE(inode))
goto out_swapfile;
+
+ result = count;
+ if (!count)
+ goto out;
+ nfs_wait_woutstanding(inode);
+
/*
* O_APPEND implies that we must revalidate the file length.
*/
@@ -623,10 +635,21 @@ static ssize_t nfs_file_write(struct kio
nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
result = generic_file_aio_write(iocb, iov, nr_segs, pos);
/* Return error values for O_SYNC and IS_SYNC() */
- if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
- int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
- if (err < 0)
- result = err;
+ if (result >= 0) {
+ if (nfs_need_sync_write(iocb->ki_filp, inode)) {
+ int err;
+
+ err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
+ inode);
+ if (err < 0)
+ result = err;
+ } else if (nfs_max_woutstanding != 0 &&
+ nfs_is_seqwrite(inode, pos) &&
+ atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages) {
+ nfs_wb_eager(inode);
+ }
+ if (result > 0)
+ nfsi->wrpos = pos + result;
}
out:
return result;
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/fs/nfs/inode.c linux-2.6.32-rc7/fs/nfs/inode.c
--- linux-2.6.32-rc7-orig/fs/nfs/inode.c 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/fs/nfs/inode.c 2009-11-13 11:36:43.888410914 -0500
@@ -508,7 +508,9 @@ void nfs_setattr_update_inode(struct ino
int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
- int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int need_atime = nfsi->cache_validity & NFS_INO_INVALID_ATIME;
+ int woutstanding = nfs_max_woutstanding;
int err;
/*
@@ -519,9 +521,8 @@ int nfs_getattr(struct vfsmount *mnt, st
* nfs_wb_nocommit.
*/
if (S_ISREG(inode->i_mode)) {
- mutex_lock(&inode->i_mutex);
+ atomic_add(woutstanding, &nfsi->writes);
nfs_wb_nocommit(inode);
- mutex_unlock(&inode->i_mutex);
}
/*
@@ -545,6 +546,11 @@ int nfs_getattr(struct vfsmount *mnt, st
generic_fillattr(inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
}
+
+ if (S_ISREG(inode->i_mode)) {
+ atomic_sub(woutstanding, &nfsi->writes);
+ wake_up(&nfsi->writes_wq);
+ }
return err;
}
@@ -1418,9 +1424,13 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
nfsi->npages = 0;
+ atomic_set(&nfsi->ndirty, 0);
atomic_set(&nfsi->silly_count, 1);
INIT_HLIST_HEAD(&nfsi->silly_list);
init_waitqueue_head(&nfsi->waitqueue);
+ atomic_set(&nfsi->writes, 0);
+ init_waitqueue_head(&nfsi->writes_wq);
+ nfsi->wrpos = 0;
nfs4_init_once(nfsi);
}
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/fs/nfs/sysctl.c linux-2.6.32-rc7/fs/nfs/sysctl.c
--- linux-2.6.32-rc7-orig/fs/nfs/sysctl.c 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/fs/nfs/sysctl.c 2009-11-13 11:36:43.895459044 -0500
@@ -58,6 +58,14 @@ static ctl_table nfs_cb_sysctls[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nfs_max_woutstanding",
+ .data = &nfs_max_woutstanding,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/fs/nfs/write.c linux-2.6.32-rc7/fs/nfs/write.c
--- linux-2.6.32-rc7-orig/fs/nfs/write.c 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/fs/nfs/write.c 2009-12-08 13:26:35.416629518 -0500
@@ -176,6 +176,8 @@ static void nfs_mark_uptodate(struct pag
static int wb_priority(struct writeback_control *wbc)
{
+ if (nfs_max_woutstanding != 0)
+ return 0;
if (wbc->for_reclaim)
return FLUSH_HIGHPRI | FLUSH_STABLE;
if (wbc->for_kupdate)
@@ -200,7 +202,9 @@ static int nfs_set_page_writeback(struct
if (!ret) {
struct inode *inode = page->mapping->host;
struct nfs_server *nfss = NFS_SERVER(inode);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ atomic_dec(&nfsi->ndirty);
if (atomic_long_inc_return(&nfss->writeback) >
NFS_CONGESTION_ON_THRESH) {
set_bdi_congested(&nfss->backing_dev_info,
@@ -325,6 +329,39 @@ static int nfs_writepages_callback(struc
return ret;
}
+int nfs_max_woutstanding = 16;
+
+static void nfs_inc_woutstanding(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ atomic_inc(&nfsi->writes);
+}
+
+static void nfs_dec_woutstanding(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ if (atomic_dec_return(&nfsi->writes) < nfs_max_woutstanding)
+ wake_up(&nfsi->writes_wq);
+}
+
+void nfs_wait_woutstanding(struct inode *inode)
+{
+ if (nfs_max_woutstanding != 0) {
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ long npages;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+ npages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) +
+ global_page_state(NR_WRITEBACK);
+ if (npages >= background_thresh)
+ wait_event(nfsi->writes_wq,
+ atomic_read(&nfsi->writes) < nfs_max_woutstanding);
+ }
+}
+
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
@@ -420,6 +457,9 @@ static void nfs_inode_remove_request(str
static void
nfs_mark_request_dirty(struct nfs_page *req)
{
+ struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ atomic_inc(&nfsi->ndirty);
__set_page_dirty_nobuffers(req->wb_page);
}
@@ -682,16 +722,18 @@ static struct nfs_page * nfs_setup_write
req = nfs_try_to_update_request(inode, page, offset, bytes);
if (req != NULL)
- goto out;
+ return req;
req = nfs_create_request(ctx, inode, page, offset, bytes);
if (IS_ERR(req))
- goto out;
+ return req;
error = nfs_inode_add_request(inode, req);
if (error != 0) {
nfs_release_request(req);
req = ERR_PTR(error);
+ } else {
+ struct nfs_inode *nfsi = NFS_I(inode);
+ atomic_inc(&nfsi->ndirty);
}
-out:
return req;
}
@@ -877,6 +919,7 @@ static int nfs_write_rpcsetup(struct nfs
count,
(unsigned long long)data->args.offset);
+ nfs_inc_woutstanding(inode);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -1172,7 +1215,7 @@ int nfs_writeback_done(struct rpc_task *
*/
status = NFS_PROTO(data->inode)->write_done(task, data);
if (status != 0)
- return status;
+ goto out;
nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1229,6 +1272,8 @@ int nfs_writeback_done(struct rpc_task *
task->tk_status = -EIO;
}
nfs4_sequence_free_slot(server->nfs_client, &data->res.seq_res);
+out:
+ nfs_dec_woutstanding(data->inode);
return 0;
}
@@ -1591,6 +1636,24 @@ int nfs_wb_page(struct inode *inode, str
return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
}
+int nfs_wb_eager(struct inode *inode)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct writeback_control wbc = {
+ .bdi = mapping->backing_dev_info,
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = LONG_MAX,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+ int ret;
+
+ ret = nfs_writepages(mapping, &wbc);
+ if (ret < 0)
+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
+ return ret;
+}
+
#ifdef CONFIG_MIGRATION
int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page)
@@ -1674,4 +1737,3 @@ void nfs_destroy_writepagecache(void)
mempool_destroy(nfs_wdata_mempool);
kmem_cache_destroy(nfs_wdata_cachep);
}
-
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/include/linux/nfs_fs.h linux-2.6.32-rc7/include/linux/nfs_fs.h
--- linux-2.6.32-rc7-orig/include/linux/nfs_fs.h 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/include/linux/nfs_fs.h 2009-11-13 11:36:43.982136105 -0500
@@ -166,6 +166,7 @@ struct nfs_inode {
struct radix_tree_root nfs_page_tree;
unsigned long npages;
+ atomic_t ndirty;
/* Open contexts for shared mmap writes */
struct list_head open_files;
@@ -187,6 +188,11 @@ struct nfs_inode {
#ifdef CONFIG_NFS_FSCACHE
struct fscache_cookie *fscache;
#endif
+
+ loff_t wrpos;
+ atomic_t writes;
+ wait_queue_head_t writes_wq;
+
struct inode vfs_inode;
};
@@ -467,11 +473,13 @@ extern void nfs_unblock_sillyrename(stru
* linux/fs/nfs/write.c
*/
extern int nfs_congestion_kb;
+extern int nfs_max_woutstanding;
extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
extern int nfs_writepages(struct address_space *, struct writeback_control *);
extern int nfs_flush_incompatible(struct file *file, struct page *page);
extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
+extern void nfs_wait_woutstanding(struct inode *);
/*
* Try to write back everything synchronously (but check the
@@ -482,6 +490,7 @@ extern int nfs_wb_all(struct inode *inod
extern int nfs_wb_nocommit(struct inode *inode);
extern int nfs_wb_page(struct inode *inode, struct page* page);
extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
+extern int nfs_wb_eager(struct inode *inode);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
extern int nfs_commit_inode(struct inode *, int);
extern struct nfs_write_data *nfs_commitdata_alloc(void);
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/mm/page-writeback.c linux-2.6.32-rc7/mm/page-writeback.c
--- linux-2.6.32-rc7-orig/mm/page-writeback.c 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/mm/page-writeback.c 2009-11-18 10:05:22.314373138 -0500
@@ -536,7 +536,7 @@ static void balance_dirty_pages(struct a
* threshold otherwise wait until the disk writes catch
* up.
*/
- if (bdi_nr_reclaimable > bdi_thresh) {
+ if (bdi_nr_reclaimable != 0) {
writeback_inodes_wbc(&wbc);
pages_written += write_chunk - wbc.nr_to_write;
get_dirty_limits(&background_thresh, &dirty_thresh,
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-17  2:03 [PATCH] improve the performance of large sequential write NFS workloads Steve Rago
@ 2009-12-17  8:17 ` Peter Zijlstra
  2009-12-18 19:33   ` Steve Rago
  2009-12-19 12:20   ` Wu Fengguang
  0 siblings, 2 replies; 96+ messages in thread
From: Peter Zijlstra @ 2009-12-17 8:17 UTC (permalink / raw)
To: Steve Rago
Cc: linux-nfs, linux-kernel, Trond.Myklebust, Wu Fengguang, jens.axboe

On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote:
> Eager Writeback for NFS Clients
> -------------------------------
> [...]
> diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/fs/fs-writeback.c linux-2.6.32-rc7/fs/fs-writeback.c
> [...]
> @@ -954,7 +958,11 @@ int bdi_writeback_task(struct bdi_writeb
> break;
> }
>
> - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
> + if (dirty_writeback_interval == 0)
> + wait_jiffies = msecs_to_jiffies(5000); /* default */
> + else
> + wait_jiffies =
> + msecs_to_jiffies(dirty_writeback_interval * 10);

I'm not up-to-date on the bdi-writeout stuff, but this just looks wrong.

> [...]
> +void nfs_wait_woutstanding(struct inode *inode)
> +{
> + if (nfs_max_woutstanding != 0) {
> + unsigned long background_thresh;
> + unsigned long dirty_thresh;
> + long npages;
> + struct nfs_inode *nfsi = NFS_I(inode);
> +
> + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> + npages = global_page_state(NR_FILE_DIRTY) +
> + global_page_state(NR_UNSTABLE_NFS) +
> + global_page_state(NR_WRITEBACK);
> + if (npages >= background_thresh)
> + wait_event(nfsi->writes_wq,
> + atomic_read(&nfsi->writes) < nfs_max_woutstanding);
> + }
> +}

This looks utterly busted too, why the global state and not the nfs
client's bdi state?

Also, why create this extra workqueue and not simply use the congestion
interface that is present? If the congestion stuff doesn't work for you,
fix that, don't add extra muck like this.
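Something along these lines, perhaps (a rough sketch only, not tested;
the per-server writeback count and NFS_CONGESTION_ON_THRESH already
exist in fs/nfs/write.c):

	/* throttle the writer on the existing bdi congestion state
	 * instead of a private wait queue */
	while (atomic_long_read(&NFS_SERVER(inode)->writeback) >=
			NFS_CONGESTION_ON_THRESH)
		congestion_wait(BLK_RW_ASYNC, HZ/10);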
> [...]
> diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/mm/page-writeback.c linux-2.6.32-rc7/mm/page-writeback.c
> --- linux-2.6.32-rc7-orig/mm/page-writeback.c 2009-11-12 19:46:07.000000000 -0500
> +++ linux-2.6.32-rc7/mm/page-writeback.c 2009-11-18 10:05:22.314373138 -0500
> @@ -536,7 +536,7 @@ static void balance_dirty_pages(struct a
> * threshold otherwise wait until the disk writes catch
> * up.
> */
> - if (bdi_nr_reclaimable > bdi_thresh) {
> + if (bdi_nr_reclaimable != 0) {
> writeback_inodes_wbc(&wbc);
> pages_written += write_chunk - wbc.nr_to_write;
> get_dirty_limits(&background_thresh, &dirty_thresh,

And I think you just broke regular writeback here, allowing for tons of
unneeded writeout of very small chunks.

This really needs to be multiple patches, and a proper changelog
describing why you do things. The above, because my benchmark goes
faster, just isn't sufficient.

Also, I don't think this needs to have a sysctl, it should just work.
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-17  8:17 ` Peter Zijlstra
@ 2009-12-18 19:33   ` Steve Rago
  2009-12-18 19:41     ` Ingo Molnar
  2009-12-18 19:44     ` Peter Zijlstra
  1 sibling, 2 replies; 96+ messages in thread
From: Steve Rago @ 2009-12-18 19:33 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-nfs, linux-kernel, Trond.Myklebust, Wu Fengguang, jens.axboe

On Thu, 2009-12-17 at 09:17 +0100, Peter Zijlstra wrote:
> On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote:
> > [...]
> > - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
> > + if (dirty_writeback_interval == 0)
> > + wait_jiffies = msecs_to_jiffies(5000); /* default */
> > + else
> > + wait_jiffies =
> > + msecs_to_jiffies(dirty_writeback_interval * 10);
>
> I'm not up-to-date on the bdi-writeout stuff, but this just looks wrong.

The old behavior (before 2.6.32) allowed one to disable kupdate-style
periodic writeback by setting /proc/sys/vm/dirty_writeback_centisecs
to 0. In 2.6.32, the bdi flushing and kupdate-style flushing are done
by the same thread, so these changes restore that behavior.
kupdate-style flushing can interfere with eager writeback by generating
smaller writes and more commits.
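In other words, with these changes applied, the old knob works as it
did before:

    # disable kupdate-style periodic writeback (pre-2.6.32 behavior)
    echo 0 > /proc/sys/vm/dirty_writeback_centisecs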
> > [...]
> > +void nfs_wait_woutstanding(struct inode *inode)
> > +{
> > + if (nfs_max_woutstanding != 0) {
> > + [...]
> > + if (npages >= background_thresh)
> > + wait_event(nfsi->writes_wq,
> > + atomic_read(&nfsi->writes) < nfs_max_woutstanding);
> > + }
> > +}
>
> This looks utterly busted too, why the global state and not the nfs
> client's bdi state?
>
> Also, why create this extra workqueue and not simply use the congestion
> interface that is present? If the congestion stuff doesn't work for you,
> fix that, don't add extra muck like this.

Pages are a global resource. Once we hit the dirty_threshold, the
system is going to work harder to flush the pages out. This code
prevents the caller from creating more dirty pages in this state,
thereby making matters worse, when eager writeback is enabled.

This wait queue is used for different purposes than the congestion_wait
interface. Here we are preventing the caller from proceeding if there
are too many NFS writes outstanding for this thread and we are in a
memory pressure state. It has nothing to do with the state of the bdi
congestion.

> > [...]
> > - if (bdi_nr_reclaimable > bdi_thresh) {
> > + if (bdi_nr_reclaimable != 0) {
> > writeback_inodes_wbc(&wbc);
>
> And I think you just broke regular writeback here, allowing for tons of
> unneeded writeout of very small chunks.

Maybe so. I had originally worked from a 2.6.18 base, where the check
was "if (nr_reclaimable)", so I retained that check, because with eager
writeback, there should never be that many writeback pages and there is
a check above this to break out of the loop if (reclaimable+writeback
<= bdi_thresh). But I probably ignored the effect on local disk
subsystems. Do you know of any tests that will illustrate what effect
this will have?

> This really needs to be multiple patches, and a proper changelog
> describing why you do things. The above, because my benchmark goes
> faster, just isn't sufficient.

I can see splitting this into two separate patches, but then that would
be confusing, because the nfs-only patch depends on the mm/fs patch,
and the mm/fs patch looks odd all by itself (only about 10 lines of
diffs), so I thought it more prudent to keep them together.

> Also, I don't think this needs to have a sysctl, it should just work.

The sysctl is a *good thing* in that it allows the eager writeback
behavior to be tuned and shut off if need be. I can only test the
changes on a finite set of systems, so better safe than sorry.
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-18 19:33 ` Steve Rago
@ 2009-12-18 19:41   ` Ingo Molnar
  2009-12-18 21:20     ` Steve Rago
  1 sibling, 1 reply; 96+ messages in thread
From: Ingo Molnar @ 2009-12-18 19:41 UTC (permalink / raw)
To: Steve Rago
Cc: Peter Zijlstra, linux-nfs, linux-kernel, Trond.Myklebust,
    Wu Fengguang, jens.axboe

* Steve Rago <sar-a+KepyhlMvJWk0Htik3J/w@public.gmane.org> wrote:

> > Also, I don't think this needs to have a sysctl, it should just work.
>
> The sysctl is a *good thing* in that it allows the eager writeback
> behavior to be tuned and shut off if need be. I can only test the
> changes on a finite set of systems, so better safe than sorry.

This issue has been settled many years ago and that's not what we do in
the Linux kernel. We prefer patches to core code where we are reasonably
sure they result in good behavior - and then we fix bugs in the new
behavior, if any.

(Otherwise odd sysctls would mushroom quickly and the system would
become untestable in practice.)

	Ingo
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-18 19:41 ` Ingo Molnar
@ 2009-12-18 21:20   ` Steve Rago
  2009-12-18 22:07     ` Ingo Molnar
  2009-12-19  8:08     ` Arjan van de Ven
  0 siblings, 2 replies; 96+ messages in thread
From: Steve Rago @ 2009-12-18 21:20 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, linux-nfs, linux-kernel, Trond.Myklebust,
    Wu Fengguang, jens.axboe

On Fri, 2009-12-18 at 20:41 +0100, Ingo Molnar wrote:
> [...]
> This issue has been settled many years ago and that's not what we do in
> the Linux kernel. We prefer patches to core code where we are reasonably
> sure they result in good behavior - and then we fix bugs in the new
> behavior, if any.
>
> (Otherwise odd sysctls would mushroom quickly and the system would
> become untestable in practice.)

I don't disagree, but "that's not what we do" hardly provides insight
into making the judgment call. In this case, the variety of
combinations of NFS server speed, NFS client speed, transmission link
speed, client memory size, and server memory size argues for a tunable
parameter, because one value probably won't work well in all
combinations. Making it change dynamically based on these parameters
is more complicated than these circumstances call for, IMHO.

Steve
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-18 21:20 ` Steve Rago
@ 2009-12-18 22:07   ` Ingo Molnar
  2009-12-18 22:46     ` Steve Rago
  0 siblings, 1 reply; 96+ messages in thread
From: Ingo Molnar @ 2009-12-18 22:07 UTC (permalink / raw)
To: Steve Rago
Cc: Peter Zijlstra, linux-nfs, linux-kernel, Trond.Myklebust,
    Wu Fengguang, jens.axboe

* Steve Rago <sar-a+KepyhlMvJWk0Htik3J/w@public.gmane.org> wrote:

> I don't disagree, but "that's not what we do" hardly provides insight
> into making the judgment call. [...]

I gave you an example of the problems that arise, see the last sentence
above.

> [...] In this case, the variety of combinations of NFS server speed,
> NFS client speed, transmission link speed, client memory size, and
> server memory size argues for a tunable parameter, because one value
> probably won't work well in all combinations. Making it change
> dynamically based on these parameters is more complicated than these
> circumstances call for, IMHO.

So having crappy tunables is the reason to introduce even more tunables?
I think you just gave a good second example of why we don't want sysctls
for features like this.

	Ingo
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-18 22:07 ` Ingo Molnar
@ 2009-12-18 22:46   ` Steve Rago
  0 siblings, 0 replies; 96+ messages in thread
From: Steve Rago @ 2009-12-18 22:46 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, linux-nfs, linux-kernel, Trond.Myklebust,
    Wu Fengguang, jens.axboe

On Fri, 2009-12-18 at 23:07 +0100, Ingo Molnar wrote:
> [...]
> So having crappy tunables is the reason to introduce even more tunables?
> I think you just gave a good second example of why we don't want sysctls
> for features like this.

The examples I cited are not tunables. They are characteristics of the
systems we use. I can't squeeze more than 1Gb/s out of my gigabit
Ethernet connection; I can't make my 2GHz CPU compute any faster; I am
limited by these components to the performance I can attain. Writing
software that performs well in all combinations, especially to take
advantage of the myriad of combinations, is difficult at best. The
tunable introduced in the patch is a compromise to writing a much more
complicated adaptive algorithm that most likely wouldn't have access to
all of the information it needed anyway.

Steve
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-18 21:20 ` Steve Rago
@ 2009-12-19  8:08   ` Arjan van de Ven
  2009-12-19 13:37     ` Steve Rago
  1 sibling, 1 reply; 96+ messages in thread
From: Arjan van de Ven @ 2009-12-19 8:08 UTC (permalink / raw)
To: Steve Rago
Cc: Ingo Molnar, Peter Zijlstra, linux-nfs, linux-kernel,
    Trond.Myklebust, Wu Fengguang, jens.axboe

On Fri, 18 Dec 2009 16:20:11 -0500
Steve Rago <sar-a+KepyhlMvJWk0Htik3J/w@public.gmane.org> wrote:

> I don't disagree, but "that's not what we do" hardly provides insight
> into making the judgment call. In this case, the variety of
> combinations of NFS server speed, NFS client speed, transmission link
> speed, client memory size, and server memory size argues for a tunable
> parameter, because one value probably won't work well in all
> combinations. Making it change dynamically based on these parameters
> is more complicated than these circumstances call for, IMHO.

if you as the expert do not know how to tune this... how is a sysadmin
supposed to know better?

--
Arjan van de Ven        Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-19  8:08 ` Arjan van de Ven
@ 2009-12-19 13:37   ` Steve Rago
  0 siblings, 0 replies; 96+ messages in thread
From: Steve Rago @ 2009-12-19 13:37 UTC (permalink / raw)
To: Arjan van de Ven
Cc: Ingo Molnar, Peter Zijlstra, linux-nfs, linux-kernel,
    Trond.Myklebust, Wu Fengguang, jens.axboe

On Sat, 2009-12-19 at 09:08 +0100, Arjan van de Ven wrote:
> if you as the expert do not know how to tune this... how is a sysadmin
> supposed to know better?

I did not say I didn't know how to tune it. I said you put the tunable
parameter in as a compromise to doing something far more complex. You
then adjust the value according to various workloads (in this case,
iozone or something that more closely resembles your application). The
same way I figure out how many NFSD processes to configure; the same
way I figure out acceptable values for dirty_ratio and
dirty_background_ratio. The code has a reasonably conservative default,
and people can adjust it if their circumstances differ such that the
default doesn't provide acceptable results.

Steve
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-18 19:33     ` Steve Rago
  2009-12-18 19:41       ` Ingo Molnar
@ 2009-12-18 19:44       ` Peter Zijlstra
  1 sibling, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2009-12-18 19:44 UTC (permalink / raw)
  To: Steve Rago
  Cc: linux-nfs, linux-kernel, Trond.Myklebust, Wu Fengguang, jens.axboe

On Fri, 2009-12-18 at 14:33 -0500, Steve Rago wrote:

> > > +void nfs_wait_woutstanding(struct inode *inode)
> > > +{
> > > +	if (nfs_max_woutstanding != 0) {
> > > +		unsigned long background_thresh;
> > > +		unsigned long dirty_thresh;
> > > +		long npages;
> > > +		struct nfs_inode *nfsi = NFS_I(inode);
> > > +
> > > +		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> > > +		npages = global_page_state(NR_FILE_DIRTY) +
> > > +			global_page_state(NR_UNSTABLE_NFS) +
> > > +			global_page_state(NR_WRITEBACK);
> > > +		if (npages >= background_thresh)
> > > +			wait_event(nfsi->writes_wq,
> > > +				atomic_read(&nfsi->writes) < nfs_max_woutstanding);
> > > +	}
> > > +}
> >
> > This looks utterly busted too, why the global state and not the nfs
> > client's bdi state?
> >
> > Also, why create this extra workqueue and not simply use the congestion
> > interface that is present? If the congestion stuff doesn't work for you,
> > fix that, don't add extra muck like this.
>
> Pages are a global resource.  Once we hit the dirty_threshold, the
> system is going to work harder to flush the pages out.  This code
> prevents the caller from creating more dirty pages in this state,
> thereby making matters worse, when eager writeback is enabled.

You misunderstand: dirty limits are per-BDI, so all those npages might be
for !NFS traffic, in which case forcing the NFS client into sync mode might
be the wrong thing to do.  The dirty pages are no longer a global resource
in the current Linux tree.

> This wait queue is used for different purposes than the congestion_wait
> interface.  Here we are preventing the caller from proceeding if there
> are too many NFS writes outstanding for this thread and we are in a
> memory pressure state.  It has nothing to do with the state of the bdi
> congestion.

I'm thinking it ought to; congestion is exactly that: the device has
backed up and needs to get moving.

^ permalink raw reply	[flat|nested] 96+ messages in thread
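A minimal sketch of the per-BDI alternative being suggested here (not the posted
patch's code): throttle against the NFS client's own backing device through the
existing congestion interface, instead of global page counters plus a private
wait queue. Both bdi_write_congested() and congestion_wait() exist in 2.6.32:

	/* assumes <linux/backing-dev.h>; function name reused for comparison */
	static void nfs_wait_woutstanding(struct inode *inode)
	{
		struct backing_dev_info *bdi = &NFS_SERVER(inode)->backing_dev_info;

		/* back off while this mount's own device is write-congested */
		while (bdi_write_congested(bdi))
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

Note that nfs_congestion_kb already drives set_bdi_congested()/
clear_bdi_congested() in fs/nfs/write.c, which is the interface being referred
to, so the congestion state comes for free.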
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-17 8:17 ` Peter Zijlstra 2009-12-18 19:33 ` Steve Rago @ 2009-12-19 12:20 ` Wu Fengguang 2009-12-19 14:25 ` Steve Rago 1 sibling, 1 reply; 96+ messages in thread From: Wu Fengguang @ 2009-12-19 12:20 UTC (permalink / raw) To: Peter Zijlstra Cc: Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach Hi Steve, // I should really read the NFS code, but maybe you can help us better // understand the problem :) On Thu, Dec 17, 2009 at 04:17:57PM +0800, Peter Zijlstra wrote: > On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote: > > Eager Writeback for NFS Clients > > ------------------------------- > > Prevent applications that write large sequential streams of data (like backup, for example) > > from entering into a memory pressure state, which degrades performance by falling back to > > synchronous operations (both synchronous writes and additional commits). What exactly is the "memory pressure state" condition? What's the code to do the "synchronous writes and additional commits" and maybe how they are triggered? > > This is accomplished by preventing the client application from > > dirtying pages faster than they can be written to the server: > > clients write pages eagerly instead of lazily. We already have the balance_dirty_pages() based global throttling. So what makes the performance difference in your proposed "per-inode" throttling? balance_dirty_pages() does have much larger threshold than yours. > > The eager writeback is controlled by a sysctl: fs.nfs.nfs_max_woutstanding set to 0 disables > > the feature. Otherwise it contains the maximum number of outstanding NFS writes that can be > > in flight for a given file. This is used to block the application from dirtying more pages > > until the writes are complete. What if we do heuristic write-behind for sequential NFS writes? Another related proposal from Peter Staubach is to start async writeback (without the throttle in your proposal) when one inode have enough pages dirtied: Another approach that I suggested was to keep track of the number of pages which are dirty on a per-inode basis. When enough pages are dirty to fill an over the wire transfer, then schedule an asynchronous write to transmit that data to the server. This ties in with support to ensure that the server/network is not completely overwhelmed by the client by flow controlling the writing application to better match the bandwidth and latencies of the network and server. With this support, the NFS client tends not to fill memory with dirty pages and thus, does not depend upon the other parts of the system to flush these pages. Can the above alternatives fix the same problem? (or perhaps, is the per-inode throttling really necessary?) > > This patch is based heavily (okay, almost entirely) on a prior patch by Peter Staubach. For > > the original patch, see http://article.gmane.org/gmane.linux.nfs/24323. > > > > The patch below applies to linux-2.6.32-rc7, but it should apply cleanly to vanilla linux-2.6.32. > > > > Performance data and tuning notes can be found on my web site (http://www.nec-labs.com/~sar). > > With iozone, I see about 50% improvement for large sequential write workloads over a 1Gb Ethernet. > > With an in-house micro-benchmark, I see 80% improvement for large, single-stream, sequential > > workloads (where "large" is defined to be greater than the memory size on the client). 
These are impressive numbers.  I wonder what the minimal patch would be
(just hacking it to be fast, without all the aux bits)?  Is it this chunk
that calls nfs_wb_eager()?

> > @@ -623,10 +635,21 @@ static ssize_t nfs_file_write(struct kio
> >  	nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
> >  	result = generic_file_aio_write(iocb, iov, nr_segs, pos);
> >  	/* Return error values for O_SYNC and IS_SYNC() */
> > -	if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
> > -		int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
> > -		if (err < 0)
> > -			result = err;
> > +	if (result >= 0) {
> > +		if (nfs_need_sync_write(iocb->ki_filp, inode)) {
> > +			int err;
> > +
> > +			err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
> > +					inode);
> > +			if (err < 0)
> > +				result = err;
> > +		} else if (nfs_max_woutstanding != 0 &&
> > +			 nfs_is_seqwrite(inode, pos) &&
> > +			 atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages) {
> > +			nfs_wb_eager(inode);
> > +		}
> > +		if (result > 0)
> > +			nfsi->wrpos = pos + result;
> >  	}

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
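Stripped of the sync-write handling, the chunk quoted above reduces to this
heuristic (a sketch using the names from the posted patch):

	/* after a successful buffered write: if we are writing sequentially
	 * and at least one full over-the-wire transfer (wpages) worth of
	 * pages is dirty, start asynchronous writeback now instead of
	 * waiting for pdflush to do it (and to send a COMMIT) */
	if (nfs_max_woutstanding != 0 &&
	    nfs_is_seqwrite(inode, pos) &&
	    atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages)
		nfs_wb_eager(inode);

	nfsi->wrpos = pos + result;	/* remember EOF for the sequential test */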
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-19 12:20 ` Wu Fengguang @ 2009-12-19 14:25 ` Steve Rago 2009-12-22 1:59 ` Wu Fengguang 2009-12-22 12:25 ` Jan Kara 0 siblings, 2 replies; 96+ messages in thread From: Steve Rago @ 2009-12-19 14:25 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach On Sat, 2009-12-19 at 20:20 +0800, Wu Fengguang wrote: > Hi Steve, > > // I should really read the NFS code, but maybe you can help us better > // understand the problem :) > > On Thu, Dec 17, 2009 at 04:17:57PM +0800, Peter Zijlstra wrote: > > On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote: > > > Eager Writeback for NFS Clients > > > ------------------------------- > > > Prevent applications that write large sequential streams of data (like backup, for example) > > > from entering into a memory pressure state, which degrades performance by falling back to > > > synchronous operations (both synchronous writes and additional commits). > > What exactly is the "memory pressure state" condition? What's the > code to do the "synchronous writes and additional commits" and maybe > how they are triggered? Memory pressure occurs when most of the client pages have been dirtied by an application (think backup server writing multi-gigabyte files that exceed the size of main memory). The system works harder to be able to free dirty pages so that they can be reused. For a local file system, this means writing the pages to disk. For NFS, however, the writes leave the pages in an "unstable" state until the server responds to a commit request. Generally speaking, commit processing is far more expensive than write processing on the server; both are done with the inode locked, but since the commit takes so long, all writes are blocked, which stalls the pipeline. Flushing is generated from the flush kernel threads (nee pdflush) and from the applications themselves (balance_dirty_pages), as well as periodic sync (kupdate style). This is roughly controlled by adjusting dirty_ratio and dirty_background_ratio (along with dirty_expire_centisecs and dirty_writeback_centisecs). In addition, when the client system *really* needs a page deep down in the page allocator, it can generate a synchronous write request of individual pages. This is just about as expensive as a commit, roughly speaking, again stalling the pipeline. > > > > This is accomplished by preventing the client application from > > > dirtying pages faster than they can be written to the server: > > > clients write pages eagerly instead of lazily. > > We already have the balance_dirty_pages() based global throttling. > So what makes the performance difference in your proposed "per-inode" throttling? > balance_dirty_pages() does have much larger threshold than yours. I originally spent several months playing with the balance_dirty_pages algorithm. The main drawback is that it affects more than the inodes that the caller is writing and that the control of what to do is too coarse. My final changes (which worked well for 1Gb connections) were more heuristic than the changes in the patch -- I basically had to come up with alternate ways to write pages without generating commits on inodes. Doing this was distasteful, as I was adjusting generic system behavior for an NFS-only problem. 
Then a colleague found Peter Staubach's patch, which worked just as well in less code, and isolated the change to the NFS component, which is where it belongs. > > > > The eager writeback is controlled by a sysctl: fs.nfs.nfs_max_woutstanding set to 0 disables > > > the feature. Otherwise it contains the maximum number of outstanding NFS writes that can be > > > in flight for a given file. This is used to block the application from dirtying more pages > > > until the writes are complete. > > What if we do heuristic write-behind for sequential NFS writes? Part of the patch does implement a heuristic write-behind. See where nfs_wb_eager() is called. > > Another related proposal from Peter Staubach is to start async writeback > (without the throttle in your proposal) when one inode have enough pages > dirtied: > > Another approach that I suggested was to keep track of the > number of pages which are dirty on a per-inode basis. When > enough pages are dirty to fill an over the wire transfer, > then schedule an asynchronous write to transmit that data to > the server. This ties in with support to ensure that the > server/network is not completely overwhelmed by the client > by flow controlling the writing application to better match > the bandwidth and latencies of the network and server. > With this support, the NFS client tends not to fill memory > with dirty pages and thus, does not depend upon the other > parts of the system to flush these pages. > > Can the above alternatives fix the same problem? (or perhaps, is the > per-inode throttling really necessary?) This alternative *is contained in* the patch (as this is mostly Peter's code anyway; all I've done is the analysis and porting). The throttling is necessary to prevent the client from continuing to fill all of its memory with dirty pages, which it can always do faster than it can write pages to the server. > > > > This patch is based heavily (okay, almost entirely) on a prior patch by Peter Staubach. For > > > the original patch, see http://article.gmane.org/gmane.linux.nfs/24323. > > > > > > The patch below applies to linux-2.6.32-rc7, but it should apply cleanly to vanilla linux-2.6.32. > > > > > > Performance data and tuning notes can be found on my web site (http://www.nec-labs.com/~sar). > > > With iozone, I see about 50% improvement for large sequential write workloads over a 1Gb Ethernet. > > > With an in-house micro-benchmark, I see 80% improvement for large, single-stream, sequential > > > workloads (where "large" is defined to be greater than the memory size on the client). > > These are impressive numbers. I wonder what would be the minimal patch > (just hacking it to fast, without all the aux bits)? Is it this chunk > to call nfs_wb_eager()? The first half is the same as before, with different indentation. The last half is indeed the heuristic to call nfs_wb_eager() to invoke asynchronous write-behind. With these changes, the number of NFS commit messages drops from a few thousands to tens when writing a 32GB file over NFS. This is mainly because the server is writing dirty pages from its cache in the background, so when a commit comes along, it has less work to do (as opposed to writing all of the pages on demand and then waiting for the commit). I have a second set of changes, which I have not yet submitted, that removes these commits, but it extends the protocol (in a backward-compatible way), which will probably be a harder sell. 
Steve > > > > @@ -623,10 +635,21 @@ static ssize_t nfs_file_write(struct kio > > > nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); > > > result = generic_file_aio_write(iocb, iov, nr_segs, pos); > > > /* Return error values for O_SYNC and IS_SYNC() */ > > > - if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { > > > - int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); > > > - if (err < 0) > > > - result = err; > > > + if (result >= 0) { > > > + if (nfs_need_sync_write(iocb->ki_filp, inode)) { > > > + int err; > > > + > > > + err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), > > > + inode); > > > + if (err < 0) > > > + result = err; > > > + } else if (nfs_max_woutstanding != 0 && > > > + nfs_is_seqwrite(inode, pos) && > > > + atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages) { > > > + nfs_wb_eager(inode); > > > + } > > > + if (result > 0) > > > + nfsi->wrpos = pos + result; > > > } > > Thanks, > Fengguang > ^ permalink raw reply [flat|nested] 96+ messages in thread
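A sanity check on those commit counts (using the one-COMMIT-per-4MB pdflush
behavior described elsewhere in this thread): flushing a 32 GB file in 4 MB
writeback chunks yields 32 GB / 4 MB = 8192 COMMITs, i.e. "a few thousands",
whereas write-behind that lets the server clean pages in the background can get
away with tens.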
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-19 14:25 ` Steve Rago @ 2009-12-22 1:59 ` Wu Fengguang 2009-12-22 12:35 ` Jan Kara ` (3 more replies) 2009-12-22 12:25 ` Jan Kara 1 sibling, 4 replies; 96+ messages in thread From: Wu Fengguang @ 2009-12-22 1:59 UTC (permalink / raw) To: Steve Rago Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Jan Kara, Arjan van de Ven, Ingo Molnar, linux-fsdevel Steve, On Sat, Dec 19, 2009 at 10:25:47PM +0800, Steve Rago wrote: > > On Sat, 2009-12-19 at 20:20 +0800, Wu Fengguang wrote: > > > > On Thu, Dec 17, 2009 at 04:17:57PM +0800, Peter Zijlstra wrote: > > > On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote: > > > > Eager Writeback for NFS Clients > > > > ------------------------------- > > > > Prevent applications that write large sequential streams of data (like backup, for example) > > > > from entering into a memory pressure state, which degrades performance by falling back to > > > > synchronous operations (both synchronous writes and additional commits). > > > > What exactly is the "memory pressure state" condition? What's the > > code to do the "synchronous writes and additional commits" and maybe > > how they are triggered? > > Memory pressure occurs when most of the client pages have been dirtied > by an application (think backup server writing multi-gigabyte files that > exceed the size of main memory). The system works harder to be able to > free dirty pages so that they can be reused. For a local file system, > this means writing the pages to disk. For NFS, however, the writes > leave the pages in an "unstable" state until the server responds to a > commit request. Generally speaking, commit processing is far more > expensive than write processing on the server; both are done with the > inode locked, but since the commit takes so long, all writes are > blocked, which stalls the pipeline. Let me try reiterate the problem with code, please correct me if I'm wrong. 1) normal fs sets I_DIRTY_DATASYNC when extending i_size, however NFS will set the flag for any pages written -- why this trick? To guarantee the call of nfs_commit_inode()? Which unfortunately turns almost every server side NFS write into sync writes.. writeback_single_inode: do_writepages nfs_writepages nfs_writepage ----[short time later]---> nfs_writeback_release* nfs_mark_request_commit __mark_inode_dirty(I_DIRTY_DATASYNC); if (I_DIRTY_SYNC || I_DIRTY_DATASYNC) <---- so this will be true for most time write_inode nfs_write_inode nfs_commit_inode 2) NFS commit stops pipeline because it sleep&wait inside i_mutex, which blocks all other NFSDs trying to write/writeback the inode. nfsd_sync: take i_mutex filemap_fdatawrite filemap_fdatawait drop i_mutex If filemap_fdatawait() can be moved out of i_mutex (or just remove the lock), we solve the root problem: nfsd_sync: [take i_mutex] filemap_fdatawrite => can also be blocked, but less a problem [drop i_mutex] filemap_fdatawait Maybe it's a dumb question, but what's the purpose of i_mutex here? For correctness or to prevent livelock? I can imagine some livelock problem here (current implementation can easily wait for extra pages), however not too hard to fix. 
The proposed patch essentially takes two actions in nfs_file_write() - to start writeback when the per-file nr_dirty goes high without committing - to throttle dirtying when the per-file nr_writeback goes high I guess this effectively prevents pdflush from kicking in with its bad committing behavior In general it's reasonable to keep NFS per-file nr_dirty low, however questionable to do per-file nr_writeback throttling. This does not work well with the global limits - eg. when there are many dirty files, the summed-up nr_writeback will still grow out of control. And it's more likely to impact user visible responsiveness than a global limit. But my opinion can be biased -- me have a patch to do global NFS nr_writeback limit ;) Thanks, Fengguang ^ permalink raw reply [flat|nested] 96+ messages in thread
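The locking change floated in this message, in code form (a sketch only -- it
ignores the ->fsync method call, and whether dropping the wait out from under
i_mutex is safe for nfsd is exactly the open question):

	static int nfsd_sync(struct file *filp)
	{
		struct inode *inode = filp->f_path.dentry->d_inode;
		int err;

		mutex_lock(&inode->i_mutex);
		err = filemap_fdatawrite(inode->i_mapping);	/* start the I/O */
		mutex_unlock(&inode->i_mutex);

		if (err == 0)
			err = filemap_fdatawait(inode->i_mapping); /* wait unlocked */
		return err;
	}

With this shape, other nfsd threads can take i_mutex and keep feeding writes to
the inode while the committing thread merely waits for in-flight pages.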
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-22  1:59             ` Wu Fengguang
@ 2009-12-22 12:35               ` Jan Kara
  [not found]                 ` <20091222123538.GB604-jyMamyUUXNJG4ohzP4jBZS1Fcj925eT/@public.gmane.org>
  2009-12-24  1:26                 ` Wu Fengguang
  2009-12-22 13:01               ` Martin Knoblauch
                                 ` (2 subsequent siblings)
  3 siblings, 2 replies; 96+ messages in thread
From: Jan Kara @ 2009-12-22 12:35 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
	jens.axboe, Peter Staubach, Jan Kara, Arjan van de Ven,
	Ingo Molnar, linux-fsdevel

> 2) NFS commit stops pipeline because it sleep&wait inside i_mutex,
> which blocks all other NFSDs trying to write/writeback the inode.
>
>         nfsd_sync:
>           take i_mutex
>             filemap_fdatawrite
>             filemap_fdatawait
>           drop i_mutex
  I believe this is unrelated to the problem Steve is trying to solve.
Once we get to doing sync writes, performance is busted, so we'd better
not get there in the first place (unless the user asked for it, of
course).

> If filemap_fdatawait() can be moved out of i_mutex (or just remove
> the lock), we solve the root problem:
>
>         nfsd_sync:
>           [take i_mutex]
>             filemap_fdatawrite  => can also be blocked, but less a problem
>           [drop i_mutex]
>             filemap_fdatawait
>
> Maybe it's a dumb question, but what's the purpose of i_mutex here?
> For correctness or to prevent livelock? I can imagine some livelock
> problem here (current implementation can easily wait for extra
> pages), however not too hard to fix.
  Generally, most filesystems take i_mutex during fsync to
    a) avoid all sorts of livelocking problems
    b) serialize fsyncs for one inode (mostly for simplicity)
  I don't see what advantage would it bring that we get rid of i_mutex
for fdatawait - only that maybe writers could proceed while we are
waiting but is that really the problem?

								Honza
--
Jan Kara <jack@suse.cz>
SuSE CR Labs

^ permalink raw reply	[flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads
       [not found]                 ` <20091222123538.GB604-jyMamyUUXNJG4ohzP4jBZS1Fcj925eT/@public.gmane.org>
@ 2009-12-23  8:43                   ` Christoph Hellwig
  2009-12-23 13:32                     ` Jan Kara
  0 siblings, 1 reply; 96+ messages in thread
From: Christoph Hellwig @ 2009-12-23 8:43 UTC (permalink / raw)
  To: Jan Kara
  Cc: Wu Fengguang, Steve Rago, Peter Zijlstra,
	linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org,
	Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel

On Tue, Dec 22, 2009 at 01:35:39PM +0100, Jan Kara wrote:
> > nfsd_sync:
> >   [take i_mutex]
> >     filemap_fdatawrite => can also be blocked, but less a problem
> >   [drop i_mutex]
> >     filemap_fdatawait
> >
> > Maybe it's a dumb question, but what's the purpose of i_mutex here?
> > For correctness or to prevent livelock? I can imagine some livelock
> > problem here (current implementation can easily wait for extra
> > pages), however not too hard to fix.
>   Generally, most filesystems take i_mutex during fsync to
>     a) avoid all sorts of livelocking problems
>     b) serialize fsyncs for one inode (mostly for simplicity)
>   I don't see what advantage would it bring that we get rid of i_mutex
> for fdatawait - only that maybe writers could proceed while we are
> waiting but is that really the problem?

It would match what we do in vfs_fsync for the non-nfsd path, so it's
a no-brainer to do it.  In fact I did switch it over to vfs_fsync a
while ago, but that got reverted because it caused deadlocks for
nfsd_sync_dir, which for some reason can't take the i_mutex (I'd have
to check the archives why).

Here's an RFC patch to make some more sense of the fsync callers in nfsd,
including fixing up the data write/wait calling conventions to match the
regular fsync path (which might make this a -stable candidate):

Index: linux-2.6/fs/nfsd/vfs.c
===================================================================
--- linux-2.6.orig/fs/nfsd/vfs.c	2009-12-23 09:32:45.693170043 +0100
+++ linux-2.6/fs/nfsd/vfs.c	2009-12-23 09:39:47.627170082 +0100
@@ -769,45 +769,27 @@ nfsd_close(struct file *filp)
 }
 
 /*
- * Sync a file
- * As this calls fsync (not fdatasync) there is no need for a write_inode
- * after it.
+ * Sync a directory to disk.
+ * + * This is odd compared to all other fsync callers because we + * + * a) do not have a file struct available + * b) expect to have i_mutex already held by the caller */ -static inline int nfsd_dosync(struct file *filp, struct dentry *dp, - const struct file_operations *fop) +int +nfsd_sync_dir(struct dentry *dentry) { - struct inode *inode = dp->d_inode; - int (*fsync) (struct file *, struct dentry *, int); + struct inode *inode = dentry->d_inode; int err; - err = filemap_fdatawrite(inode->i_mapping); - if (err == 0 && fop && (fsync = fop->fsync)) - err = fsync(filp, dp, 0); - if (err == 0) - err = filemap_fdatawait(inode->i_mapping); + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + err = filemap_write_and_wait(inode->i_mapping); + if (err == 0 && inode->i_fop->fsync) + err = inode->i_fop->fsync(NULL, dentry, 0); return err; } -static int -nfsd_sync(struct file *filp) -{ - int err; - struct inode *inode = filp->f_path.dentry->d_inode; - dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); - mutex_lock(&inode->i_mutex); - err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); - mutex_unlock(&inode->i_mutex); - - return err; -} - -int -nfsd_sync_dir(struct dentry *dp) -{ - return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); -} - /* * Obtain the readahead parameters for the file * specified by (dev, ino). @@ -1011,7 +993,7 @@ static int wait_for_concurrent_writes(st if (inode->i_state & I_DIRTY) { dprintk("nfsd: write sync %d\n", task_pid_nr(current)); - err = nfsd_sync(file); + err = vfs_fsync(file, file->f_path.dentry, 0); } last_ino = inode->i_ino; last_dev = inode->i_sb->s_dev; @@ -1180,7 +1162,7 @@ nfsd_commit(struct svc_rqst *rqstp, stru return err; if (EX_ISSYNC(fhp->fh_export)) { if (file->f_op && file->f_op->fsync) { - err = nfserrno(nfsd_sync(file)); + err = nfserrno(vfs_fsync(file, file->f_path.dentry, 0)); } else { err = nfserr_notsupp; } ^ permalink raw reply [flat|nested] 96+ messages in thread
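For comparison, the regular fsync path this patch converges on looks roughly
like the following (a simplified sketch from memory of 2.6.32; the real code
goes through vfs_fsync_range(), checks that a ->fsync method exists, and
returns -EINVAL otherwise). The point is that the data write and wait happen
outside i_mutex, and only the ->fsync method runs under it:

	int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
	{
		struct inode *inode = dentry->d_inode;
		int err, ret;

		/* data writeback and wait, without i_mutex */
		ret = filemap_write_and_wait(inode->i_mapping);

		/* only the metadata/->fsync part is serialized */
		mutex_lock(&inode->i_mutex);
		err = file->f_op->fsync(file, dentry, datasync);
		if (!ret)
			ret = err;
		mutex_unlock(&inode->i_mutex);

		return ret;
	}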
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-23 8:43 ` Christoph Hellwig @ 2009-12-23 13:32 ` Jan Kara [not found] ` <20091223133244.GB3159-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org> 0 siblings, 1 reply; 96+ messages in thread From: Jan Kara @ 2009-12-23 13:32 UTC (permalink / raw) To: Christoph Hellwig Cc: Jan Kara, Wu Fengguang, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel On Wed 23-12-09 03:43:02, Christoph Hellwig wrote: > On Tue, Dec 22, 2009 at 01:35:39PM +0100, Jan Kara wrote: > > > nfsd_sync: > > > [take i_mutex] > > > filemap_fdatawrite => can also be blocked, but less a problem > > > [drop i_mutex] > > > filemap_fdatawait > > > > > > Maybe it's a dumb question, but what's the purpose of i_mutex here? > > > For correctness or to prevent livelock? I can imagine some livelock > > > problem here (current implementation can easily wait for extra > > > pages), however not too hard to fix. > > Generally, most filesystems take i_mutex during fsync to > > a) avoid all sorts of livelocking problems > > b) serialize fsyncs for one inode (mostly for simplicity) > > I don't see what advantage would it bring that we get rid of i_mutex > > for fdatawait - only that maybe writers could proceed while we are > > waiting but is that really the problem? > > It would match what we do in vfs_fsync for the non-nfsd path, so it's > a no-brainer to do it. In fact I did switch it over to vfs_fsync a > while ago but that go reverted because it caused deadlocks for > nfsd_sync_dir which for some reason can't take the i_mutex (I'd have to > check the archives why). > > Here's a RFC patch to make some more sense of the fsync callers in nfsd, > including fixing up the data write/wait calling conventions to match the > regular fsync path (which might make this a -stable candidate): The patch looks good to me from general soundness point of view :). Someone with more NFS knowledge should tell whether dropping i_mutex for fdatawrite_and_wait is fine for NFS. Honza > Index: linux-2.6/fs/nfsd/vfs.c > =================================================================== > --- linux-2.6.orig/fs/nfsd/vfs.c 2009-12-23 09:32:45.693170043 +0100 > +++ linux-2.6/fs/nfsd/vfs.c 2009-12-23 09:39:47.627170082 +0100 > @@ -769,45 +769,27 @@ nfsd_close(struct file *filp) > } > > /* > - * Sync a file > - * As this calls fsync (not fdatasync) there is no need for a write_inode > - * after it. > + * Sync a directory to disk. 
> + * > + * This is odd compared to all other fsync callers because we > + * > + * a) do not have a file struct available > + * b) expect to have i_mutex already held by the caller > */ > -static inline int nfsd_dosync(struct file *filp, struct dentry *dp, > - const struct file_operations *fop) > +int > +nfsd_sync_dir(struct dentry *dentry) > { > - struct inode *inode = dp->d_inode; > - int (*fsync) (struct file *, struct dentry *, int); > + struct inode *inode = dentry->d_inode; > int err; > > - err = filemap_fdatawrite(inode->i_mapping); > - if (err == 0 && fop && (fsync = fop->fsync)) > - err = fsync(filp, dp, 0); > - if (err == 0) > - err = filemap_fdatawait(inode->i_mapping); > + WARN_ON(!mutex_is_locked(&inode->i_mutex)); > > + err = filemap_write_and_wait(inode->i_mapping); > + if (err == 0 && inode->i_fop->fsync) > + err = inode->i_fop->fsync(NULL, dentry, 0); > return err; > } > > -static int > -nfsd_sync(struct file *filp) > -{ > - int err; > - struct inode *inode = filp->f_path.dentry->d_inode; > - dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); > - mutex_lock(&inode->i_mutex); > - err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); > - mutex_unlock(&inode->i_mutex); > - > - return err; > -} > - > -int > -nfsd_sync_dir(struct dentry *dp) > -{ > - return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); > -} > - > /* > * Obtain the readahead parameters for the file > * specified by (dev, ino). > @@ -1011,7 +993,7 @@ static int wait_for_concurrent_writes(st > > if (inode->i_state & I_DIRTY) { > dprintk("nfsd: write sync %d\n", task_pid_nr(current)); > - err = nfsd_sync(file); > + err = vfs_fsync(file, file->f_path.dentry, 0); > } > last_ino = inode->i_ino; > last_dev = inode->i_sb->s_dev; > @@ -1180,7 +1162,7 @@ nfsd_commit(struct svc_rqst *rqstp, stru > return err; > if (EX_ISSYNC(fhp->fh_export)) { > if (file->f_op && file->f_op->fsync) { > - err = nfserrno(nfsd_sync(file)); > + err = nfserrno(vfs_fsync(file, file->f_path.dentry, 0)); > } else { > err = nfserr_notsupp; > } -- Jan Kara <jack@suse.cz> SUSE Labs, CR ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads [not found] ` <20091223133244.GB3159-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org> @ 2009-12-24 5:25 ` Wu Fengguang 0 siblings, 0 replies; 96+ messages in thread From: Wu Fengguang @ 2009-12-24 5:25 UTC (permalink / raw) To: Jan Kara Cc: Christoph Hellwig, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, Dec 23, 2009 at 09:32:44PM +0800, Jan Kara wrote: > On Wed 23-12-09 03:43:02, Christoph Hellwig wrote: > > On Tue, Dec 22, 2009 at 01:35:39PM +0100, Jan Kara wrote: > > > > nfsd_sync: > > > > [take i_mutex] > > > > filemap_fdatawrite => can also be blocked, but less a problem > > > > [drop i_mutex] > > > > filemap_fdatawait > > > > > > > > Maybe it's a dumb question, but what's the purpose of i_mutex here? > > > > For correctness or to prevent livelock? I can imagine some livelock > > > > problem here (current implementation can easily wait for extra > > > > pages), however not too hard to fix. > > > Generally, most filesystems take i_mutex during fsync to > > > a) avoid all sorts of livelocking problems > > > b) serialize fsyncs for one inode (mostly for simplicity) > > > I don't see what advantage would it bring that we get rid of i_mutex > > > for fdatawait - only that maybe writers could proceed while we are > > > waiting but is that really the problem? > > > > It would match what we do in vfs_fsync for the non-nfsd path, so it's > > a no-brainer to do it. In fact I did switch it over to vfs_fsync a > > while ago but that go reverted because it caused deadlocks for > > nfsd_sync_dir which for some reason can't take the i_mutex (I'd have to > > check the archives why). > > > > Here's a RFC patch to make some more sense of the fsync callers in nfsd, > > including fixing up the data write/wait calling conventions to match the > > regular fsync path (which might make this a -stable candidate): > The patch looks good to me from general soundness point of view :). > Someone with more NFS knowledge should tell whether dropping i_mutex for > fdatawrite_and_wait is fine for NFS. I believe it's safe to drop i_mutex for fdatawrite_and_wait(). Because NFS 1) client: collect all unstable pages (which server ACKed that have reach its page cache) 2) client: send COMMIT 3) server: fdatawrite_and_wait(), which makes sure pages in 1) get cleaned 4) client: put all pages collected in 1) to clean state So there's no need to take i_mutex to prevent concurrent write/commits. If someone else concurrently truncate and then extend i_size, the NFS verf will be changed and thus client will resend the pages? (whether it should overwrite the pages is another problem..) Thanks, Fengguang > > > Index: linux-2.6/fs/nfsd/vfs.c > > =================================================================== > > --- linux-2.6.orig/fs/nfsd/vfs.c 2009-12-23 09:32:45.693170043 +0100 > > +++ linux-2.6/fs/nfsd/vfs.c 2009-12-23 09:39:47.627170082 +0100 > > @@ -769,45 +769,27 @@ nfsd_close(struct file *filp) > > } > > > > /* > > - * Sync a file > > - * As this calls fsync (not fdatasync) there is no need for a write_inode > > - * after it. > > + * Sync a directory to disk. 
> > + * > > + * This is odd compared to all other fsync callers because we > > + * > > + * a) do not have a file struct available > > + * b) expect to have i_mutex already held by the caller > > */ > > -static inline int nfsd_dosync(struct file *filp, struct dentry *dp, > > - const struct file_operations *fop) > > +int > > +nfsd_sync_dir(struct dentry *dentry) > > { > > - struct inode *inode = dp->d_inode; > > - int (*fsync) (struct file *, struct dentry *, int); > > + struct inode *inode = dentry->d_inode; > > int err; > > > > - err = filemap_fdatawrite(inode->i_mapping); > > - if (err == 0 && fop && (fsync = fop->fsync)) > > - err = fsync(filp, dp, 0); > > - if (err == 0) > > - err = filemap_fdatawait(inode->i_mapping); > > + WARN_ON(!mutex_is_locked(&inode->i_mutex)); > > > > + err = filemap_write_and_wait(inode->i_mapping); > > + if (err == 0 && inode->i_fop->fsync) > > + err = inode->i_fop->fsync(NULL, dentry, 0); > > return err; > > } > > > > -static int > > -nfsd_sync(struct file *filp) > > -{ > > - int err; > > - struct inode *inode = filp->f_path.dentry->d_inode; > > - dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); > > - mutex_lock(&inode->i_mutex); > > - err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); > > - mutex_unlock(&inode->i_mutex); > > - > > - return err; > > -} > > - > > -int > > -nfsd_sync_dir(struct dentry *dp) > > -{ > > - return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); > > -} > > - > > /* > > * Obtain the readahead parameters for the file > > * specified by (dev, ino). > > @@ -1011,7 +993,7 @@ static int wait_for_concurrent_writes(st > > > > if (inode->i_state & I_DIRTY) { > > dprintk("nfsd: write sync %d\n", task_pid_nr(current)); > > - err = nfsd_sync(file); > > + err = vfs_fsync(file, file->f_path.dentry, 0); > > } > > last_ino = inode->i_ino; > > last_dev = inode->i_sb->s_dev; > > @@ -1180,7 +1162,7 @@ nfsd_commit(struct svc_rqst *rqstp, stru > > return err; > > if (EX_ISSYNC(fhp->fh_export)) { > > if (file->f_op && file->f_op->fsync) { > > - err = nfserrno(nfsd_sync(file)); > > + err = nfserrno(vfs_fsync(file, file->f_path.dentry, 0)); > > } else { > > err = nfserr_notsupp; > > } > -- > Jan Kara <jack@suse.cz> > SUSE Labs, CR ^ permalink raw reply [flat|nested] 96+ messages in thread
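The resend-on-verifier-change behavior alluded to above is, roughly, this check
on the client when the COMMIT reply arrives (a sketch modeled from memory on
the commit completion path around nfs_commit_done()/nfs_commit_release(); the
exact field names are approximate):

	/* on COMMIT completion, for each request covered by the commit: */
	if (memcmp(&req->wb_verf.verifier, &data->verf.verifier,
		   sizeof(data->verf.verifier)) == 0) {
		/* verifier matches the one from the WRITE: data is stable */
		nfs_inode_remove_request(req);
	} else {
		/* server lost the write (e.g. it rebooted): redirty and resend */
		nfs_mark_request_dirty(req);
	}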
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-22 12:35 ` Jan Kara [not found] ` <20091222123538.GB604-jyMamyUUXNJG4ohzP4jBZS1Fcj925eT/@public.gmane.org> @ 2009-12-24 1:26 ` Wu Fengguang 1 sibling, 0 replies; 96+ messages in thread From: Wu Fengguang @ 2009-12-24 1:26 UTC (permalink / raw) To: Jan Kara Cc: Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Tue, Dec 22, 2009 at 08:35:39PM +0800, Jan Kara wrote: > > 2) NFS commit stops pipeline because it sleep&wait inside i_mutex, > > which blocks all other NFSDs trying to write/writeback the inode. > > > > nfsd_sync: > > take i_mutex > > filemap_fdatawrite > > filemap_fdatawait > > drop i_mutex > I believe this is unrelated to the problem Steve is trying to solve. > When we get to doing sync writes the performance is busted so we better > shouldn't get to that (unless user asked for that of course). Yes, first priority is always to reduce the COMMITs and the number of writeback pages they submitted under WB_SYNC_ALL. And I guess the "increase write chunk beyond 128MB" patches can serve it well. The i_mutex should impact NFS write performance for single big copy in this way: pdflush submits many (4MB write, 1 commit) pairs, because the write and commit each will take i_mutex, it effectively limits the server side io queue depth to <=4MB: the next 4MB dirty data won't reach page cache until the previous 4MB is completely synced to disk. There are two kinds of inefficiency here: - the small queue depth - the interleaved use of CPU/DISK: loop { write 4MB => normally only CPU writeback 4MB => mostly disk } When writing many small dirty files _plus_ one big file, there will still be interleaved write/writeback: the 4MB write will be broken into 8 NFS writes with the default wsize=524288. So there may be one nfsd doing COMMIT, another 7 nfsd waiting for the big file's i_mutex. All 8 nfsd are "busy" and pipeline is destroyed. Just a possibility. > > If filemap_fdatawait() can be moved out of i_mutex (or just remove > > the lock), we solve the root problem: > > > > nfsd_sync: > > [take i_mutex] > > filemap_fdatawrite => can also be blocked, but less a problem > > [drop i_mutex] > > filemap_fdatawait > > > > Maybe it's a dumb question, but what's the purpose of i_mutex here? > > For correctness or to prevent livelock? I can imagine some livelock > > problem here (current implementation can easily wait for extra > > pages), however not too hard to fix. > Generally, most filesystems take i_mutex during fsync to > a) avoid all sorts of livelocking problems > b) serialize fsyncs for one inode (mostly for simplicity) > I don't see what advantage would it bring that we get rid of i_mutex > for fdatawait - only that maybe writers could proceed while we are > waiting but is that really the problem? The i_mutex at least has some performance impact. Another one would be the WB_SYNC_ALL. All are related to the COMMIT/sync write behavior. Are there some other _direct_ causes? Thanks, Fengguang ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-22 1:59 ` Wu Fengguang 2009-12-22 12:35 ` Jan Kara @ 2009-12-22 13:01 ` Martin Knoblauch [not found] ` <787373.9318.qm-rpBZDh8Qtqs5A34FEqDeB/u2YVrzzGjVVpNB7YpNyf8@public.gmane.org> 2009-12-22 16:41 ` Steve Rago 2009-12-23 14:21 ` Trond Myklebust 3 siblings, 1 reply; 96+ messages in thread From: Martin Knoblauch @ 2009-12-22 13:01 UTC (permalink / raw) To: Wu Fengguang, Steve Rago Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe ----- Original Message ---- > From: Wu Fengguang <fengguang.wu@intel.com> > To: Steve Rago <sar-a+KepyhlMvJWk0Htik3J/w@public.gmane.org> > Cc: Peter Zijlstra <peterz@infradead.org>; "linux-nfs@vger.kernel.org" <linux-nfs@vger.kernel.org>; "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>; "Trond.Myklebust@netapp.com" <Trond.Myklebust@netapp.com>; jens.axboe <jens.axboe@oracle.com> > Sent: Tue, December 22, 2009 2:59:07 AM > Subject: Re: [PATCH] improve the performance of large sequential write NFS workloads > [big snip] > > In general it's reasonable to keep NFS per-file nr_dirty low, however > questionable to do per-file nr_writeback throttling. This does not > work well with the global limits - eg. when there are many dirty > files, the summed-up nr_writeback will still grow out of control. > And it's more likely to impact user visible responsiveness than > a global limit. But my opinion can be biased -- me have a patch to > do global NFS nr_writeback limit ;) > is that "NFS: introduce writeback wait queue", which you sent me a while ago and I did not test until now :-( ? Cheers Martin ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads
       [not found]             ` <787373.9318.qm-rpBZDh8Qtqs5A34FEqDeB/u2YVrzzGjVVpNB7YpNyf8@public.gmane.org>
@ 2009-12-24  1:46               ` Wu Fengguang
  0 siblings, 0 replies; 96+ messages in thread
From: Wu Fengguang @ 2009-12-24 1:46 UTC (permalink / raw)
  To: Martin Knoblauch
  Cc: Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe

Hi Martin,

On Tue, Dec 22, 2009 at 09:01:46PM +0800, Martin Knoblauch wrote:
> ----- Original Message ----
>
> > From: Wu Fengguang <fengguang.wu@intel.com>
> > To: Steve Rago <sar-a+KepyhlMvJWk0Htik3J/w@public.gmane.org>
> > Cc: Peter Zijlstra <peterz@infradead.org>; "linux-nfs@vger.kernel.org" <linux-nfs@vger.kernel.org>; "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>; "Trond.Myklebust@netapp.com" <Trond.Myklebust@netapp.com>; jens.axboe <jens.axboe@oracle.com>
> > Sent: Tue, December 22, 2009 2:59:07 AM
> > Subject: Re: [PATCH] improve the performance of large sequential write NFS workloads
>
> [big snip]
>
> > In general it's reasonable to keep NFS per-file nr_dirty low, however
> > questionable to do per-file nr_writeback throttling. This does not
> > work well with the global limits - eg. when there are many dirty
> > files, the summed-up nr_writeback will still grow out of control.
> > And it's more likely to impact user visible responsiveness than
> > a global limit. But my opinion can be biased -- me have a patch to
> > do global NFS nr_writeback limit ;)
>
> is that "NFS: introduce writeback wait queue", which you sent me a while
> ago and I did not test until now :-( ?

Yes it is - I've been puzzled by some bumpy NFS write problem, and the
information in this thread seems to be helpful for understanding it :)

Thanks,
Fengguang
---
NFS: introduce writeback wait queue

The generic writeback routines are departing from congestion_wait()
in preference of get_request_wait(), i.e. waiting on the block queues.

Introduce the missing writeback wait queue for NFS, otherwise its
writeback pages may grow out of control.

In particular, balance_dirty_pages() will exit after it pushes
write_chunk pages into the PG_writeback page pool, _OR_ when the
background writeback work quits. The latter is new behavior: it can
quit not only (normally) after dropping below the background threshold,
but also when it finds _zero_ dirty pages to write. The latter case
lets the number of PG_writeback pages grow out of control if it is not
explicitly limited.

CC: Jens Axboe <jens.axboe@oracle.com>
CC: Chris Mason <chris.mason@oracle.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Peter Staubach <staubach@redhat.com>
CC: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---

The wait time and network throughput vary a lot! This is a major
problem: it means nfs_end_page_writeback() is not called smoothly over
time, even when there are plenty of PG_writeback pages on the client
side.
[ 397.828509] write_bandwidth: comm=nfsiod pages=192 time=16ms [ 397.850976] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 403.065244] write_bandwidth: comm=nfsiod pages=192 time=5212ms [ 403.549134] write_bandwidth: comm=nfsiod pages=1536 time=144ms [ 403.570717] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 403.595749] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 403.622171] write_bandwidth: comm=nfsiod pages=192 time=24ms [ 403.651779] write_bandwidth: comm=nfsiod pages=192 time=28ms [ 403.680543] write_bandwidth: comm=nfsiod pages=192 time=24ms [ 403.712572] write_bandwidth: comm=nfsiod pages=192 time=28ms [ 403.751552] write_bandwidth: comm=nfsiod pages=192 time=36ms [ 403.785979] write_bandwidth: comm=nfsiod pages=192 time=28ms [ 403.823995] write_bandwidth: comm=nfsiod pages=192 time=36ms [ 403.858970] write_bandwidth: comm=nfsiod pages=192 time=32ms [ 403.880786] write_bandwidth: comm=nfsiod pages=192 time=16ms [ 403.902732] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 403.925925] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 403.952044] write_bandwidth: comm=nfsiod pages=258 time=24ms [ 403.974006] write_bandwidth: comm=nfsiod pages=192 time=16ms [ 403.995989] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 405.031049] write_bandwidth: comm=nfsiod pages=192 time=1032ms [ 405.257635] write_bandwidth: comm=nfsiod pages=1536 time=192ms [ 405.279069] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 405.300843] write_bandwidth: comm=nfsiod pages=192 time=16ms [ 405.326031] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 405.350843] write_bandwidth: comm=nfsiod pages=192 time=24ms [ 405.375160] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 409.331015] write_bandwidth: comm=nfsiod pages=192 time=3952ms [ 409.587928] write_bandwidth: comm=nfsiod pages=1536 time=152ms [ 409.610068] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 409.635736] write_bandwidth: comm=nfsiod pages=192 time=24ms # vmmon -d 1 nr_writeback nr_dirty nr_unstable nr_writeback nr_dirty nr_unstable 11227 41463 38044 11227 41463 38044 11227 41463 38044 11227 41463 38044 11045 53987 6490 11033 53120 8145 11195 52143 10886 11211 52144 10913 11211 52144 10913 11211 52144 10913 11056 56887 3876 11062 55298 8155 11214 54485 9838 11225 54461 9852 11225 54461 9852 11225 54461 4582 22342 35535 7823 ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- usr sys idl wai hiq siq| read writ| recv send| in out | int csw 0 0 9 92 0 0| 0 0 | 66B 306B| 0 0 |1003 377 0 1 39 60 0 1| 0 0 | 90k 1361k| 0 0 |1765 1599 0 15 12 43 0 31| 0 0 |2292k 34M| 0 0 | 12k 16k 0 0 16 84 0 0| 0 0 | 132B 306B| 0 0 |1003 376 0 0 43 57 0 0| 0 0 | 66B 306B| 0 0 |1004 376 0 7 25 55 0 13| 0 0 |1202k 18M| 0 0 |7331 8921 0 8 21 55 0 15| 0 0 |1195k 18M| 0 0 |5382 6579 0 0 38 62 0 0| 0 0 | 66B 306B| 0 0 |1002 371 0 0 33 67 0 0| 0 0 | 66B 306B| 0 0 |1003 376 0 14 20 41 0 24| 0 0 |1621k 24M| 0 0 |8549 10k 0 5 31 55 0 9| 0 0 | 769k 11M| 0 0 |4444 5180 0 0 18 82 0 0| 0 0 | 66B 568B| 0 0 |1004 377 0 1 41 54 0 3| 0 0 | 184k 2777k| 0 0 |2609 2619 1 13 22 43 0 22| 0 0 |1572k 23M| 0 0 |8138 10k 0 11 9 59 0 20| 0 0 |1861k 27M| 0 0 |9576 13k 0 5 23 66 0 5| 0 0 | 540k 8122k| 0 0 |2816 2885 fs/nfs/client.c | 2 fs/nfs/write.c | 92 +++++++++++++++++++++++++++++++----- include/linux/nfs_fs_sb.h | 1 3 files changed, 84 insertions(+), 11 deletions(-) --- linux.orig/fs/nfs/write.c 2009-11-06 09:52:23.000000000 +0800 +++ linux/fs/nfs/write.c 2009-11-06 09:52:27.000000000 +0800 @@ -187,11 +187,65 @@ static int 
wb_priority(struct writeback_ * NFS congestion control */ +#define NFS_WAIT_PAGES (1024L >> (PAGE_CACHE_SHIFT - 10)) int nfs_congestion_kb; -#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) -#define NFS_CONGESTION_OFF_THRESH \ - (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) +/* + * SYNC requests will block on (2*limit) and wakeup on (2*limit-NFS_WAIT_PAGES) + * ASYNC requests will block on (limit) and wakeup on (limit - NFS_WAIT_PAGES) + * In this way SYNC writes will never be blocked by ASYNC ones. + */ + +static void nfs_set_congested(long nr, long limit, + struct backing_dev_info *bdi) +{ + if (nr > limit && !test_bit(BDI_async_congested, &bdi->state)) + set_bdi_congested(bdi, BLK_RW_ASYNC); + else if (nr > 2 * limit && !test_bit(BDI_sync_congested, &bdi->state)) + set_bdi_congested(bdi, BLK_RW_SYNC); +} + +static void nfs_wait_contested(int is_sync, + struct backing_dev_info *bdi, + wait_queue_head_t *wqh) +{ + int waitbit = is_sync ? BDI_sync_congested : BDI_async_congested; + DEFINE_WAIT(wait); + + if (!test_bit(waitbit, &bdi->state)) + return; + + for (;;) { + prepare_to_wait(&wqh[is_sync], &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(waitbit, &bdi->state)) + break; + + io_schedule(); + } + finish_wait(&wqh[is_sync], &wait); +} + +static void nfs_wakeup_congested(long nr, long limit, + struct backing_dev_info *bdi, + wait_queue_head_t *wqh) +{ + if (nr < 2*limit - min(limit/8, NFS_WAIT_PAGES)) { + if (test_bit(BDI_sync_congested, &bdi->state)) { + clear_bdi_congested(bdi, BLK_RW_SYNC); + smp_mb__after_clear_bit(); + } + if (waitqueue_active(&wqh[BLK_RW_SYNC])) + wake_up(&wqh[BLK_RW_SYNC]); + } + if (nr < limit - min(limit/8, NFS_WAIT_PAGES)) { + if (test_bit(BDI_async_congested, &bdi->state)) { + clear_bdi_congested(bdi, BLK_RW_ASYNC); + smp_mb__after_clear_bit(); + } + if (waitqueue_active(&wqh[BLK_RW_ASYNC])) + wake_up(&wqh[BLK_RW_ASYNC]); + } +} static int nfs_set_page_writeback(struct page *page) { @@ -201,11 +255,9 @@ static int nfs_set_page_writeback(struct struct inode *inode = page->mapping->host; struct nfs_server *nfss = NFS_SERVER(inode); - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) { - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } + nfs_set_congested(atomic_long_inc_return(&nfss->writeback), + nfs_congestion_kb >> (PAGE_SHIFT-10), + &nfss->backing_dev_info); } return ret; } @@ -216,8 +268,11 @@ static void nfs_end_page_writeback(struc struct nfs_server *nfss = NFS_SERVER(inode); end_page_writeback(page); - if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + + nfs_wakeup_congested(atomic_long_dec_return(&nfss->writeback), + nfs_congestion_kb >> (PAGE_SHIFT-10), + &nfss->backing_dev_info, + nfss->writeback_wait); } static struct nfs_page *nfs_find_and_lock_request(struct page *page) @@ -309,19 +364,34 @@ static int nfs_writepage_locked(struct p int nfs_writepage(struct page *page, struct writeback_control *wbc) { + struct inode *inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret; ret = nfs_writepage_locked(page, wbc); unlock_page(page); + + nfs_wait_contested(wbc->sync_mode == WB_SYNC_ALL, + &nfss->backing_dev_info, + nfss->writeback_wait); + return ret; } -static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +static int nfs_writepages_callback(struct page *page, + struct writeback_control *wbc, void *data) { + struct inode 
*inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret; ret = nfs_do_writepage(page, wbc, data); unlock_page(page); + + nfs_wait_contested(wbc->sync_mode == WB_SYNC_ALL, + &nfss->backing_dev_info, + nfss->writeback_wait); + return ret; } --- linux.orig/include/linux/nfs_fs_sb.h 2009-11-06 09:22:30.000000000 +0800 +++ linux/include/linux/nfs_fs_sb.h 2009-11-06 09:52:27.000000000 +0800 @@ -108,6 +108,7 @@ struct nfs_server { struct nfs_iostats * io_stats; /* I/O statistics */ struct backing_dev_info backing_dev_info; atomic_long_t writeback; /* number of writeback pages */ + wait_queue_head_t writeback_wait[2]; int flags; /* various flags */ unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ --- linux.orig/fs/nfs/client.c 2009-11-06 09:22:30.000000000 +0800 +++ linux/fs/nfs/client.c 2009-11-06 09:52:27.000000000 +0800 @@ -991,6 +991,8 @@ static struct nfs_server *nfs_alloc_serv INIT_LIST_HEAD(&server->master_link); atomic_set(&server->active, 0); + init_waitqueue_head(&server->writeback_wait[BLK_RW_SYNC]); + init_waitqueue_head(&server->writeback_wait[BLK_RW_ASYNC]); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { ^ permalink raw reply [flat|nested] 96+ messages in thread
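To make the two-level thresholds in this patch concrete: with 4 KB pages,
NFS_WAIT_PAGES is 1024 >> 2 = 256 pages (1 MB). Taking nfs_congestion_kb =
65536 purely as an example value, the limit works out to 65536 >> 2 = 16384
pages, so asynchronous writers block once 16384 pages are in flight and are
woken below 16384 - min(16384/8, 256) = 16128 pages, while synchronous writers
block only at 2 x 16384 = 32768 pages. The 2x gap means a sync flush is never
starved behind background writeback, and the 256-page hysteresis batches the
wakeups.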
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-22 1:59 ` Wu Fengguang 2009-12-22 12:35 ` Jan Kara 2009-12-22 13:01 ` Martin Knoblauch @ 2009-12-22 16:41 ` Steve Rago 2009-12-24 1:21 ` Wu Fengguang 2009-12-23 14:21 ` Trond Myklebust 3 siblings, 1 reply; 96+ messages in thread From: Steve Rago @ 2009-12-22 16:41 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Jan Kara, Arjan van de Ven, Ingo Molnar, linux-fsdevel On Tue, 2009-12-22 at 09:59 +0800, Wu Fengguang wrote: > Steve, > > On Sat, Dec 19, 2009 at 10:25:47PM +0800, Steve Rago wrote: > > > > On Sat, 2009-12-19 at 20:20 +0800, Wu Fengguang wrote: > > > > > > On Thu, Dec 17, 2009 at 04:17:57PM +0800, Peter Zijlstra wrote: > > > > On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote: > > > > > Eager Writeback for NFS Clients > > > > > ------------------------------- > > > > > Prevent applications that write large sequential streams of data (like backup, for example) > > > > > from entering into a memory pressure state, which degrades performance by falling back to > > > > > synchronous operations (both synchronous writes and additional commits). > > > > > > What exactly is the "memory pressure state" condition? What's the > > > code to do the "synchronous writes and additional commits" and maybe > > > how they are triggered? > > > > Memory pressure occurs when most of the client pages have been dirtied > > by an application (think backup server writing multi-gigabyte files that > > exceed the size of main memory). The system works harder to be able to > > free dirty pages so that they can be reused. For a local file system, > > this means writing the pages to disk. For NFS, however, the writes > > leave the pages in an "unstable" state until the server responds to a > > commit request. Generally speaking, commit processing is far more > > expensive than write processing on the server; both are done with the > > inode locked, but since the commit takes so long, all writes are > > blocked, which stalls the pipeline. > > Let me try reiterate the problem with code, please correct me if I'm > wrong. > > 1) normal fs sets I_DIRTY_DATASYNC when extending i_size, however NFS > will set the flag for any pages written -- why this trick? To > guarantee the call of nfs_commit_inode()? Which unfortunately turns > almost every server side NFS write into sync writes.. Not really. The commit needs to be sent, but the writes are still asynchronous. It's just that the pages can't be recycled until they are on stable storage. > > writeback_single_inode: > do_writepages > nfs_writepages > nfs_writepage ----[short time later]---> nfs_writeback_release* > nfs_mark_request_commit > __mark_inode_dirty(I_DIRTY_DATASYNC); > > if (I_DIRTY_SYNC || I_DIRTY_DATASYNC) <---- so this will be true for most time > write_inode > nfs_write_inode > nfs_commit_inode > > > 2) NFS commit stops pipeline because it sleep&wait inside i_mutex, > which blocks all other NFSDs trying to write/writeback the inode. > > nfsd_sync: > take i_mutex > filemap_fdatawrite > filemap_fdatawait > drop i_mutex > > If filemap_fdatawait() can be moved out of i_mutex (or just remove > the lock), we solve the root problem: > > nfsd_sync: > [take i_mutex] > filemap_fdatawrite => can also be blocked, but less a problem > [drop i_mutex] > filemap_fdatawait > > Maybe it's a dumb question, but what's the purpose of i_mutex here? 
> For correctness or to prevent livelock? I can imagine some livelock > problem here (current implementation can easily wait for extra > pages), however not too hard to fix. Commits and writes on the same inode need to be serialized for consistency (write can change the data and metadata; commit [fsync] needs to provide guarantees that the written data are stable). The performance problem arises because NFS writes are fast (they generally just deposit data into the server's page cache), but commits can take a long time, especially if there is a lot of cached data to flush to stable storage. > > > The proposed patch essentially takes two actions in nfs_file_write() > - to start writeback when the per-file nr_dirty goes high > without committing > - to throttle dirtying when the per-file nr_writeback goes high > I guess this effectively prevents pdflush from kicking in with > its bad committing behavior > > In general it's reasonable to keep NFS per-file nr_dirty low, however > questionable to do per-file nr_writeback throttling. This does not > work well with the global limits - eg. when there are many dirty > files, the summed-up nr_writeback will still grow out of control. Not with the eager writeback patch. The nr_writeback for NFS is limited by the woutstanding tunable parameter multiplied by the number of active NFS files being written. > And it's more likely to impact user visible responsiveness than > a global limit. But my opinion can be biased -- me have a patch to > do global NFS nr_writeback limit ;) What affects user-visible responsiveness is avoiding long delays and avoiding delays that vary widely. Whether the limit is global or per-file is less important (but I'd be happy to be convinced otherwise). Steve > > Thanks, > Fengguang > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 96+ messages in thread
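Illustrative numbers for that per-file bound (both values assumed): with
nfs_max_woutstanding = 16 and wsize = 512 KB, each file can have at most
16 x 512 KB = 8 MB of writes in flight, so ten concurrently written files are
bounded by roughly 80 MB of writeback. The total grows with the number of
active files rather than with client memory size.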
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-22 16:41 ` Steve Rago
@ 2009-12-24 1:21 ` Wu Fengguang
2009-12-24 14:49 ` Steve Rago
0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2009-12-24 1:21 UTC (permalink / raw)
To: Steve Rago
Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org,
	Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Jan Kara,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Wed, Dec 23, 2009 at 12:41:53AM +0800, Steve Rago wrote:
> 
> On Tue, 2009-12-22 at 09:59 +0800, Wu Fengguang wrote:
> > Steve,
> > 
> > On Sat, Dec 19, 2009 at 10:25:47PM +0800, Steve Rago wrote:
> > > 
> > > On Sat, 2009-12-19 at 20:20 +0800, Wu Fengguang wrote:
> > > > 
> > > > On Thu, Dec 17, 2009 at 04:17:57PM +0800, Peter Zijlstra wrote:
> > > > > On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote:
> > > > > > Eager Writeback for NFS Clients
> > > > > > -------------------------------
> > > > > > Prevent applications that write large sequential streams of data (like backup, for example)
> > > > > > from entering into a memory pressure state, which degrades performance by falling back to
> > > > > > synchronous operations (both synchronous writes and additional commits).
> > > > 
> > > > What exactly is the "memory pressure state" condition? What's the
> > > > code to do the "synchronous writes and additional commits" and maybe
> > > > how they are triggered?
> > > 
> > > Memory pressure occurs when most of the client pages have been dirtied
> > > by an application (think backup server writing multi-gigabyte files that
> > > exceed the size of main memory). The system works harder to be able to
> > > free dirty pages so that they can be reused. For a local file system,
> > > this means writing the pages to disk. For NFS, however, the writes
> > > leave the pages in an "unstable" state until the server responds to a
> > > commit request. Generally speaking, commit processing is far more
> > > expensive than write processing on the server; both are done with the
> > > inode locked, but since the commit takes so long, all writes are
> > > blocked, which stalls the pipeline.
> > 
> > Let me try to reiterate the problem with code, please correct me if I'm
> > wrong.
> > 
> > 1) normal fs sets I_DIRTY_DATASYNC when extending i_size, however NFS
> > will set the flag for any pages written -- why this trick? To
> > guarantee the call of nfs_commit_inode()? Which unfortunately turns
> > almost every server side NFS write into sync writes..

Ah, sorry for the typo. Here I mean: the commits by pdflush turn most
server side NFS _writeback_ into sync ones (ie. datawrite+datawait,
with WB_SYNC_ALL).

Just to clarify it:

	write     = from user buffer to page cache
	writeback = from page cache to disk

> Not really. The commit needs to be sent, but the writes are still
> asynchronous. It's just that the pages can't be recycled until they
> are on stable storage.

Right.

> > writeback_single_inode:
> > 	do_writepages
> > 		nfs_writepages
> > 			nfs_writepage ----[short time later]---> nfs_writeback_release*
> > 							nfs_mark_request_commit
> > 								__mark_inode_dirty(I_DIRTY_DATASYNC);
> > 
> > if (I_DIRTY_SYNC || I_DIRTY_DATASYNC)  <---- so this will be true most of the time
> > 	write_inode
> > 		nfs_write_inode
> > 			nfs_commit_inode
> > 
> > 
> > 2) NFS commit stops the pipeline because it sleeps and waits inside i_mutex,
> > which blocks all other NFSDs trying to write/writeback the inode.
> > 
> > nfsd_sync:
> > 	take i_mutex
> > 	filemap_fdatawrite
> > 	filemap_fdatawait
> > 	drop i_mutex
> > 
> > If filemap_fdatawait() can be moved out of i_mutex (or just remove
> > the lock), we solve the root problem:
> > 
> > nfsd_sync:
> > 	[take i_mutex]
> > 	filemap_fdatawrite	=> can also be blocked, but less of a problem
> > 	[drop i_mutex]
> > 	filemap_fdatawait
> > 
> > Maybe it's a dumb question, but what's the purpose of i_mutex here?
> > For correctness or to prevent livelock? I can imagine some livelock
> > problem here (the current implementation can easily wait for extra
> > pages), however not too hard to fix.
> 
> Commits and writes on the same inode need to be serialized for
> consistency (write can change the data and metadata; commit [fsync]
> needs to provide guarantees that the written data are stable). The
> performance problem arises because NFS writes are fast (they generally
> just deposit data into the server's page cache), but commits can take a

Right.

> long time, especially if there is a lot of cached data to flush to
> stable storage.

"a lot of cached data to flush" is not likely with pdflush, since it
roughly sends one COMMIT per 4MB of WRITEs. So on average each COMMIT
syncs 4MB at the server side.

Your patch adds another pre-pdflush async write logic, which greatly
reduces the number of COMMITs by pdflush. Can this be the major factor
of the performance gain?

Jan has been proposing to change the pdflush logic from

	loop over dirty files {
		writeback 4MB
		write_inode
	}
to
	loop over dirty files {
		writeback all its dirty pages
		write_inode
	}

This should also be able to reduce the number of COMMITs. I wonder if
this (more general) approach can achieve the same performance gain.

> > The proposed patch essentially takes two actions in nfs_file_write()
> > - to start writeback when the per-file nr_dirty goes high
> >   without committing
> > - to throttle dirtying when the per-file nr_writeback goes high
> >   I guess this effectively prevents pdflush from kicking in with
> >   its bad committing behavior
> > 
> > In general it's reasonable to keep NFS per-file nr_dirty low, however
> > questionable to do per-file nr_writeback throttling. This does not
> > work well with the global limits - eg. when there are many dirty
> > files, the summed-up nr_writeback will still grow out of control.
> 
> Not with the eager writeback patch. The nr_writeback for NFS is limited
> by the woutstanding tunable parameter multiplied by the number of active
> NFS files being written.

Ah yes - _active_ files. That makes it less likely, but still possible.
Imagine the summed-up nr_dirty exceeds the global limit, and pdflush
wakes up. It will cycle through all the dirty files and put them all
under active NFS write.. It's only a possibility though - NFS writes
are fast in the normal case.

> > And it's more likely to impact user visible responsiveness than
> > a global limit. But my opinion can be biased -- me have a patch to
> > do global NFS nr_writeback limit ;)
> 
> What affects user-visible responsiveness is avoiding long delays and
> avoiding delays that vary widely. Whether the limit is global or
> per-file is less important (but I'd be happy to be convinced otherwise).

For example, one solution is to have max_global_writeback and another
is to have max_file_writeback. Then their default values may be

	max_file_writeback = max_global_writeback / 10

Obviously the smaller max_file_writeback is more likely to block users
when fewer than 10 files are being actively written, which is the
common case.
Or, in this fake workload (spike writes from time to time):

	for i in `seq 1 100`
	do
		cp 10MB-$i /nfs/
		sleep 1s
	done

When you have a 5MB max_file_writeback, the copies will be bumpy, while
the max_global_writeback will never kick in..

Note that there is another difference: your per-file nr_writeback
throttles the _dirtying_ process, while my per-NFS-mount nr_writeback
throttles pdflush (and then indirectly throttles the application).

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
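The per-file throttle being debated here can be made concrete with a
short sketch. This is illustrative pseudo-C only, not code from the
posted patch: nfs_max_woutstanding is the sysctl described at the top
of the thread, while the ndirty/nwriteback counters, the writeback_wq
wait queue and nfs_start_async_writes() are invented names for this
example.

	/*
	 * Illustrative sketch of per-file eager-writeback throttling.
	 * All per-inode fields and helpers here are hypothetical.
	 */
	static void nfs_throttle_dirtying(struct nfs_inode *nfsi)
	{
		if (nfs_max_woutstanding == 0)
			return;				/* feature disabled */

		/* Start async WRITEs early, without a COMMIT, once
		 * enough dirty pages have accumulated... */
		if (nfsi->ndirty >= nfs_max_woutstanding)
			nfs_start_async_writes(nfsi);

		/* ...and block the dirtier while too many WRITEs are in
		 * flight, so pages cannot be dirtied faster than the
		 * server acknowledges them. */
		wait_event(nfsi->writeback_wq,
			   nfsi->nwriteback < nfs_max_woutstanding);
	}

Wu's objection above is precisely that the wait_event() fires per file,
so with ten active files the sum of in-flight pages is ten times larger
than any single writer ever observes.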
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-24 1:21 ` Wu Fengguang @ 2009-12-24 14:49 ` Steve Rago 2009-12-25 7:37 ` Wu Fengguang 0 siblings, 1 reply; 96+ messages in thread From: Steve Rago @ 2009-12-24 14:49 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Jan Kara, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, 2009-12-24 at 09:21 +0800, Wu Fengguang wrote: > > Commits and writes on the same inode need to be serialized for > > consistency (write can change the data and metadata; commit [fsync] > > needs to provide guarantees that the written data are stable). The > > performance problem arises because NFS writes are fast (they generally > > just deposit data into the server's page cache), but commits can take a > > Right. > > > long time, especially if there is a lot of cached data to flush to > > stable storage. > > "a lot of cached data to flush" is not likely with pdflush, since it > roughly send one COMMIT per 4MB WRITEs. So in average each COMMIT > syncs 4MB at the server side. Maybe on paper, but empirically I see anywhere from one commit per 8MB to one commit per 64 MB. > > Your patch adds another pre-pdlush async write logic, which greatly > reduced the number of COMMITs by pdflush. Can this be the major factor > of the performance gain? My patch removes pdflush from the picture almost entirely. See my comments below. > > Jan has been proposing to change the pdflush logic from > > loop over dirty files { > writeback 4MB > write_inode > } > to > loop over dirty files { > writeback all its dirty pages > write_inode > } > > This should also be able to reduce the COMMIT numbers. I wonder if > this (more general) approach can achieve the same performance gain. The pdflush mechanism is fine for random writes and small sequential writes, because it promotes concurrency -- instead of the application blocking while it tries to write and commit its data, the application can go on doing other more useful things, and the data gets flushed in the background. There is also a benefit if the application makes another modification to a page that is already dirty, because then multiple modifications are coalesced into a single write. However, the pdflush mechanism is wrong for large sequential writes (like a backup stream, for example). First, there is no concurrency to exploit -- the application is only going to dirty more pages, so removing the need for it to block writing the pages out only adds to the problem of memory pressure. Second, the application is not going to go back and modify a page it has already written, so leaving it in the cache for someone else to write provides no additional benefit. Note that this assumes the application actually cares about the consistency of its data and will call fsync() when it is done. If the application doesn't call fsync(), then it doesn't matter when the pages are written to backing store, because the interface makes no guarantees in this case. Thanks, Steve ^ permalink raw reply [flat|nested] 96+ messages in thread
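Steve's closing point -- that the interface only promises durability at
fsync() -- is worth making concrete. A minimal backup-style writer
(ordinary userspace C, unrelated to the kernel patch itself) looks like
this:

	/*
	 * Userspace illustration of the fsync() contract. Data may sit
	 * dirty in the client page cache after every write(); only
	 * fsync() forces the WRITEs plus the final COMMIT to the server.
	 */
	#include <unistd.h>

	int copy_stream(int in_fd, int out_fd)
	{
		char buf[1 << 20];
		ssize_t n;

		while ((n = read(in_fd, buf, sizeof(buf))) > 0)
			if (write(out_fd, buf, n) != n)
				return -1;	/* treat short writes as errors */
		if (n < 0)
			return -1;
		return fsync(out_fd);	/* durability is only promised here */
	}

Until the fsync() at the end, the kernel is free to schedule the WRITEs
and COMMITs however it likes, which is exactly the freedom the eager
writeback patch exploits.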
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-24 14:49 ` Steve Rago
@ 2009-12-25 7:37 ` Wu Fengguang
0 siblings, 0 replies; 96+ messages in thread
From: Wu Fengguang @ 2009-12-25 7:37 UTC (permalink / raw)
To: Steve Rago
Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org,
	Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Jan Kara,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Thu, Dec 24, 2009 at 10:49:40PM +0800, Steve Rago wrote:
> 
> On Thu, 2009-12-24 at 09:21 +0800, Wu Fengguang wrote:
> 
> > > Commits and writes on the same inode need to be serialized for
> > > consistency (write can change the data and metadata; commit [fsync]
> > > needs to provide guarantees that the written data are stable). The
> > > performance problem arises because NFS writes are fast (they generally
> > > just deposit data into the server's page cache), but commits can take a
> > 
> > Right.
> > 
> > > long time, especially if there is a lot of cached data to flush to
> > > stable storage.
> > 
> > "a lot of cached data to flush" is not likely with pdflush, since it
> > roughly sends one COMMIT per 4MB of WRITEs. So on average each COMMIT
> > syncs 4MB at the server side.
> 
> Maybe on paper, but empirically I see anywhere from one commit per 8MB
> to one commit per 64MB.

Thanks for the data. It seems that your CPU works faster than the
network, so that none of the NFS writes (submitted at L543) have
returned by the time we try to COMMIT at L547.

	543 		ret = do_writepages(mapping, wbc);
	544 
	545 		/* Don't write the inode if only I_DIRTY_PAGES was set */
	546 		if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
	547 			int err = write_inode(inode, wait);

Thus pdflush is able to do several rounds of do_writepages() before
write_inode() can actually collect some pages to be COMMITed.

> > Your patch adds another pre-pdflush async write logic, which greatly
> > reduces the number of COMMITs by pdflush. Can this be the major factor
> > of the performance gain?
> 
> My patch removes pdflush from the picture almost entirely. See my
> comments below.

Yes, for sequential async writes -- hence I said "pre-pdflush" :)

> > Jan has been proposing to change the pdflush logic from
> > 
> > 	loop over dirty files {
> > 		writeback 4MB
> > 		write_inode
> > 	}
> > to
> > 	loop over dirty files {
> > 		writeback all its dirty pages
> > 		write_inode
> > 	}
> > 
> > This should also be able to reduce the number of COMMITs. I wonder if
> > this (more general) approach can achieve the same performance gain.
> 
> The pdflush mechanism is fine for random writes and small sequential
> writes, because it promotes concurrency -- instead of the application
> blocking while it tries to write and commit its data, the application
> can go on doing other more useful things, and the data gets flushed in
> the background. There is also a benefit if the application makes
> another modification to a page that is already dirty, because then
> multiple modifications are coalesced into a single write.

Right.

> However, the pdflush mechanism is wrong for large sequential writes
> (like a backup stream, for example). First, there is no concurrency to
> exploit -- the application is only going to dirty more pages, so
> removing the need for it to block writing the pages out only adds to the
> problem of memory pressure. Second, the application is not going to go
> back and modify a page it has already written, so leaving it in the
> cache for someone else to write provides no additional benefit.
Well, in general pdflush does more good than bad, that's why we need it. The above two reasons are about "pdflush is not as helpful", but not that it is wrong. That said, I do agree to limit the per-file dirty pages for NFS -- because it tends to flush before simple stat/read operations, which could be costly. > Note that this assumes the application actually cares about the > consistency of its data and will call fsync() when it is done. If the > application doesn't call fsync(), then it doesn't matter when the pages > are written to backing store, because the interface makes no guarantees > in this case. Thanks, Fengguang ^ permalink raw reply [flat|nested] 96+ messages in thread
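The stat/read flush Wu mentions comes from the NFS client having to
push dirty data out before it can report server-maintained attributes.
A rough sketch of the idea (simplified from memory of the 2.6.32-era
client, not verbatim source; the function name is invented):

	/*
	 * Rough sketch: why a plain stat() on an NFS file with many
	 * dirty pages is expensive for the process that dirtied them.
	 */
	static int nfs_getattr_sketch(struct inode *inode, struct kstat *stat)
	{
		/* mtime/ctime are maintained by the server, so local
		 * dirty data must be written back before the attributes
		 * can be reported accurately. The larger the per-file
		 * dirty set, the longer this flush takes. */
		if (S_ISREG(inode->i_mode))
			filemap_write_and_wait(inode->i_mapping);

		generic_fillattr(inode, stat);
		return 0;
	}

This is why keeping the per-file dirty set small is attractive even
apart from the COMMIT behavior.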
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-22 1:59 ` Wu Fengguang
` (2 preceding siblings ...)
2009-12-22 16:41 ` Steve Rago
@ 2009-12-23 14:21 ` Trond Myklebust
2009-12-23 18:05 ` Jan Kara
3 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2009-12-23 14:21 UTC (permalink / raw)
To: Wu Fengguang
Cc: Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Jan Kara,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel

On Tue, 2009-12-22 at 09:59 +0800, Wu Fengguang wrote:
> 1) normal fs sets I_DIRTY_DATASYNC when extending i_size, however NFS
> will set the flag for any pages written -- why this trick? To
> guarantee the call of nfs_commit_inode()? Which unfortunately turns
> almost every server side NFS write into sync writes..
> 
> writeback_single_inode:
> 	do_writepages
> 		nfs_writepages
> 			nfs_writepage ----[short time later]---> nfs_writeback_release*
> 							nfs_mark_request_commit
> 								__mark_inode_dirty(I_DIRTY_DATASYNC);
> 
> if (I_DIRTY_SYNC || I_DIRTY_DATASYNC)  <---- so this will be true most of the time
> 	write_inode
> 		nfs_write_inode
> 			nfs_commit_inode

I have been working on a fix for this. We basically do want to ensure
that NFS calls commit (otherwise we're not finished cleaning the dirty
pages), but we want to do it _after_ we've waited for all the writes to
complete. See below...

Trond

------------------------------------------------------------------------------------------------------
VFS: Add a new inode state: I_UNSTABLE_PAGES

From: Trond Myklebust <Trond.Myklebust@netapp.com>

Add a new inode state to enable the vfs to commit the nfs unstable pages to
stable storage once the write back of dirty pages is done.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---

 fs/fs-writeback.c  |   24 ++++++++++++++++++++++--
 fs/nfs/inode.c     |   13 +++++--------
 fs/nfs/write.c     |    2 +-
 include/linux/fs.h |    7 +++++++
 4 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 49bc1b8..c035efe 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -388,6 +388,14 @@ static int write_inode(struct inode *inode, int sync)
 }
 
 /*
+ * Commit the NFS unstable pages.
+ */
+static int commit_unstable_pages(struct inode *inode, int wait)
+{
+	return write_inode(inode, wait);
+}
+
+/*
  * Wait for writeback on an inode to complete.
  */
 static void inode_wait_for_writeback(struct inode *inode)
@@ -474,6 +482,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
+	/*
+	 * Special state for cleaning NFS unstable pages
+	 */
+	if (inode->i_state & I_UNSTABLE_PAGES) {
+		int err;
+		inode->i_state &= ~I_UNSTABLE_PAGES;
+		spin_unlock(&inode_lock);
+		err = commit_unstable_pages(inode, wait);
+		if (ret == 0)
+			ret = err;
+		spin_lock(&inode_lock);
+	}
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
 		if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
@@ -481,7 +501,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * More pages get dirtied by a fast dirtier.
 			 */
 			goto select_queue;
-		} else if (inode->i_state & I_DIRTY) {
+		} else if (inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES)) {
 			/*
 			 * At least XFS will redirty the inode during the
 			 * writeback (delalloc) and on io completion (isize).
@@ -1050,7 +1070,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & flags) != flags) {
-		const int was_dirty = inode->i_state & I_DIRTY;
+		const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES);
 
 		inode->i_state |= flags;
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa0918..4f129b3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -99,17 +99,14 @@ u64 nfs_compat_user_ino64(u64 fileid)
 
 int nfs_write_inode(struct inode *inode, int sync)
 {
+	int flags = 0;
 	int ret;
 
-	if (sync) {
-		ret = filemap_fdatawait(inode->i_mapping);
-		if (ret == 0)
-			ret = nfs_commit_inode(inode, FLUSH_SYNC);
-	} else
-		ret = nfs_commit_inode(inode, 0);
-	if (ret >= 0)
+	if (sync)
+		flags = FLUSH_SYNC;
+	ret = nfs_commit_inode(inode, flags);
+	if (ret > 0)
 		return 0;
-	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return ret;
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d171696..2f74e44 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req)
 	spin_unlock(&inode->i_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	mark_inode_unstable_pages(inode);
 }
 
 static int
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cca1919..ab01af0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1637,6 +1637,8 @@ struct super_operations {
 #define I_CLEAR			64
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define __I_UNSTABLE_PAGES	9
+#define I_UNSTABLE_PAGES	(1 << __I_UNSTABLE_PAGES)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
@@ -1651,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
 }
 
+static inline void mark_inode_unstable_pages(struct inode *inode)
+{
+	__mark_inode_dirty(inode, I_UNSTABLE_PAGES);
+}
+
 /**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode

^ permalink raw reply related	[flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-23 14:21 ` Trond Myklebust @ 2009-12-23 18:05 ` Jan Kara [not found] ` <20091223180551.GD3159-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org> 0 siblings, 1 reply; 96+ messages in thread From: Jan Kara @ 2009-12-23 18:05 UTC (permalink / raw) To: Trond Myklebust Cc: Wu Fengguang, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Jan Kara, Arjan van de Ven, Ingo Molnar, linux-fsdevel On Wed 23-12-09 15:21:47, Trond Myklebust wrote: > On Tue, 2009-12-22 at 09:59 +0800, Wu Fengguang wrote: > > 1) normal fs sets I_DIRTY_DATASYNC when extending i_size, however NFS > > will set the flag for any pages written -- why this trick? To > > guarantee the call of nfs_commit_inode()? Which unfortunately turns > > almost every server side NFS write into sync writes.. > > > > writeback_single_inode: > > do_writepages > > nfs_writepages > > nfs_writepage ----[short time later]---> nfs_writeback_release* > > nfs_mark_request_commit > > __mark_inode_dirty(I_DIRTY_DATASYNC); > > > > if (I_DIRTY_SYNC || I_DIRTY_DATASYNC) <---- so this will be true for most time > > write_inode > > nfs_write_inode > > nfs_commit_inode > > > I have been working on a fix for this. We basically do want to ensure > that NFS calls commit (otherwise we're not finished cleaning the dirty > pages), but we want to do it _after_ we've waited for all the writes to > complete. See below... > > Trond > > ------------------------------------------------------------------------------------------------------ > VFS: Add a new inode state: I_UNSTABLE_PAGES > > From: Trond Myklebust <Trond.Myklebust@netapp.com> > > Add a new inode state to enable the vfs to commit the nfs unstable pages to > stable storage once the write back of dirty pages is done. Hmm, does your patch really help? > @@ -474,6 +482,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > } > > spin_lock(&inode_lock); > + /* > + * Special state for cleaning NFS unstable pages > + */ > + if (inode->i_state & I_UNSTABLE_PAGES) { > + int err; > + inode->i_state &= ~I_UNSTABLE_PAGES; > + spin_unlock(&inode_lock); > + err = commit_unstable_pages(inode, wait); > + if (ret == 0) > + ret = err; > + spin_lock(&inode_lock); > + } I don't quite understand this chunk: We've called writeback_single_inode because it had some dirty pages. Thus it has I_DIRTY_DATASYNC set and a few lines above your chunk, we've called nfs_write_inode which sent commit to the server. Now here you sometimes send the commit again? What's the purpose? > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c > index faa0918..4f129b3 100644 > --- a/fs/nfs/inode.c > +++ b/fs/nfs/inode.c > @@ -99,17 +99,14 @@ u64 nfs_compat_user_ino64(u64 fileid) > > int nfs_write_inode(struct inode *inode, int sync) > { > + int flags = 0; > int ret; > > - if (sync) { > - ret = filemap_fdatawait(inode->i_mapping); > - if (ret == 0) > - ret = nfs_commit_inode(inode, FLUSH_SYNC); > - } else > - ret = nfs_commit_inode(inode, 0); > - if (ret >= 0) > + if (sync) > + flags = FLUSH_SYNC; > + ret = nfs_commit_inode(inode, flags); > + if (ret > 0) > return 0; > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > return ret; > } > Honza -- Jan Kara <jack@suse.cz> SUSE Labs, CR ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads [not found] ` <20091223180551.GD3159-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org> @ 2009-12-23 19:12 ` Trond Myklebust 2009-12-24 2:52 ` Wu Fengguang 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2009-12-23 19:12 UTC (permalink / raw) To: Jan Kara Cc: Wu Fengguang, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel On Wed, 2009-12-23 at 19:05 +0100, Jan Kara wrote: > On Wed 23-12-09 15:21:47, Trond Myklebust wrote: > > @@ -474,6 +482,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > > } > > > > spin_lock(&inode_lock); > > + /* > > + * Special state for cleaning NFS unstable pages > > + */ > > + if (inode->i_state & I_UNSTABLE_PAGES) { > > + int err; > > + inode->i_state &= ~I_UNSTABLE_PAGES; > > + spin_unlock(&inode_lock); > > + err = commit_unstable_pages(inode, wait); > > + if (ret == 0) > > + ret = err; > > + spin_lock(&inode_lock); > > + } > I don't quite understand this chunk: We've called writeback_single_inode > because it had some dirty pages. Thus it has I_DIRTY_DATASYNC set and a few > lines above your chunk, we've called nfs_write_inode which sent commit to > the server. Now here you sometimes send the commit again? What's the > purpose? We no longer set I_DIRTY_DATASYNC. We only set I_DIRTY_PAGES (and later I_UNSTABLE_PAGES). The point is that we now do the commit only _after_ we've sent all the dirty pages, and waited for writeback to complete, whereas previously we did it in the wrong order. Cheers Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
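The ordering Trond describes can be summarized in a short sketch. The
functions named are the real ones discussed in this thread, but the
helper itself is illustrative and does not appear in the patch:

	/*
	 * The intended WB_SYNC_ALL sequence for an NFS inode: a COMMIT
	 * is only meaningful once the WRITEs it should cover have
	 * actually completed on the server.
	 */
	static int nfs_flush_and_commit(struct inode *inode)
	{
		struct address_space *mapping = inode->i_mapping;
		int ret;

		ret = filemap_fdatawrite(mapping);	/* 1. send all the WRITEs */
		if (ret)
			return ret;
		ret = filemap_fdatawait(mapping);	/* 2. wait for the WRITE replies */
		if (ret)
			return ret;
		/* 3. only now do we know which pages are unstable and
		 * whether a COMMIT is needed at all */
		ret = nfs_commit_inode(inode, FLUSH_SYNC);
		return ret > 0 ? 0 : ret;
	}

The old code effectively performed step 3 between steps 1 and 2, which
is the ordering bug the I_UNSTABLE_PAGES state is meant to fix.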
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-23 19:12 ` Trond Myklebust
@ 2009-12-24 2:52 ` Wu Fengguang
2009-12-24 12:04 ` Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2009-12-24 2:52 UTC (permalink / raw)
To: Trond Myklebust
Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

Trond,

On Thu, Dec 24, 2009 at 03:12:54AM +0800, Trond Myklebust wrote:
> On Wed, 2009-12-23 at 19:05 +0100, Jan Kara wrote:
> > On Wed 23-12-09 15:21:47, Trond Myklebust wrote:
> > > @@ -474,6 +482,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
> > >  	}
> > >  
> > >  	spin_lock(&inode_lock);
> > > +	/*
> > > +	 * Special state for cleaning NFS unstable pages
> > > +	 */
> > > +	if (inode->i_state & I_UNSTABLE_PAGES) {
> > > +		int err;
> > > +		inode->i_state &= ~I_UNSTABLE_PAGES;
> > > +		spin_unlock(&inode_lock);
> > > +		err = commit_unstable_pages(inode, wait);
> > > +		if (ret == 0)
> > > +			ret = err;
> > > +		spin_lock(&inode_lock);
> > > +	}
> > I don't quite understand this chunk: We've called writeback_single_inode
> > because it had some dirty pages. Thus it has I_DIRTY_DATASYNC set and a few
> > lines above your chunk, we've called nfs_write_inode which sent commit to
> > the server. Now here you sometimes send the commit again? What's the
> > purpose?
> 
> We no longer set I_DIRTY_DATASYNC. We only set I_DIRTY_PAGES (and later
> I_UNSTABLE_PAGES).
> 
> The point is that we now do the commit only _after_ we've sent all the
> dirty pages, and waited for writeback to complete, whereas previously we
> did it in the wrong order.

Sorry, I still don't get it. The timing used to be:

write 4MB       ==> WRITE block 0 (ie. first 512KB)
                    WRITE block 1
                    WRITE block 2
                    WRITE block 3      ack from server for WRITE block 0 => mark 0 as unstable (inode marked need-commit)
                    WRITE block 4      ack from server for WRITE block 1 => mark 1 as unstable
                    WRITE block 5      ack from server for WRITE block 2 => mark 2 as unstable
                    WRITE block 6      ack from server for WRITE block 3 => mark 3 as unstable
                    WRITE block 7      ack from server for WRITE block 4 => mark 4 as unstable
                                       ack from server for WRITE block 5 => mark 5 as unstable
write_inode     ==> COMMIT blocks 0-5
                                       ack from server for WRITE block 6 => mark 6 as unstable (inode marked need-commit)
                                       ack from server for WRITE block 7 => mark 7 as unstable
                                       ack from server for COMMIT blocks 0-5 => mark 0-5 as clean
write_inode     ==> COMMIT blocks 6-7
                                       ack from server for COMMIT blocks 6-7 => mark 6-7 as clean

Note that the first COMMIT is submitted before receiving all ACKs for
the previous writes, hence the second COMMIT is necessary. It seems
that your patch does not improve the timing at all.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-24 2:52 ` Wu Fengguang @ 2009-12-24 12:04 ` Trond Myklebust 2009-12-25 5:56 ` Wu Fengguang 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2009-12-24 12:04 UTC (permalink / raw) To: Wu Fengguang Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, 2009-12-24 at 10:52 +0800, Wu Fengguang wrote: > Trond, > > On Thu, Dec 24, 2009 at 03:12:54AM +0800, Trond Myklebust wrote: > > On Wed, 2009-12-23 at 19:05 +0100, Jan Kara wrote: > > > On Wed 23-12-09 15:21:47, Trond Myklebust wrote: > > > > @@ -474,6 +482,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > > > > } > > > > > > > > spin_lock(&inode_lock); > > > > + /* > > > > + * Special state for cleaning NFS unstable pages > > > > + */ > > > > + if (inode->i_state & I_UNSTABLE_PAGES) { > > > > + int err; > > > > + inode->i_state &= ~I_UNSTABLE_PAGES; > > > > + spin_unlock(&inode_lock); > > > > + err = commit_unstable_pages(inode, wait); > > > > + if (ret == 0) > > > > + ret = err; > > > > + spin_lock(&inode_lock); > > > > + } > > > I don't quite understand this chunk: We've called writeback_single_inode > > > because it had some dirty pages. Thus it has I_DIRTY_DATASYNC set and a few > > > lines above your chunk, we've called nfs_write_inode which sent commit to > > > the server. Now here you sometimes send the commit again? What's the > > > purpose? > > > > We no longer set I_DIRTY_DATASYNC. We only set I_DIRTY_PAGES (and later > > I_UNSTABLE_PAGES). > > > > The point is that we now do the commit only _after_ we've sent all the > > dirty pages, and waited for writeback to complete, whereas previously we > > did it in the wrong order. > > Sorry I still don't get it. The timing used to be: > > write 4MB ==> WRITE block 0 (ie. first 512KB) > WRITE block 1 > WRITE block 2 > WRITE block 3 ack from server for WRITE block 0 => mark 0 as unstable (inode marked need-commit) > WRITE block 4 ack from server for WRITE block 1 => mark 1 as unstable > WRITE block 5 ack from server for WRITE block 2 => mark 2 as unstable > WRITE block 6 ack from server for WRITE block 3 => mark 3 as unstable > WRITE block 7 ack from server for WRITE block 4 => mark 4 as unstable > ack from server for WRITE block 5 => mark 5 as unstable > write_inode ==> COMMIT blocks 0-5 > ack from server for WRITE block 6 => mark 6 as unstable (inode marked need-commit) > ack from server for WRITE block 7 => mark 7 as unstable > > ack from server for COMMIT blocks 0-5 => mark 0-5 as clean > > write_inode ==> COMMIT blocks 6-7 > > ack from server for COMMIT blocks 6-7 => mark 6-7 as clean > > Note that the first COMMIT is submitted before receiving all ACKs for > the previous writes, hence the second COMMIT is necessary. It seems > that your patch does not improve the timing at all. That would indicate that we're cycling through writeback_single_inode() more than once. Why? Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-24 12:04 ` Trond Myklebust @ 2009-12-25 5:56 ` Wu Fengguang 2009-12-30 16:22 ` Trond Myklebust 0 siblings, 1 reply; 96+ messages in thread From: Wu Fengguang @ 2009-12-25 5:56 UTC (permalink / raw) To: Trond Myklebust Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, Dec 24, 2009 at 08:04:41PM +0800, Trond Myklebust wrote: > On Thu, 2009-12-24 at 10:52 +0800, Wu Fengguang wrote: > > Trond, > > > > On Thu, Dec 24, 2009 at 03:12:54AM +0800, Trond Myklebust wrote: > > > On Wed, 2009-12-23 at 19:05 +0100, Jan Kara wrote: > > > > On Wed 23-12-09 15:21:47, Trond Myklebust wrote: > > > > > @@ -474,6 +482,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > > > > > } > > > > > > > > > > spin_lock(&inode_lock); > > > > > + /* > > > > > + * Special state for cleaning NFS unstable pages > > > > > + */ > > > > > + if (inode->i_state & I_UNSTABLE_PAGES) { > > > > > + int err; > > > > > + inode->i_state &= ~I_UNSTABLE_PAGES; > > > > > + spin_unlock(&inode_lock); > > > > > + err = commit_unstable_pages(inode, wait); > > > > > + if (ret == 0) > > > > > + ret = err; > > > > > + spin_lock(&inode_lock); > > > > > + } > > > > I don't quite understand this chunk: We've called writeback_single_inode > > > > because it had some dirty pages. Thus it has I_DIRTY_DATASYNC set and a few > > > > lines above your chunk, we've called nfs_write_inode which sent commit to > > > > the server. Now here you sometimes send the commit again? What's the > > > > purpose? > > > > > > We no longer set I_DIRTY_DATASYNC. We only set I_DIRTY_PAGES (and later > > > I_UNSTABLE_PAGES). > > > > > > The point is that we now do the commit only _after_ we've sent all the > > > dirty pages, and waited for writeback to complete, whereas previously we > > > did it in the wrong order. > > > > Sorry I still don't get it. The timing used to be: > > > > write 4MB ==> WRITE block 0 (ie. first 512KB) > > WRITE block 1 > > WRITE block 2 > > WRITE block 3 ack from server for WRITE block 0 => mark 0 as unstable (inode marked need-commit) > > WRITE block 4 ack from server for WRITE block 1 => mark 1 as unstable > > WRITE block 5 ack from server for WRITE block 2 => mark 2 as unstable > > WRITE block 6 ack from server for WRITE block 3 => mark 3 as unstable > > WRITE block 7 ack from server for WRITE block 4 => mark 4 as unstable > > ack from server for WRITE block 5 => mark 5 as unstable > > write_inode ==> COMMIT blocks 0-5 > > ack from server for WRITE block 6 => mark 6 as unstable (inode marked need-commit) > > ack from server for WRITE block 7 => mark 7 as unstable > > > > ack from server for COMMIT blocks 0-5 => mark 0-5 as clean > > > > write_inode ==> COMMIT blocks 6-7 > > > > ack from server for COMMIT blocks 6-7 => mark 6-7 as clean > > > > Note that the first COMMIT is submitted before receiving all ACKs for > > the previous writes, hence the second COMMIT is necessary. It seems > > that your patch does not improve the timing at all. > > That would indicate that we're cycling through writeback_single_inode() > more than once. Why? Yes. The above sequence can happen for a 4MB sized dirty file. 
The first COMMIT is done by L547, while the second COMMIT will be
scheduled either by __mark_inode_dirty() or by L583, depending on when
the ACKs for the WRITEs submitted at L543 arrive (too late for the
COMMIT at L547): if an ACK arrives only after the check at L578, the
inode will be queued onto the b_dirty list, but if any ACK arrives
between L547 and L578, the inode will enter b_more_io_wait, which is a
to-be-introduced new dirty list.

	537 		dirty = inode->i_state & I_DIRTY;
	538 		inode->i_state |= I_SYNC;
	539 		inode->i_state &= ~I_DIRTY;
	540 
	541 		spin_unlock(&inode_lock);
	542 
==>	543 		ret = do_writepages(mapping, wbc);
	544 
	545 		/* Don't write the inode if only I_DIRTY_PAGES was set */
	546 		if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
==>	547 			int err = write_inode(inode, wait);
	548 			if (ret == 0)
	549 				ret = err;
	550 		}
	551 
	552 		if (wait) {
	553 			int err = filemap_fdatawait(mapping);
	554 			if (ret == 0)
	555 				ret = err;
	556 		}
	557 
	558 		spin_lock(&inode_lock);
	559 		inode->i_state &= ~I_SYNC;
	560 		if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
	561 			if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
	562 				/*
	563 				 * We didn't write back all the pages.  nfs_writepages()
	564 				 * sometimes bales out without doing anything.
	565 				 */
	566 				inode->i_state |= I_DIRTY_PAGES;
	567 				if (wbc->nr_to_write <= 0) {
	568 					/*
	569 					 * slice used up: queue for next turn
	570 					 */
	571 					requeue_io(inode);
	572 				} else {
	573 					/*
	574 					 * somehow blocked: retry later
	575 					 */
	576 					requeue_io_wait(inode);
	577 				}
==>	578 			} else if (inode->i_state & I_DIRTY) {
	579 				/*
	580 				 * At least XFS will redirty the inode during the
	581 				 * writeback (delalloc) and on io completion (isize).
	582 				 */
==>	583 				requeue_io_wait(inode);

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
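requeue_io_wait() and the b_more_io_wait list come from Wu's
then-unmerged patch, so their definitions do not appear in this thread.
A plausible minimal form, written by analogy with requeue_io() in
fs/fs-writeback.c of this era (this is an assumption about the shape of
the helper, not Wu's actual code):

	/*
	 * Assumed sketch: park a temporarily-blocked inode on a
	 * separate per-bdi list so the flusher can retry it later
	 * without busy-looping over it in the current pass.
	 */
	static void requeue_io_wait(struct inode *inode)
	{
		struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

		list_move(&inode->i_list, &wb->b_more_io_wait);
	}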
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-25 5:56 ` Wu Fengguang @ 2009-12-30 16:22 ` Trond Myklebust 2009-12-31 5:04 ` Wu Fengguang 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2009-12-30 16:22 UTC (permalink / raw) To: Wu Fengguang Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Fri, 2009-12-25 at 13:56 +0800, Wu Fengguang wrote: > On Thu, Dec 24, 2009 at 08:04:41PM +0800, Trond Myklebust wrote: > > That would indicate that we're cycling through writeback_single_inode() > > more than once. Why? > > Yes. The above sequence can happen for a 4MB sized dirty file. > The first COMMIT is done by L547, while the second COMMIT will be > scheduled either by __mark_inode_dirty(), or scheduled by L583 > (depending on the time ACKs for L543 but missed L547 arrives: > if an ACK missed L578, the inode will be queued into b_dirty list, > but if any ACK arrives between L547 and L578, the inode will enter > b_more_io_wait, which is a to-be-introduced new dirty list). > > 537 dirty = inode->i_state & I_DIRTY; > 538 inode->i_state |= I_SYNC; > 539 inode->i_state &= ~I_DIRTY; > 540 > 541 spin_unlock(&inode_lock); > 542 > ==> 543 ret = do_writepages(mapping, wbc); > 544 > 545 /* Don't write the inode if only I_DIRTY_PAGES was set */ > 546 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { > ==> 547 int err = write_inode(inode, wait); > 548 if (ret == 0) > 549 ret = err; > 550 } > 551 > 552 if (wait) { > 553 int err = filemap_fdatawait(mapping); > 554 if (ret == 0) > 555 ret = err; > 556 } > 557 > 558 spin_lock(&inode_lock); > 559 inode->i_state &= ~I_SYNC; > 560 if (!(inode->i_state & (I_FREEING | I_CLEAR))) { > 561 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { > 562 /* > 563 * We didn't write back all the pages. nfs_writepages() > 564 * sometimes bales out without doing anything. > 565 */ > 566 inode->i_state |= I_DIRTY_PAGES; > 567 if (wbc->nr_to_write <= 0) { > 568 /* > 569 * slice used up: queue for next turn > 570 */ > 571 requeue_io(inode); > 572 } else { > 573 /* > 574 * somehow blocked: retry later > 575 */ > 576 requeue_io_wait(inode); > 577 } > ==> 578 } else if (inode->i_state & I_DIRTY) { > 579 /* > 580 * At least XFS will redirty the inode during the > 581 * writeback (delalloc) and on io completion (isize). > 582 */ > ==> 583 requeue_io_wait(inode); Hi Fengguang, Apologies for having taken time over this. Do you see any improvement with the appended variant instead? It adds a new address_space_operation in order to do the commit. Furthermore, it ignores the commit request if the caller is just doing a WB_SYNC_NONE background flush, waiting instead for the ensuing WB_SYNC_ALL request... Cheers Trond -------------------------------------------------------------------------------------------------------- VFS: Add a new inode state: I_UNSTABLE_PAGES From: Trond Myklebust <Trond.Myklebust@netapp.com> Add a new inode state to enable the vfs to commit the nfs unstable pages to stable storage once the write back of dirty pages is done. 
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/fs-writeback.c | 27 +++++++++++++++++++++++++-- fs/nfs/file.c | 1 + fs/nfs/inode.c | 16 ---------------- fs/nfs/internal.h | 3 ++- fs/nfs/super.c | 2 -- fs/nfs/write.c | 29 ++++++++++++++++++++++++++++- include/linux/fs.h | 9 +++++++++ 7 files changed, 65 insertions(+), 22 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 49bc1b8..24bc817 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -388,6 +388,17 @@ static int write_inode(struct inode *inode, int sync) } /* + * Commit the NFS unstable pages. + */ +static int commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + if (mapping->a_ops && mapping->a_ops->commit_unstable_pages) + return mapping->a_ops->commit_unstable_pages(mapping, wbc); + return 0; +} + +/* * Wait for writeback on an inode to complete. */ static void inode_wait_for_writeback(struct inode *inode) @@ -474,6 +485,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } spin_lock(&inode_lock); + /* + * Special state for cleaning NFS unstable pages + */ + if (inode->i_state & I_UNSTABLE_PAGES) { + int err; + inode->i_state &= ~I_UNSTABLE_PAGES; + spin_unlock(&inode_lock); + err = commit_unstable_pages(mapping, wbc); + if (ret == 0) + ret = err; + spin_lock(&inode_lock); + } inode->i_state &= ~I_SYNC; if (!(inode->i_state & (I_FREEING | I_CLEAR))) { if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { @@ -481,7 +504,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * More pages get dirtied by a fast dirtier. */ goto select_queue; - } else if (inode->i_state & I_DIRTY) { + } else if (inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES)) { /* * At least XFS will redirty the inode during the * writeback (delalloc) and on io completion (isize). 
@@ -1050,7 +1073,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) spin_lock(&inode_lock); if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; + const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES); inode->i_state |= flags; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6b89132..67e50ac 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = { .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, .error_remove_page = generic_error_remove_page, + .commit_unstable_pages = nfs_commit_unstable_pages, }; /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index faa0918..8341709 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid) return ino; } -int nfs_write_inode(struct inode *inode, int sync) -{ - int ret; - - if (sync) { - ret = filemap_fdatawait(inode->i_mapping); - if (ret == 0) - ret = nfs_commit_inode(inode, FLUSH_SYNC); - } else - ret = nfs_commit_inode(inode, 0); - if (ret >= 0) - return 0; - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; -} - void nfs_clear_inode(struct inode *inode) { /* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 29e464d..7bb326f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); -extern int nfs_write_inode(struct inode *,int); extern void nfs_clear_inode(struct inode *); #ifdef CONFIG_NFS_V4 extern void nfs4_clear_inode(struct inode *); @@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server, extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ +extern int nfs_commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ce907ef..805c1a0 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = { static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, - .write_inode = nfs_write_inode, .statfs = nfs_statfs, .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, @@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = { static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, - .write_inode = nfs_write_inode, .statfs = nfs_statfs, .clear_inode = nfs4_clear_inode, .umount_begin = nfs_umount_begin, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d171696..187f3a9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) spin_unlock(&inode->i_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + mark_inode_unstable_pages(inode); } static int @@ -1406,11 +1406,38 @@ int nfs_commit_inode(struct inode *inode, int how) } return res; } + +int nfs_commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + 
int flags = FLUSH_SYNC; + int ret; + + /* Don't commit if this is just a non-blocking flush */ + if (wbc->sync_mode != WB_SYNC_ALL) { + mark_inode_unstable_pages(inode); + return 0; + } + if (wbc->nonblocking) + flags = 0; + ret = nfs_commit_inode(inode, flags); + if (ret > 0) + return 0; + return ret; +} + #else static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) { return 0; } + +int nfs_commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return 0; +} #endif long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9147ca8..ea0b7a3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -602,6 +602,8 @@ struct address_space_operations { int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); int (*error_remove_page)(struct address_space *, struct page *); + int (*commit_unstable_pages)(struct address_space *, + struct writeback_control *); }; /* @@ -1635,6 +1637,8 @@ struct super_operations { #define I_CLEAR 64 #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) +#define __I_UNSTABLE_PAGES 9 +#define I_UNSTABLE_PAGES (1 << __I_UNSTABLE_PAGES) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) @@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +static inline void mark_inode_unstable_pages(struct inode *inode) +{ + __mark_inode_dirty(inode, I_UNSTABLE_PAGES); +} + /** * inc_nlink - directly increment an inode's link count * @inode: inode ^ permalink raw reply related [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-30 16:22 ` Trond Myklebust
@ 2009-12-31 5:04 ` Wu Fengguang
2009-12-31 19:13 ` Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2009-12-31 5:04 UTC (permalink / raw)
To: Trond Myklebust
Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

Trond,

On Thu, Dec 31, 2009 at 12:22:48AM +0800, Trond Myklebust wrote:
> it ignores the commit request if the caller is just doing a
> WB_SYNC_NONE background flush, waiting instead for the ensuing
> WB_SYNC_ALL request...

I'm afraid this will block balance_dirty_pages() until explicit
sync/fsync calls: COMMITs are bad, however if we don't send them
regularly, NR_UNSTABLE_NFS will grow large and block
balance_dirty_pages() as well as throttle_vm_writeout()..

> +int nfs_commit_unstable_pages(struct address_space *mapping,
> +			      struct writeback_control *wbc)
> +{
> +	struct inode *inode = mapping->host;
> +	int flags = FLUSH_SYNC;
> +	int ret;
> +
==> > +	/* Don't commit if this is just a non-blocking flush */
==> > +	if (wbc->sync_mode != WB_SYNC_ALL) {
==> > +		mark_inode_unstable_pages(inode);
==> > +		return 0;
==> > +	}
> +	if (wbc->nonblocking)
> +		flags = 0;
> +	ret = nfs_commit_inode(inode, flags);
> +	if (ret > 0)
> +		return 0;
> +	return ret;
> +}

The NFS protocol provides no painless way to reclaim unstable pages
other than the COMMIT (or sync write).. This leaves us in a dilemma.

We may reasonably reduce the number of COMMITs, and possibly even
delay them for a while (and hope the server has written back the
pages before the COMMIT arrives, which is somewhat fragile).

What we can obviously do is to avoid sending a COMMIT
- if there is already an ongoing COMMIT for the same inode
- or when there are ongoing WRITEs for the inode
  (is there an easy way to detect this?)

What do you think?

Thanks,
Fengguang
---
 fs/nfs/inode.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

--- linux.orig/fs/nfs/inode.c	2009-12-25 09:25:38.000000000 +0800
+++ linux/fs/nfs/inode.c	2009-12-25 10:13:06.000000000 +0800
@@ -105,8 +105,11 @@ int nfs_write_inode(struct inode *inode,
 		ret = filemap_fdatawait(inode->i_mapping);
 		if (ret == 0)
 			ret = nfs_commit_inode(inode, FLUSH_SYNC);
-	} else
+	} else if (!radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
+				      NFS_PAGE_TAG_LOCKED))
 		ret = nfs_commit_inode(inode, 0);
+	else
+		ret = -EAGAIN;
 	if (ret >= 0)
 		return 0;
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);

^ permalink raw reply	[flat|nested] 96+ messages in thread
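For readers unfamiliar with the radix-tree test in Wu's patch: the NFS
client tracks outstanding nfs_page requests in
NFS_I(inode)->nfs_page_tree, and a request stays tagged
NFS_PAGE_TAG_LOCKED while it is being processed, for example while its
WRITE RPC is in flight. A wrapper spelling out the heuristic might look
like this (illustrative only, not part of the posted patch):

	/*
	 * Illustrative wrapper for the heuristic above: true if WRITE
	 * RPCs are likely still outstanding, in which case an
	 * immediate COMMIT would cover too few pages and a second
	 * COMMIT would be needed soon afterwards.
	 */
	static int nfs_have_writes_in_flight(struct inode *inode)
	{
		return radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
					 NFS_PAGE_TAG_LOCKED);
	}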
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-31 5:04 ` Wu Fengguang
@ 2009-12-31 19:13 ` Trond Myklebust
2010-01-06 3:03 ` Wu Fengguang
0 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2009-12-31 19:13 UTC (permalink / raw)
To: Wu Fengguang
Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Thu, 2009-12-31 at 13:04 +0800, Wu Fengguang wrote:

> ---
>  fs/nfs/inode.c |    5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> --- linux.orig/fs/nfs/inode.c	2009-12-25 09:25:38.000000000 +0800
> +++ linux/fs/nfs/inode.c	2009-12-25 10:13:06.000000000 +0800
> @@ -105,8 +105,11 @@ int nfs_write_inode(struct inode *inode,
>  		ret = filemap_fdatawait(inode->i_mapping);
>  		if (ret == 0)
>  			ret = nfs_commit_inode(inode, FLUSH_SYNC);
> -	} else
> +	} else if (!radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
> +				      NFS_PAGE_TAG_LOCKED))
>  		ret = nfs_commit_inode(inode, 0);
> +	else
> +		ret = -EAGAIN;
>  	if (ret >= 0)
>  		return 0;
>  	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);

The above change improves on the existing code, but doesn't solve the
problem that write_inode() isn't a good match for COMMIT. We need to
wait for all the unstable WRITE rpc calls to return before we can know
whether or not a COMMIT is needed (some commercial servers never require
commit, even if the client requested an unstable write). That was the
other reason for the change.

I do, however, agree that the above can provide a nice heuristic for
the WB_SYNC_NONE case (minus the -EAGAIN error). Mind if I integrate
it?

Cheers (and Happy New Year!)
Trond

------------------------------------------------------------------------------------------------------------
VFS: Ensure that writeback_single_inode() commits unstable writes

From: Trond Myklebust <Trond.Myklebust@netapp.com>

If the call to do_writepages() succeeded in starting writeback, we do not
know whether or not we will need to COMMIT any unstable writes until after
the write RPC calls are finished. Currently, we assume that at least one
write RPC call will have finished, and set I_DIRTY_DATASYNC by the time
do_writepages is done, so that write_inode() is triggered.

In order to ensure reliable operation (i.e. ensure that a single call to
writeback_single_inode() with WB_SYNC_ALL set suffices to ensure that pages
are on disk) we need to first wait for filemap_fdatawait() to complete,
then test for unstable pages.

Since NFS is currently the only filesystem that has unstable pages, we can
add a new inode state I_UNSTABLE_PAGES that NFS alone will set. When set,
this will trigger a callback to a new address_space_operation to call the
COMMIT.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---

 fs/fs-writeback.c  |   31 ++++++++++++++++++++++++++++++-
 fs/nfs/file.c      |    1 +
 fs/nfs/inode.c     |   16 ----------------
 fs/nfs/internal.h  |    3 ++-
 fs/nfs/super.c     |    2 --
 fs/nfs/write.c     |   33 ++++++++++++++++++++++++++++++++-
 include/linux/fs.h |    9 +++++++++
 7 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f6c2155..b25efbb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -388,6 +388,17 @@ static int write_inode(struct inode *inode, int sync)
 }
 
 /*
+ * Commit the NFS unstable pages.
+ */
+static int commit_unstable_pages(struct address_space *mapping,
+				 struct writeback_control *wbc)
+{
+	if (mapping->a_ops && mapping->a_ops->commit_unstable_pages)
+		return mapping->a_ops->commit_unstable_pages(mapping, wbc);
+	return 0;
+}
+
+/*
  * Wait for writeback on an inode to complete.
  */
 static void inode_wait_for_writeback(struct inode *inode)
@@ -474,6 +485,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
+	/*
+	 * Special state for cleaning NFS unstable pages
+	 */
+	if (inode->i_state & I_UNSTABLE_PAGES) {
+		int err;
+		inode->i_state &= ~I_UNSTABLE_PAGES;
+		spin_unlock(&inode_lock);
+		err = commit_unstable_pages(mapping, wbc);
+		if (ret == 0)
+			ret = err;
+		spin_lock(&inode_lock);
+	}
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
 		if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
@@ -532,6 +555,12 @@ select_queue:
 				inode->i_state |= I_DIRTY_PAGES;
 				redirty_tail(inode);
 			}
+		} else if (inode->i_state & I_UNSTABLE_PAGES) {
+			/*
+			 * The inode has got yet more unstable pages to
+			 * commit. Requeue on b_more_io
+			 */
+			requeue_io(inode);
 		} else if (atomic_read(&inode->i_count)) {
 			/*
 			 * The inode is clean, inuse
@@ -1050,7 +1079,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & flags) != flags) {
-		const int was_dirty = inode->i_state & I_DIRTY;
+		const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES);
 
 		inode->i_state |= flags;
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6b89132..67e50ac 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = {
 	.migratepage = nfs_migrate_page,
 	.launder_page = nfs_launder_page,
 	.error_remove_page = generic_error_remove_page,
+	.commit_unstable_pages = nfs_commit_unstable_pages,
 };
 
 /*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa0918..8341709 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
 	return ino;
 }
 
-int nfs_write_inode(struct inode *inode, int sync)
-{
-	int ret;
-
-	if (sync) {
-		ret = filemap_fdatawait(inode->i_mapping);
-		if (ret == 0)
-			ret = nfs_commit_inode(inode, FLUSH_SYNC);
-	} else
-		ret = nfs_commit_inode(inode, 0);
-	if (ret >= 0)
-		return 0;
-	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-	return ret;
-}
-
 void nfs_clear_inode(struct inode *inode)
 {
 	/*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29e464d..7bb326f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
 extern struct workqueue_struct *nfsiod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_destroy_inode(struct inode *);
-extern int nfs_write_inode(struct inode *,int);
 extern void nfs_clear_inode(struct inode *);
 #ifdef CONFIG_NFS_V4
 extern void nfs4_clear_inode(struct inode *);
@@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server,
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 
 /* write.c */
+extern int nfs_commit_unstable_pages(struct address_space *mapping,
+		struct writeback_control *wbc);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ce907ef..805c1a0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = {
 static const struct super_operations nfs_sops = {
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
-	.write_inode	= nfs_write_inode,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs_clear_inode,
 	.umount_begin	= nfs_umount_begin,
@@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = {
 static const struct super_operations nfs4_sops = {
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
-	.write_inode	= nfs_write_inode,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs4_clear_inode,
 	.umount_begin	= nfs_umount_begin,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d171696..910be28 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req)
 	spin_unlock(&inode->i_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	mark_inode_unstable_pages(inode);
 }
 
 static int
@@ -1406,11 +1406,42 @@ int nfs_commit_inode(struct inode *inode, int how)
 	}
 	return res;
 }
+
+int nfs_commit_unstable_pages(struct address_space *mapping,
+		struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	int flags = FLUSH_SYNC;
+	int ret;
+
+	/* Don't commit yet if this is a non-blocking flush and there are
+	 * outstanding writes for this mapping.
+	 */
+	if (wbc->sync_mode != WB_SYNC_ALL &&
+	    radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
+		    NFS_PAGE_TAG_LOCKED)) {
+		mark_inode_unstable_pages(inode);
+		return 0;
+	}
+	if (wbc->nonblocking)
+		flags = 0;
+	ret = nfs_commit_inode(inode, flags);
+	if (ret > 0)
+		ret = 0;
+	return ret;
+}
+
 #else
 static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 {
 	return 0;
 }
+
+int nfs_commit_unstable_pages(struct address_space *mapping,
+		struct writeback_control *wbc)
+{
+	return 0;
+}
 #endif
 
 long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9147ca8..ea0b7a3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -602,6 +602,8 @@ struct address_space_operations {
 	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
 					unsigned long);
 	int (*error_remove_page)(struct address_space *, struct page *);
+	int (*commit_unstable_pages)(struct address_space *,
+			struct writeback_control *);
 };
 
 /*
@@ -1635,6 +1637,8 @@ struct super_operations {
 #define I_CLEAR			64
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define __I_UNSTABLE_PAGES	9
+#define I_UNSTABLE_PAGES	(1 << __I_UNSTABLE_PAGES)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
@@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
 }
 
+static inline void mark_inode_unstable_pages(struct inode *inode)
+{
+	__mark_inode_dirty(inode, I_UNSTABLE_PAGES);
+}
+
 /**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode

^ permalink raw reply related	[flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-31 19:13 ` Trond Myklebust
@ 2010-01-06  3:03 ` Wu Fengguang
  2010-01-06 16:56 ` Trond Myklebust
  0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2010-01-06 3:03 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

Trond,

On Fri, Jan 01, 2010 at 03:13:48AM +0800, Trond Myklebust wrote:
> On Thu, 2009-12-31 at 13:04 +0800, Wu Fengguang wrote:
> 
> > ---
> >  fs/nfs/inode.c |    5 ++++-
> >  1 file changed, 4 insertions(+), 1 deletion(-)
> > 
> > --- linux.orig/fs/nfs/inode.c	2009-12-25 09:25:38.000000000 +0800
> > +++ linux/fs/nfs/inode.c	2009-12-25 10:13:06.000000000 +0800
> > @@ -105,8 +105,11 @@ int nfs_write_inode(struct inode *inode,
> >  		ret = filemap_fdatawait(inode->i_mapping);
> >  		if (ret == 0)
> >  			ret = nfs_commit_inode(inode, FLUSH_SYNC);
> > -	} else
> > +	} else if (!radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
> > +				      NFS_PAGE_TAG_LOCKED))
> >  		ret = nfs_commit_inode(inode, 0);
> > +	else
> > +		ret = -EAGAIN;
> >  	if (ret >= 0)
> >  		return 0;
> >  	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
> 
> The above change improves on the existing code, but doesn't solve the
> problem that write_inode() isn't a good match for COMMIT. We need to
> wait for all the unstable WRITE rpc calls to return before we can know
> whether or not a COMMIT is needed (some commercial servers never require
> commit, even if the client requested an unstable write). That was the
> other reason for the change.

Ah, good to know that reason. However, we cannot wait on the ongoing
WRITEs for an unlimited time or number of pages; otherwise nr_unstable
goes up, squeezes nr_dirty and nr_writeback to zero, and stalls the cp
process for a long time, as demonstrated by the trace below (more
reasoning in the previous email).

> 
> I do, however, agree that the above can provide a nice heuristic for the
> WB_SYNC_NONE case (minus the -EAGAIN error). Mind if I integrate it?

Sure, thank you.

Here is the trace I collected with this patch. The pipeline is often
stalled and throughput is poor.
Thanks,
Fengguang

% vmmon -d 1 nr_writeback nr_dirty nr_unstable
nr_writeback nr_dirty nr_unstable
0 0 0
0 0 0
0 0 0
31609 71540 146
45293 60500 2832
44418 58964 5246
44927 55903 7806
44672 55901 8064
44159 52840 11646
43120 51317 14224
43556 48256 16857
42532 46728 19417
43044 43672 21977
42093 42144 24464
40999 40621 27097
41508 37560 29657
40612 36032 32089
41600 34509 32640
41600 34509 32640
41600 34509 32640
41454 32976 34319
40466 31448 36843
nr_writeback nr_dirty nr_unstable
39699 29920 39146
40210 26864 41707
39168 25336 44285
38126 25341 45330
38144 25341 45312
37779 23808 47210
38254 20752 49807
37358 19224 52239
36334 19229 53266
36352 17696 54781
35438 16168 57231
35496 13621 59736
47463 0 61420
47421 0 61440
44389 0 64472
41829 0 67032
39342 0 69519
39357 0 69504
36656 0 72205
34131 0 74730
31717 0 77144
31165 0 77696
28975 0 79886
26451 0 82410
nr_writeback nr_dirty nr_unstable
23873 0 84988
22992 0 85869
21586 0 87275
19027 0 89834
16467 0 92394
14765 0 94096
14781 0 94080
12080 0 96781
9391 0 99470
6831 0 102030
6589 0 102272
6589 0 102272
3669 0 105192
1089 0 107772
44 0 108817
0 0 108861
0 0 108861
35186 71874 1679
32626 71913 4238
30121 71913 6743
28802 71913 8062
26610 71913 10254
36953 59138 12686
34473 59114 15191
nr_writeback nr_dirty nr_unstable
33446 59114 16218
33408 59114 16256
30707 59114 18957
28183 59114 21481
25988 59114 23676
25253 59114 24411
25216 59114 24448
22953 59114 26711
35351 44274 29161
32645 44274 31867
32384 44274 32128
32384 44274 32128
32384 44274 32128
28928 44274 35584
26350 44274 38162
26112 44274 38400
26112 44274 38400
26112 44274 38400
22565 44274 41947
36989 27364 44434
35440 27379 45968
32805 27379 48603
30245 27379 51163
28672 27379 52736
nr_writeback nr_dirty nr_unstable
56047 4 52736
56051 0 52736
56051 0 52736
56051 0 52736
56051 0 52736
54279 0 54508
51846 0 56941
49158 0 59629
47987 0 60800
47987 0 60800
47987 0 60800
47987 0 60800
47987 0 60800
47987 0 60800
44612 0 62976
42228 0 62976
39650 0 62976
37236 0 62976
34658 0 62976
32226 0 62976
29722 0 62976
27161 0 62976
24674 0 62976
22242 0 62976
nr_writeback nr_dirty nr_unstable
19737 0 62976
17306 0 62976
14745 0 62976
12313 0 62976
9753 0 62976
7321 0 62976
4743 0 62976
2329 0 62976
43 0 14139
0 0 0
0 0 0
0 0 0

wfg ~% dstat
----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
usr sys idl wai hiq siq| read writ| recv send| in out | int csw
2 9 89 0 0 0| 0 0 | 729B 720B| 0 0 | 875 2136
6 9 76 8 0 1| 0 352k|9532B 4660B| 0 0 |1046 2091
3 8 89 0 0 0| 0 0 |1153B 426B| 0 0 | 870 1870
1 9 89 0 0 0| 0 72k|1218B 246B| 0 0 | 853 1757
3 8 89 0 0 0| 0 0 | 844B 66B| 0 0 | 865 1695
2 7 91 0 0 0| 0 0 | 523B 66B| 0 0 | 818 1576
3 7 90 0 0 0| 0 0 | 901B 66B| 0 0 | 820 1590
6 11 68 11 0 4| 0 456k|2028k 51k| 0 0 |1560 2756
7 21 52 0 0 20| 0 0 | 11M 238k| 0 0 |4627 7423
2 22 51 0 0 24| 0 80k| 10M 230k| 0 0 |4200 6469
4 19 54 0 0 23| 0 0 | 10M 236k| 0 0 |4277 6629
3 15 37 31 0 14| 0 64M|5377k 115k| 0 0 |2229 2972
3 27 45 0 0 26| 0 0 | 10M 237k| 0 0 |4416 6743
3 20 51 0 0 27| 0 1024k| 10M 233k| 0 0 |4284 6694
^C
----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
usr sys idl wai hiq siq| read writ| recv send| in out | int csw
5 9 84 2 0 1| 225k 443k| 0 0 | 0 0 | 950 1985
4 28 25 22 0 21| 0 62M| 10M 235k| 0 0 |4529 6686
5 23 30 11 0 31| 0 23M| 10M 239k| 0 0 |4570 6948
2 24 48 0 0 26| 0 0 | 10M 234k| 0 0 |4334 6796
2 25 34 17 0 22| 0 50M| 10M 236k| 0 0 |4546 6944
2 29 46 7 0 18| 0 14M| 10M 236k| 0 0 |4411 6998
2 23 53 0 0 22| 0 0 | 10M 232k| 0 0 |4100 6595
3 19 20 32 0 26| 0 39M|9466k 207k| 0 0 |3455 4617
2 13 40 43 0 1| 0 41M| 930B 264B| 0 0 | 906 1545
3 7 45 43 0 1| 0 57M| 713B 132B| 0 0 | 859 1669
3 9 47 40 0 1| 0 54M| 376B 66B| 0 0 | 944 1741
5 25 47 0 0 21| 0 16k|9951k 222k| 0 0 |4227 6697
5 20 38 14 0 23| 0 36M|9388k 204k| 0 0 |3650 5135
3 28 46 0 0 24| 0 8192B| 11M 241k| 0 0 |4612 7115
2 24 49 0 0 25| 0 0 | 10M 234k| 0 0 |4120 6477
2 25 37 12 0 23| 0 56M| 11M 239k| 0 0 |4406 6237
3 7 38 44 0 7| 0 48M|1529k 32k| 0 0 |1071 1635
3 8 41 45 0 2| 0 58M| 602B 198B| 0 0 | 886 1613
2 25 45 2 0 27| 0 2056k| 10M 228k| 0 0 |4233 6623
2 24 49 0 0 24| 0 0 | 10M 235k| 0 0 |4292 6815
2 27 41 8 0 22| 0 50M| 10M 234k| 0 0 |4381 6394
1 9 41 41 0 7| 0 59M|1790k 38k| 0 0 |1226 1823
2 26 40 10 0 22| 0 17M|8185k 183k| 0 0 |3584 5410
1 23 54 0 0 22| 0 0 | 10M 228k| 0 0 |4153 6672
1 22 49 0 0 28| 0 37M| 11M 239k| 0 0 |4499 6938
2 15 37 32 0 13| 0 57M|5078k 110k| 0 0 |2154 2903
3 20 45 21 0 10| 0 31M|4268k 96k| 0 0 |2338 3712
2 21 55 0 0 21| 0 0 | 10M 231k| 0 0 |4292 6940
2 22 49 0 0 27| 0 25M| 11M 238k| 0 0 |4338 6677
2 17 42 19 0 19| 0 53M|8269k 180k| 0 0 |3341 4501
3 17 45 33 0 2| 0 50M|2083k 49k| 0 0 |1778 2733
2 23 53 0 0 22| 0 0 | 11M 240k| 0 0 |4482 7108
2 23 51 0 0 25| 0 9792k| 10M 230k| 0 0 |4220 6563
3 21 38 15 0 24| 0 53M| 11M 240k| 0 0 |4038 5697
3 10 41 43 0 3| 0 65M| 80k 660B| 0 0 | 984 1725
1 23 51 0 0 25| 0 8192B| 10M 230k| 0 0 |4301 6652
2 21 48 0 0 29| 0 0 | 10M 237k| 0 0 |4267 6956
2 26 43 5 0 23| 0 52M| 10M 236k| 0 0 |4553 6764
7 7 34 41 0 10| 0 57M|2596k 56k| 0 0 |1210 1680
6 21 44 12 0 17| 0 19M|7053k 158k| 0 0 |3194 4902
4 24 51 0 0 21| 0 0 | 10M 237k| 0 0 |4406 6724
4 22 53 0 0 21| 0 31M| 10M 237k| 0 0 |4752 7286
4 15 32 32 0 17| 0 49M|5777k 125k| 0 0 |2379 3015
5 14 43 34 0 3| 0 48M|1781k 42k| 0 0 |1578 2492
4 22 42 0 0 32| 0 0 | 10M 236k| 0 0 |4318 6763
3 22 50 4 0 21| 0 7072k| 10M 236k| 0 0 |4509 6859
6 21 28 16 0 28| 0 41M| 11M 241k| 0 0 |4289 5928
7 8 39 44 0 2| 0 40M| 217k 3762B| 0 0 |1024 1763
4 15 46 28 0 6| 0 39M|2377k 55k| 0 0 |1683 2678
4 24 45 0 0 26| 0 0 | 10M 232k| 0 0 |4207 6596
3 24 50 5 0 19| 0 10M|9472k 210k| 0 0 |3976 6122
5 7 40 46 0 1| 0 32M|1230B 66B| 0 0 | 967 1676
----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
usr sys idl wai hiq siq| read writ| recv send| in out | int csw
5 7 47 40 0 1| 0 39M| 651B 66B| 0 0 | 916 1583
4 12 54 22 0 7| 0 35M|1815k 41k| 0 0 |1448 2383
4 22 52 0 0 21| 0 0 | 10M 233k| 0 0 |4258 6705
4 22 52 0 0 22| 0 24M| 10M 236k| 0 0 |4480 7097
3 23 48 0 0 26| 0 28M| 10M 234k| 0 0 |4402 6798
5 12 36 29 0 19| 0 59M|5464k 118k| 0 0 |2358 2963
4 26 47 4 0 19| 0 5184k|8684k 194k| 0 0 |3786 5852
4 22 43 0 0 32| 0 0 | 10M 233k| 0 0 |4350 6779
3 26 44 0 0 27| 0 36M| 10M 233k| 0 0 |4360 6619
4 11 39 33 0 13| 0 46M|4545k 98k| 0 0 |2159 2600
3 14 40 40 0 2| 0 46M| 160k 4198B| 0 0 |1070 1610
4 25 45 0 0 27| 0 0 | 10M 236k| 0 0 |4435 6760
4 25 48 0 0 24| 0 3648k| 10M 235k| 0 0 |4595 6950
3 24 29 22 0 21| 0 37M| 10M 236k| 0 0 |4335 6461
5 11 42 36 0 6| 0 45M|2257k 48k| 0 0 |1440 1755
5 6 41 47 0 1| 0 43M| 768B 198B| 0 0 | 989 1592
5 30 47 3 0 15| 0 24k|8598k 192k| 0 0 |3694 5580
2 23 49 0 0 26| 0 0 | 10M 229k| 0 0 |4319 6805
4 22 32 20 0 22| 0 26M| 10M 234k| 0 0 |4487 6751
4 11 24 53 0 8| 0 32M|2503k 55k| 0 0 |1287 1654
8 10 42 39 0 0| 0 43M|1783B 132B| 0 0 |1054 1900
6 16 43 27 0 8| 0 24M|2790k 64k| 0 0 |2150 3370
4 24 51 0 0 21| 0 0 | 10M 231k| 0 0 |4308 6589
3 24 36 13 0 24| 0 9848k| 10M 231k| 0 0 |4394 6742
6 10 11 62 0 9| 0 27M|2519k 55k| 0 0 |1482 1723
3 12 23 61 0 2| 0 34M| 608B 132B| 0 0 | 927 1623
3 15 38 38 0 6| 0 36M|2077k 48k| 0 0 |1801 2651
7 25 45 6 0 17| 0 3000k| 11M 241k| 0 0 |5071 7687
3 26 45 3 0 23| 0 13M| 11M 238k| 0 0 |4473 6650
4 17 40 21 0 17| 0 37M|6253k 139k| 0 0 |2891 3746
3 24 48 0 0 25| 0 0 | 10M 238k| 0 0 |4736 7189
1 28 38 7 0 25| 0 9160k| 10M 232k| 0 0 |4689 7026
4 17 26 35 0 18| 0 21M|8707k 190k| 0 0 |3346 4488
4 11 12 72 0 1| 0 29M|1459B 264B| 0 0 | 947 1643
4 10 20 64 0 1| 0 28M| 728B 132B| 0 0 |1010 1531
6 8 7 78 0 1| 0 25M| 869B 66B| 0 0 | 945 1620
5 10 15 69 0 1| 0 27M| 647B 132B| 0 0 |1052 1553
5 11 0 82 0 1| 0 16M| 724B 66B| 0 0 |1063 1679
3 22 18 49 0 9| 0 14M|4560k 103k| 0 0 |2931 4039
3 24 44 0 0 29| 0 0 | 10M 236k| 0 0 |4863 7497
3 30 42 0 0 24| 0 4144k| 11M 250k| 0 0 |5505 7945
3 18 13 45 0 20| 0 15M|7234k 157k| 0 0 |3197 4021
7 9 0 82 0 1| 0 23M| 356B 198B| 0 0 | 979 1738
3 11 9 77 0 0| 0 22M| 802B 132B| 0 0 | 994 1635
5 9 1 84 0 2| 0 31M| 834B 66B| 0 0 | 996 1534
4 10 14 71 0 1| 0 20M| 288B 132B| 0 0 | 976 1627
4 14 22 59 0 1| 0 8032k| 865k 20k| 0 0 |1222 1589
4 23 46 0 0 26| 0 0 | 10M 239k| 0 0 |3791 5035
5 17 43 6 0 29| 0 17M| 10M 233k| 0 0 |3198 4372
4 19 50 0 0 27| 0 0 | 10M 231k| 0 0 |2952 4447
5 19 37 14 0 26| 0 8568k| 10M 227k| 0 0 |3562 5251
3 21 23 25 0 28| 0 9560k| 10M 230k| 0 0 |3390 5038
----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
usr sys idl wai hiq siq| read writ| recv send| in out | int csw
5 19 24 26 0 26| 0 11M| 10M 229k| 0 0 |3282 4749
4 20 8 39 0 28| 0 7992k| 10M 230k| 0 0 |3302 4488
4 17 3 47 0 30| 0 8616k| 10M 231k| 0 0 |3440 4909
5 16 22 25 0 31| 0 6556k| 10M 227k| 0 0 |3291 4671
3 18 22 24 0 32| 0 5588k| 10M 230k| 0 0 |3345 4822
4 16 26 25 0 29| 0 4744k| 10M 230k| 0 0 |3331 4854
3 18 16 37 0 26| 0 4296k| 10M 228k| 0 0 |3056 4139
3 17 18 25 0 36| 0 3016k| 10M 230k| 0 0 |3239 4623
4 19 23 26 0 27| 0 2216k| 10M 229k| 0 0 |3331 4777
4 20 41 8 0 26| 0 8584k| 10M 228k| 0 0 |3434 5114
4 17 50 0 0 29| 0 1000k| 10M 229k| 0 0 |3151 4878
2 18 50 1 0 29| 0 32k| 10M 232k| 0 0 |3176 4951
3 19 51 0 0 28| 0 0 | 10M 232k| 0 0 |3014 4567
4 17 53 1 0 24| 0 32k|8787k 195k| 0 0 |2768 4382
3 8 89 0 0 0| 0 0 |4013B 2016B| 0 0 | 866 1653
3 8 88 0 0 0| 0 16k|1017B 0 | 0 0 | 828 1660
6 8 86 0 0 0| 0 0 |1320B 66B| 0 0 | 821 1713
4 8 88 0 0 0| 0 0 | 692B 66B| 0 0 | 806 1665

> ------------------------------------------------------------------------------------------------------------
> VFS: Ensure that writeback_single_inode() commits unstable writes
> 
> From: Trond Myklebust <Trond.Myklebust@netapp.com>
> 
> If the call to do_writepages() succeeded in starting writeback, we do not
> know whether or not we will need to COMMIT any unstable writes until after
> the write RPC calls are finished. Currently, we assume that at least one
> write RPC call will have finished, and set I_DIRTY_DATASYNC by the time
> do_writepages is done, so that write_inode() is triggered.
> 
> In order to ensure reliable operation (i.e. ensure that a single call to
> writeback_single_inode() with WB_SYNC_ALL set suffices to ensure that pages
> are on disk) we need to first wait for filemap_fdatawait() to complete,
> then test for unstable pages.
> 
> Since NFS is currently the only filesystem that has unstable pages, we can
> add a new inode state I_UNSTABLE_PAGES that NFS alone will set. When set,
> this will trigger a callback to a new address_space_operation to call the
> COMMIT.
> > Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> > --- > > fs/fs-writeback.c | 31 ++++++++++++++++++++++++++++++- > fs/nfs/file.c | 1 + > fs/nfs/inode.c | 16 ---------------- > fs/nfs/internal.h | 3 ++- > fs/nfs/super.c | 2 -- > fs/nfs/write.c | 33 ++++++++++++++++++++++++++++++++- > include/linux/fs.h | 9 +++++++++ > 7 files changed, 74 insertions(+), 21 deletions(-) > > > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c > index f6c2155..b25efbb 100644 > --- a/fs/fs-writeback.c > +++ b/fs/fs-writeback.c > @@ -388,6 +388,17 @@ static int write_inode(struct inode *inode, int sync) > } > > /* > + * Commit the NFS unstable pages. > + */ > +static int commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + if (mapping->a_ops && mapping->a_ops->commit_unstable_pages) > + return mapping->a_ops->commit_unstable_pages(mapping, wbc); > + return 0; > +} > + > +/* > * Wait for writeback on an inode to complete. > */ > static void inode_wait_for_writeback(struct inode *inode) > @@ -474,6 +485,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > } > > spin_lock(&inode_lock); > + /* > + * Special state for cleaning NFS unstable pages > + */ > + if (inode->i_state & I_UNSTABLE_PAGES) { > + int err; > + inode->i_state &= ~I_UNSTABLE_PAGES; > + spin_unlock(&inode_lock); > + err = commit_unstable_pages(mapping, wbc); > + if (ret == 0) > + ret = err; > + spin_lock(&inode_lock); > + } > inode->i_state &= ~I_SYNC; > if (!(inode->i_state & (I_FREEING | I_CLEAR))) { > if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { > @@ -532,6 +555,12 @@ select_queue: > inode->i_state |= I_DIRTY_PAGES; > redirty_tail(inode); > } > + } else if (inode->i_state & I_UNSTABLE_PAGES) { > + /* > + * The inode has got yet more unstable pages to > + * commit. 
Requeue on b_more_io > + */ > + requeue_io(inode); > } else if (atomic_read(&inode->i_count)) { > /* > * The inode is clean, inuse > @@ -1050,7 +1079,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) > > spin_lock(&inode_lock); > if ((inode->i_state & flags) != flags) { > - const int was_dirty = inode->i_state & I_DIRTY; > + const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES); > > inode->i_state |= flags; > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c > index 6b89132..67e50ac 100644 > --- a/fs/nfs/file.c > +++ b/fs/nfs/file.c > @@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = { > .migratepage = nfs_migrate_page, > .launder_page = nfs_launder_page, > .error_remove_page = generic_error_remove_page, > + .commit_unstable_pages = nfs_commit_unstable_pages, > }; > > /* > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c > index faa0918..8341709 100644 > --- a/fs/nfs/inode.c > +++ b/fs/nfs/inode.c > @@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid) > return ino; > } > > -int nfs_write_inode(struct inode *inode, int sync) > -{ > - int ret; > - > - if (sync) { > - ret = filemap_fdatawait(inode->i_mapping); > - if (ret == 0) > - ret = nfs_commit_inode(inode, FLUSH_SYNC); > - } else > - ret = nfs_commit_inode(inode, 0); > - if (ret >= 0) > - return 0; > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > - return ret; > -} > - > void nfs_clear_inode(struct inode *inode) > { > /* > diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h > index 29e464d..7bb326f 100644 > --- a/fs/nfs/internal.h > +++ b/fs/nfs/internal.h > @@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); > extern struct workqueue_struct *nfsiod_workqueue; > extern struct inode *nfs_alloc_inode(struct super_block *sb); > extern void nfs_destroy_inode(struct inode *); > -extern int nfs_write_inode(struct inode *,int); > extern void nfs_clear_inode(struct inode *); > #ifdef CONFIG_NFS_V4 > extern void nfs4_clear_inode(struct inode *); > @@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server, > extern void nfs_read_prepare(struct rpc_task *task, void *calldata); > > /* write.c */ > +extern int nfs_commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc); > extern void nfs_write_prepare(struct rpc_task *task, void *calldata); > #ifdef CONFIG_MIGRATION > extern int nfs_migrate_page(struct address_space *, > diff --git a/fs/nfs/super.c b/fs/nfs/super.c > index ce907ef..805c1a0 100644 > --- a/fs/nfs/super.c > +++ b/fs/nfs/super.c > @@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = { > static const struct super_operations nfs_sops = { > .alloc_inode = nfs_alloc_inode, > .destroy_inode = nfs_destroy_inode, > - .write_inode = nfs_write_inode, > .statfs = nfs_statfs, > .clear_inode = nfs_clear_inode, > .umount_begin = nfs_umount_begin, > @@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = { > static const struct super_operations nfs4_sops = { > .alloc_inode = nfs_alloc_inode, > .destroy_inode = nfs_destroy_inode, > - .write_inode = nfs_write_inode, > .statfs = nfs_statfs, > .clear_inode = nfs4_clear_inode, > .umount_begin = nfs_umount_begin, > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > index d171696..910be28 100644 > --- a/fs/nfs/write.c > +++ b/fs/nfs/write.c > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > spin_unlock(&inode->i_lock); > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > 
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > + mark_inode_unstable_pages(inode); > } > > static int > @@ -1406,11 +1406,42 @@ int nfs_commit_inode(struct inode *inode, int how) > } > return res; > } > + > +int nfs_commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + struct inode *inode = mapping->host; > + int flags = FLUSH_SYNC; > + int ret; > + > + /* Don't commit yet if this is a non-blocking flush and there are > + * outstanding writes for this mapping. > + */ > + if (wbc->sync_mode != WB_SYNC_ALL && > + radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, > + NFS_PAGE_TAG_LOCKED)) { > + mark_inode_unstable_pages(inode); > + return 0; > + } > + if (wbc->nonblocking) > + flags = 0; > + ret = nfs_commit_inode(inode, flags); > + if (ret > 0) > + ret = 0; > + return ret; > +} > + > #else > static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) > { > return 0; > } > + > +int nfs_commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + return 0; > +} > #endif > > long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 9147ca8..ea0b7a3 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -602,6 +602,8 @@ struct address_space_operations { > int (*is_partially_uptodate) (struct page *, read_descriptor_t *, > unsigned long); > int (*error_remove_page)(struct address_space *, struct page *); > + int (*commit_unstable_pages)(struct address_space *, > + struct writeback_control *); > }; > > /* > @@ -1635,6 +1637,8 @@ struct super_operations { > #define I_CLEAR 64 > #define __I_SYNC 7 > #define I_SYNC (1 << __I_SYNC) > +#define __I_UNSTABLE_PAGES 9 > +#define I_UNSTABLE_PAGES (1 << __I_UNSTABLE_PAGES) > > #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) > > @@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) > __mark_inode_dirty(inode, I_DIRTY_SYNC); > } > > +static inline void mark_inode_unstable_pages(struct inode *inode) > +{ > + __mark_inode_dirty(inode, I_UNSTABLE_PAGES); > +} > + > /** > * inc_nlink - directly increment an inode's link count > * @inode: inode > ^ permalink raw reply [flat|nested] 96+ messages in thread
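The stall pattern in the trace -- nr_unstable climbing while nr_dirty and
nr_writeback get squeezed toward zero, then a long pause before the cycle
restarts -- can be reproduced with a toy model of the write pipeline. The
sketch below is userspace C with invented per-tick rates and an invented page
limit, not the kernel writeback code, and it commits instantaneously, whereas
in the real trace the server-side flush of ~100k unstable pages is what
stretches each stall out. Deferring COMMIT until no WRITEs are in flight
lets unstable pages eat the whole dirty budget; committing eagerly keeps the
pipeline moving:

    /*
     * Toy model of the dirty -> writeback -> unstable pipeline.
     * LIMIT stands in for the dirty threshold, RATE for the per-tick
     * throughput of each stage; both numbers are made up.
     */
    #include <stdio.h>

    #define LIMIT 100000
    #define RATE    3000

    static void run(int commit_while_writes_in_flight)
    {
            long dirty = 0, writeback = 0, unstable = 0;
            long stalls = 0, max_unstable = 0;
            long done, sent;
            int t;

            for (t = 0; t < 200; t++) {
                    /* the application dirties pages, throttled at LIMIT */
                    if (dirty + writeback + unstable + RATE <= LIMIT)
                            dirty += RATE;
                    else
                            stalls++;

                    /* WRITE rpcs complete: writeback -> unstable */
                    done = writeback < RATE ? writeback : RATE;
                    writeback -= done;
                    unstable += done;
                    if (unstable > max_unstable)
                            max_unstable = unstable;

                    /* the flusher issues more WRITEs: dirty -> writeback */
                    sent = dirty < RATE ? dirty : RATE;
                    dirty -= sent;
                    writeback += sent;

                    /* COMMIT policy under test */
                    if (commit_while_writes_in_flight || writeback == 0)
                            unstable = 0;
            }
            printf("commit while WRITEs in flight: %d -> "
                   "%ld stalled ticks, peak nr_unstable %ld\n",
                   commit_while_writes_in_flight, stalls, max_unstable);
    }

    int main(void)
    {
            run(0);    /* commit only once no WRITEs are in flight */
            run(1);    /* commit regardless of in-flight WRITEs */
            return 0;
    }

With these rates the deferred policy stalls periodically with a peak
nr_unstable near the full limit, while the eager policy never stalls and
keeps nr_unstable near zero -- the same qualitative shape as the vmmon trace.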
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 3:03 ` Wu Fengguang @ 2010-01-06 16:56 ` Trond Myklebust 2010-01-06 18:26 ` Trond Myklebust 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 16:56 UTC (permalink / raw) To: Wu Fengguang Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 11:03 +0800, Wu Fengguang wrote: > Trond, > > On Fri, Jan 01, 2010 at 03:13:48AM +0800, Trond Myklebust wrote: > > The above change improves on the existing code, but doesn't solve the > > problem that write_inode() isn't a good match for COMMIT. We need to > > wait for all the unstable WRITE rpc calls to return before we can know > > whether or not a COMMIT is needed (some commercial servers never require > > commit, even if the client requested an unstable write). That was the > > other reason for the change. > > Ah good to know that reason. However we cannot wait for ongoing WRITEs > for unlimited time or pages, otherwise nr_unstable goes up and squeeze > nr_dirty and nr_writeback to zero, and stall the cp process for a long > time, as demonstrated by the trace (more reasoning in previous email). OK. I think we need a mechanism to allow balance_dirty_pages() to communicate to the filesystem that it really is holding too many unstable pages. Currently, all we do is say that 'your total is too big', and then let the filesystem figure out what it needs to do. So how about if we modify your heuristic to do something like this? It applies on top of the previous patch. Cheers Trond --------------------------------------------------------------------------------------------------------- VM/NFS: The VM must tell the filesystem when to free reclaimable pages From: Trond Myklebust <Trond.Myklebust@netapp.com> balance_dirty_pages() should really tell the filesystem whether or not it has an excess of actual dirty pages, or whether it would be more useful to start freeing up the reclaimable pages. Assume that if the number of dirty pages associated with this backing-dev is less than 1/2 the number of reclaimable pages, then we should concentrate on freeing up the latter. 
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/write.c | 9 +++++++-- include/linux/backing-dev.h | 6 ++++++ mm/page-writeback.c | 7 +++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 910be28..36113e6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1420,8 +1420,10 @@ int nfs_commit_unstable_pages(struct address_space *mapping, if (wbc->sync_mode != WB_SYNC_ALL && radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, NFS_PAGE_TAG_LOCKED)) { - mark_inode_unstable_pages(inode); - return 0; + if (wbc->bdi == NULL) + goto out_nocommit; + if (wbc->bdi->dirty_exceeded != BDI_RECLAIMABLE_EXCEEDED) + goto out_nocommit; } if (wbc->nonblocking) flags = 0; @@ -1429,6 +1431,9 @@ int nfs_commit_unstable_pages(struct address_space *mapping, if (ret > 0) ret = 0; return ret; +out_nocommit: + mark_inode_unstable_pages(inode); + return 0; } #else diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fcbc26a..cd1645e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -94,6 +94,12 @@ struct backing_dev_info { #endif }; +enum bdi_dirty_exceeded_state { + BDI_NO_DIRTY_EXCESS = 0, + BDI_DIRTY_EXCEEDED, + BDI_RECLAIMABLE_EXCEEDED, +}; + int bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b19943..0133c8f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -524,8 +524,11 @@ static void balance_dirty_pages(struct address_space *mapping, (background_thresh + dirty_thresh) / 2) break; - if (!bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; + if (bdi_nr_writeback > bdi_nr_reclaimable / 2) { + if (bdi->dirty_exceeded != BDI_DIRTY_EXCEEDED) + bdi->dirty_exceeded = BDI_DIRTY_EXCEEDED; + } else if (bdi->dirty_exceeded != BDI_RECLAIMABLE_EXCEEDED) + bdi->dirty_exceeded = BDI_RECLAIMABLE_EXCEEDED; /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked ^ permalink raw reply related [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 16:56 ` Trond Myklebust @ 2010-01-06 18:26 ` Trond Myklebust 2010-01-06 18:37 ` Peter Zijlstra 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 18:26 UTC (permalink / raw) To: Wu Fengguang Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 11:56 -0500, Trond Myklebust wrote: > On Wed, 2010-01-06 at 11:03 +0800, Wu Fengguang wrote: > > Trond, > > > > On Fri, Jan 01, 2010 at 03:13:48AM +0800, Trond Myklebust wrote: > > > The above change improves on the existing code, but doesn't solve the > > > problem that write_inode() isn't a good match for COMMIT. We need to > > > wait for all the unstable WRITE rpc calls to return before we can know > > > whether or not a COMMIT is needed (some commercial servers never require > > > commit, even if the client requested an unstable write). That was the > > > other reason for the change. > > > > Ah good to know that reason. However we cannot wait for ongoing WRITEs > > for unlimited time or pages, otherwise nr_unstable goes up and squeeze > > nr_dirty and nr_writeback to zero, and stall the cp process for a long > > time, as demonstrated by the trace (more reasoning in previous email). > > OK. I think we need a mechanism to allow balance_dirty_pages() to > communicate to the filesystem that it really is holding too many > unstable pages. Currently, all we do is say that 'your total is too > big', and then let the filesystem figure out what it needs to do. > > So how about if we modify your heuristic to do something like this? It > applies on top of the previous patch. Gah! I misread the definitions of bdi_nr_reclaimable and bdi_nr_writeback. Please ignore the previous patch. OK. It looks as if the only key to finding out how many unstable writes we have is to use global_page_state(NR_UNSTABLE_NFS), so we can't specifically target our own backing-dev. Also, on reflection, I think it might be more helpful to use the writeback control to signal when we want to force a commit. That makes it a more general mechanism. There is one thing that we might still want to do here. Currently we do not update wbc->nr_to_write inside nfs_commit_unstable_pages(), which again means that we don't update 'pages_written' if the only effect of the writeback_inodes_wbc() was to commit pages. Perhaps it might not be a bad idea to do this (but that should be in a separate patch)... Cheers Trond ------------------------------------------------------------------------------------- VM/NFS: The VM must tell the filesystem when to free reclaimable pages From: Trond Myklebust <Trond.Myklebust@netapp.com> balance_dirty_pages() should really tell the filesystem whether or not it has an excess of actual dirty pages, or whether it would be more useful to start freeing up the unstable writes. Assume that if the number of unstable writes is more than 1/2 the number of reclaimable pages, then we should force NFS to free up the former. 
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/write.c | 2 +- include/linux/writeback.h | 5 +++++ mm/page-writeback.c | 9 ++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 910be28..ee3daf4 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1417,7 +1417,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping, /* Don't commit yet if this is a non-blocking flush and there are * outstanding writes for this mapping. */ - if (wbc->sync_mode != WB_SYNC_ALL && + if (!wbc->force_commit && wbc->sync_mode != WB_SYNC_ALL && radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, NFS_PAGE_TAG_LOCKED)) { mark_inode_unstable_pages(inode); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 76e8903..3fd5c3e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -62,6 +62,11 @@ struct writeback_control { * so we use a single control to update them */ unsigned no_nrwrite_index_update:1; + /* + * The following is used by balance_dirty_pages() to + * force NFS to commit unstable pages. + */ + unsigned force_commit:1; }; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b19943..ede5356 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -485,6 +485,7 @@ static void balance_dirty_pages(struct address_space *mapping, { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; + long nr_unstable_nfs; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; @@ -505,8 +506,9 @@ static void balance_dirty_pages(struct address_space *mapping, get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); + nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); + nr_unstable_nfs; nr_writeback = global_page_state(NR_WRITEBACK); bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); @@ -537,6 +539,11 @@ static void balance_dirty_pages(struct address_space *mapping, * up. */ if (bdi_nr_reclaimable > bdi_thresh) { + wbc.force_commit = 0; + /* Force NFS to also free up unstable writes. */ + if (nr_unstable_nfs > nr_reclaimable / 2) + wbc.force_commit = 1; + writeback_inodes_wbc(&wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, ^ permalink raw reply related [flat|nested] 96+ messages in thread
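Stripped of the surrounding writeback plumbing, the heuristic this patch adds
to balance_dirty_pages() is a simple ratio test: force a COMMIT once unstable
pages make up more than half of everything reclaimable. A standalone sketch
(illustrative names; the sample numbers are taken from the vmmon trace
earlier in the thread):

    #include <stdio.h>

    /* mirrors: nr_unstable_nfs > nr_reclaimable / 2 */
    static int want_force_commit(long nr_file_dirty, long nr_unstable_nfs)
    {
            long nr_reclaimable = nr_file_dirty + nr_unstable_nfs;

            return nr_unstable_nfs > nr_reclaimable / 2;
    }

    int main(void)
    {
            printf("%d\n", want_force_commit(71540, 146));   /* mostly dirty */
            printf("%d\n", want_force_commit(34509, 32640)); /* just under half */
            printf("%d\n", want_force_commit(0, 62976));     /* all unstable */
            return 0;
    }

Running it prints 0, 0, 1: only the third sample, where the reclaimable total
is entirely unstable pages, trips the force_commit flag.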
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 18:26 ` Trond Myklebust @ 2010-01-06 18:37 ` Peter Zijlstra 2010-01-06 18:52 ` Trond Myklebust 0 siblings, 1 reply; 96+ messages in thread From: Peter Zijlstra @ 2010-01-06 18:37 UTC (permalink / raw) To: Trond Myklebust Cc: Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 13:26 -0500, Trond Myklebust wrote: > OK. It looks as if the only key to finding out how many unstable writes > we have is to use global_page_state(NR_UNSTABLE_NFS), so we can't > specifically target our own backing-dev. Would be a simple matter of splitting BDI_UNSTABLE out from BDI_RECLAIMABLE, no? Something like --- fs/nfs/write.c | 6 +++--- include/linux/backing-dev.h | 3 ++- mm/backing-dev.c | 6 ++++-- mm/filemap.c | 2 +- mm/page-writeback.c | 16 ++++++++++------ mm/truncate.c | 2 +- 6 files changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d171696..7ba56f8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -440,7 +440,7 @@ nfs_mark_request_commit(struct nfs_page *req) NFS_PAGE_TAG_COMMIT); spin_unlock(&inode->i_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); + inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } @@ -451,7 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req) if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE); return 1; } return 0; @@ -1322,7 +1322,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) nfs_mark_request_commit(req); dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); + BDI_UNSTABLE); nfs_clear_page_tag_locked(req); } return -ENOMEM; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fcbc26a..1ef1e5c 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -36,7 +36,8 @@ enum bdi_state { typedef int (congested_fn)(void *, int); enum bdi_stat_item { - BDI_RECLAIMABLE, + BDI_DIRTY, + DBI_UNSTABLE, BDI_WRITEBACK, NR_BDI_STAT_ITEMS }; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0e8ca03..88f3655 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -88,7 +88,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, "BdiWriteback: %8lu kB\n" - "BdiReclaimable: %8lu kB\n" + "BdiDirty: %8lu kB\n" + "BdiUnstable: %8lu kB\n" "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" "BackgroundThresh: %8lu kB\n" @@ -102,7 +103,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "wb_list: %8u\n" "wb_cnt: %8u\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), - (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), + (unsigned long) K(bdi_stat(bdi, BDI_DIRTY)), + (unsigned long) K(bdi_stat(bdi, BDI_UNSTABLE)), K(bdi_thresh), K(dirty_thresh), K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, diff --git a/mm/filemap.c b/mm/filemap.c index 96ac6b0..458387d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -136,7 +136,7 
@@ void __remove_from_page_cache(struct page *page) */ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b19943..b1d31be 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -272,7 +272,8 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, else avail_dirty = 0; - avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + + avail_dirty += bdi_stat(bdi, BDI_DIRTY) + + bdi_stat(bdi, BDI_UNSTABLE) + bdi_stat(bdi, BDI_WRITEBACK); *pbdi_dirty = min(*pbdi_dirty, avail_dirty); @@ -509,7 +510,8 @@ static void balance_dirty_pages(struct address_space *mapping, global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + + bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -554,10 +556,12 @@ static void balance_dirty_pages(struct address_space *mapping, * deltas. */ if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) + + bdi_stat_sum(bdi, DBI_UNSTABLE); bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); } else if (bdi_nr_reclaimable) { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + + bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); } @@ -1079,7 +1083,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } @@ -1255,7 +1259,7 @@ int clear_page_dirty_for_io(struct page *page) if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); + BDI_DIRTY); return 1; } return 0; diff --git a/mm/truncate.c b/mm/truncate.c index 342deee..b0ce8fb 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -75,7 +75,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) if (mapping && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); + BDI_DIRTY); if (account_size) task_io_account_cancelled_write(account_size); } ^ permalink raw reply related [flat|nested] 96+ messages in thread
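The effect of the split is that a backing device now carries separate dirty
and unstable counters, with the old "reclaimable" figure derived as their
sum. A compact userspace model of the accounting (illustrative only -- the
kernel's bdi_stat counters are percpu-backed, which this ignores):

    #include <stdio.h>

    enum bdi_stat_item { BDI_DIRTY, BDI_UNSTABLE, BDI_WRITEBACK,
                         NR_BDI_STAT_ITEMS };

    struct bdi { long stat[NR_BDI_STAT_ITEMS]; };

    static void inc_stat(struct bdi *b, enum bdi_stat_item i) { b->stat[i]++; }
    static void dec_stat(struct bdi *b, enum bdi_stat_item i) { b->stat[i]--; }

    /* what used to be the single BDI_RECLAIMABLE counter */
    static long bdi_reclaimable(const struct bdi *b)
    {
            return b->stat[BDI_DIRTY] + b->stat[BDI_UNSTABLE];
    }

    int main(void)
    {
            struct bdi nfs = { { 0 } };

            inc_stat(&nfs, BDI_DIRTY);     /* page dirtied */
            dec_stat(&nfs, BDI_DIRTY);     /* picked up for writeback */
            inc_stat(&nfs, BDI_WRITEBACK); /* WRITE rpc in flight */
            dec_stat(&nfs, BDI_WRITEBACK); /* reply was unstable */
            inc_stat(&nfs, BDI_UNSTABLE);  /* now awaiting COMMIT */

            printf("dirty=%ld unstable=%ld reclaimable=%ld\n",
                   nfs.stat[BDI_DIRTY], nfs.stat[BDI_UNSTABLE],
                   bdi_reclaimable(&nfs));
            return 0;
    }

For a local-disk bdi the BDI_UNSTABLE counter simply never moves, which is
what makes the capability flag proposed in the next message attractive.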
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 18:37 ` Peter Zijlstra @ 2010-01-06 18:52 ` Trond Myklebust 2010-01-06 19:07 ` Peter Zijlstra 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 18:52 UTC (permalink / raw) To: Peter Zijlstra Cc: Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 19:37 +0100, Peter Zijlstra wrote: > On Wed, 2010-01-06 at 13:26 -0500, Trond Myklebust wrote: > > OK. It looks as if the only key to finding out how many unstable writes > > we have is to use global_page_state(NR_UNSTABLE_NFS), so we can't > > specifically target our own backing-dev. > > Would be a simple matter of splitting BDI_UNSTABLE out from > BDI_RECLAIMABLE, no? > > Something like OK. How about if we also add in a bdi->capabilities flag to tell that we might have BDI_UNSTABLE? That would allow us to avoid the potentially expensive extra calls to bdi_stat() and bdi_stat_sum() for the non-nfs case? Cheers Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 18:52 ` Trond Myklebust @ 2010-01-06 19:07 ` Peter Zijlstra 2010-01-06 19:21 ` Trond Myklebust 0 siblings, 1 reply; 96+ messages in thread From: Peter Zijlstra @ 2010-01-06 19:07 UTC (permalink / raw) To: Trond Myklebust Cc: Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 13:52 -0500, Trond Myklebust wrote: > On Wed, 2010-01-06 at 19:37 +0100, Peter Zijlstra wrote: > > On Wed, 2010-01-06 at 13:26 -0500, Trond Myklebust wrote: > > > OK. It looks as if the only key to finding out how many unstable writes > > > we have is to use global_page_state(NR_UNSTABLE_NFS), so we can't > > > specifically target our own backing-dev. > > > > Would be a simple matter of splitting BDI_UNSTABLE out from > > BDI_RECLAIMABLE, no? > > > > Something like > > OK. How about if we also add in a bdi->capabilities flag to tell that we > might have BDI_UNSTABLE? That would allow us to avoid the potentially > expensive extra calls to bdi_stat() and bdi_stat_sum() for the non-nfs > case? The bdi_stat_sum() in the error limit is basically the only such expensive op, but I suspect we might hit that more than enough. So sure that sounds like a plan. ^ permalink raw reply [flat|nested] 96+ messages in thread
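For readers wondering why bdi_stat_sum() is the expensive call being singled
out: the per-bdi stats are percpu counters, so an update touches only a
cpu-local delta, a plain read returns a slightly stale folded total, and only
the exact sum has to walk every cpu. A simplified single-threaded model of
those semantics (the real percpu_counter is more involved; NR_CPUS and BATCH
here are made up):

    #include <stdio.h>

    #define NR_CPUS 8
    #define BATCH   32

    struct pc {
            long count;           /* folded global value */
            long delta[NR_CPUS];  /* per-cpu contributions */
    };

    static void pc_add(struct pc *c, int cpu, long n)
    {
            c->delta[cpu] += n;
            if (c->delta[cpu] >= BATCH || c->delta[cpu] <= -BATCH) {
                    c->count += c->delta[cpu];  /* fold only occasionally */
                    c->delta[cpu] = 0;
            }
    }

    static long pc_read(const struct pc *c)     /* cheap, approximate */
    {
            return c->count;
    }

    static long pc_sum(const struct pc *c)      /* expensive, exact */
    {
            long v = c->count;
            int cpu;

            for (cpu = 0; cpu < NR_CPUS; cpu++)
                    v += c->delta[cpu];
            return v;
    }

    int main(void)
    {
            struct pc c = { 0, { 0 } };
            int i;

            for (i = 0; i < 1000; i++)
                    pc_add(&c, i % NR_CPUS, 1);

            printf("approx=%ld exact=%ld\n", pc_read(&c), pc_sum(&c));
            return 0;
    }

This prints approx=768 exact=1000: the cheap read can be off by up to
NR_CPUS * BATCH, which is exactly why balance_dirty_pages() falls back to the
exact sum only near the error limit -- and why skipping that sum for devices
that never have unstable pages is worthwhile.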
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 19:07 ` Peter Zijlstra @ 2010-01-06 19:21 ` Trond Myklebust 2010-01-06 19:53 ` Trond Myklebust 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 19:21 UTC (permalink / raw) To: Peter Zijlstra Cc: Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 20:07 +0100, Peter Zijlstra wrote: > On Wed, 2010-01-06 at 13:52 -0500, Trond Myklebust wrote: > > On Wed, 2010-01-06 at 19:37 +0100, Peter Zijlstra wrote: > > > On Wed, 2010-01-06 at 13:26 -0500, Trond Myklebust wrote: > > > > OK. It looks as if the only key to finding out how many unstable writes > > > > we have is to use global_page_state(NR_UNSTABLE_NFS), so we can't > > > > specifically target our own backing-dev. > > > > > > Would be a simple matter of splitting BDI_UNSTABLE out from > > > BDI_RECLAIMABLE, no? > > > > > > Something like > > > > OK. How about if we also add in a bdi->capabilities flag to tell that we > > might have BDI_UNSTABLE? That would allow us to avoid the potentially > > expensive extra calls to bdi_stat() and bdi_stat_sum() for the non-nfs > > case? > > The bdi_stat_sum() in the error limit is basically the only such > expensive op, but I suspect we might hit that more than enough. So sure > that sounds like a plan. > This should apply on top of your patch.... Cheers Trond ------------------------------------------------------------------------------------------------ VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices From: Trond Myklebust <Trond.Myklebust@netapp.com> Speeds up the accounting in balance_dirty_pages() for non-nfs devices. 
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/client.c | 1 + include/linux/backing-dev.h | 6 ++++++ mm/page-writeback.c | 16 +++++++++++----- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ee77713..d0b060a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -890,6 +890,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->backing_dev_info.name = "nfs"; server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; + server->backing_dev_info.capabilities |= BDI_CAP_ACCT_UNSTABLE; if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 42c3e2a..8b45166 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -232,6 +232,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_EXEC_MAP 0x00000040 #define BDI_CAP_NO_ACCT_WB 0x00000080 #define BDI_CAP_SWAP_BACKED 0x00000100 +#define BDI_CAP_ACCT_UNSTABLE 0x00000200 #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) @@ -311,6 +312,11 @@ static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) return bdi == &default_backing_dev_info; } +static inline bool bdi_cap_account_unstable(struct backing_dev_info *bdi) +{ + return bdi->capabilities & BDI_CAP_ACCT_UNSTABLE; +} + static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index aa26b0f..d90a0db 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -273,8 +273,9 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, avail_dirty = 0; avail_dirty += bdi_stat(bdi, BDI_DIRTY) + - bdi_stat(bdi, BDI_UNSTABLE) + bdi_stat(bdi, BDI_WRITEBACK); + if (bdi_cap_account_unstable(bdi)) + avail_dirty += bdi_stat(bdi, BDI_UNSTABLE); *pbdi_dirty = min(*pbdi_dirty, avail_dirty); } @@ -512,8 +513,9 @@ static void balance_dirty_pages(struct address_space *mapping, nr_unstable_nfs; nr_writeback = global_page_state(NR_WRITEBACK); - bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + - bdi_stat(bdi, BDI_UNSTABLE); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); + if (bdi_cap_account_unstable(bdi)) + bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -563,11 +565,15 @@ static void balance_dirty_pages(struct address_space *mapping, * deltas. */ if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) + + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY); + if (bdi_cap_account_unstable(bdi)) + bdi_nr_reclaimable += bdi_stat_sum(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); } else if (bdi_nr_reclaimable) { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); + if (bdi_cap_account_unstable(bdi)) + bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); } ^ permalink raw reply related [flat|nested] 96+ messages in thread
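The capability bit turns the extra accounting into an opt-in: only a bdi that
declared BDI_CAP_ACCT_UNSTABLE (here, NFS) pays for the unstable-pages reads
and sums. A trivial userspace sketch of the predicate and how it gates the
expensive path (illustrative, not the kernel structures):

    #include <stdio.h>

    #define BDI_CAP_ACCT_UNSTABLE 0x00000200

    struct bdi { unsigned int capabilities; };

    static int bdi_cap_account_unstable(const struct bdi *bdi)
    {
            return bdi->capabilities & BDI_CAP_ACCT_UNSTABLE;
    }

    int main(void)
    {
            struct bdi disk = { 0 };
            struct bdi nfs  = { BDI_CAP_ACCT_UNSTABLE };

            /* the per-cpu sums would be skipped for the local disk */
            printf("disk accounts unstable: %d\n",
                   !!bdi_cap_account_unstable(&disk));
            printf("nfs accounts unstable:  %d\n",
                   !!bdi_cap_account_unstable(&nfs));
            return 0;
    }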
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 19:21 ` Trond Myklebust @ 2010-01-06 19:53 ` Trond Myklebust 2010-01-06 20:09 ` Jan Kara 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 19:53 UTC (permalink / raw) To: Peter Zijlstra Cc: Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 14:21 -0500, Trond Myklebust wrote: > On Wed, 2010-01-06 at 20:07 +0100, Peter Zijlstra wrote: > > On Wed, 2010-01-06 at 13:52 -0500, Trond Myklebust wrote: > > > On Wed, 2010-01-06 at 19:37 +0100, Peter Zijlstra wrote: > > > > On Wed, 2010-01-06 at 13:26 -0500, Trond Myklebust wrote: > > > > > OK. It looks as if the only key to finding out how many unstable writes > > > > > we have is to use global_page_state(NR_UNSTABLE_NFS), so we can't > > > > > specifically target our own backing-dev. > > > > > > > > Would be a simple matter of splitting BDI_UNSTABLE out from > > > > BDI_RECLAIMABLE, no? > > > > > > > > Something like > > > > > > OK. How about if we also add in a bdi->capabilities flag to tell that we > > > might have BDI_UNSTABLE? That would allow us to avoid the potentially > > > expensive extra calls to bdi_stat() and bdi_stat_sum() for the non-nfs > > > case? > > > > The bdi_stat_sum() in the error limit is basically the only such > > expensive op, but I suspect we might hit that more than enough. So sure > > that sounds like a plan. > > > > This should apply on top of your patch.... ...and finally, this should convert the previous NFS patch to use the per-bdi accounting. Cheers Trond -------------------------------------------------------------------------------------- VM: Use per-bdi unstable accounting to improve use of wbc->force_commit From: Trond Myklebust <Trond.Myklebust@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- mm/page-writeback.c | 13 +++++++------ 1 files changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d90a0db..c537543 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -487,7 +487,6 @@ static void balance_dirty_pages(struct address_space *mapping, { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; - long nr_unstable_nfs; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; @@ -504,18 +503,20 @@ static void balance_dirty_pages(struct address_space *mapping, .nr_to_write = write_chunk, .range_cyclic = 1, }; + long bdi_nr_unstable = 0; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); - nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - nr_unstable_nfs; + global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); - if (bdi_cap_account_unstable(bdi)) - bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); + if (bdi_cap_account_unstable(bdi)) { + bdi_nr_unstable = bdi_stat(bdi, BDI_UNSTABLE); + bdi_nr_reclaimable += bdi_nr_unstable; + } bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -545,7 +546,7 @@ static void balance_dirty_pages(struct address_space *mapping, if (bdi_nr_reclaimable > bdi_thresh) { wbc.force_commit = 0; /* Force NFS to also free up unstable writes. 
*/ - if (nr_unstable_nfs > nr_reclaimable / 2) + if (bdi_nr_unstable > bdi_nr_reclaimable / 2) wbc.force_commit = 1; writeback_inodes_wbc(&wbc); ^ permalink raw reply related [flat|nested] 96+ messages in thread
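To see the per-bdi test in action, take two samples from the vmmon trace
earlier in the thread and treat the client's single NFS mount as the only
active bdi, so the per-bdi and global figures coincide. Early in the run,
with nr_dirty = 71540 and nr_unstable = 146, bdi_nr_reclaimable is 71686, and
146 < 71686 / 2 = 35843, so force_commit stays clear and the COMMIT is
deferred. In a stalled sample, with nr_dirty = 0 and nr_unstable = 62976,
bdi_nr_reclaimable is 62976, and 62976 > 31488, so force_commit is set and
the COMMIT goes out even though WRITE rpcs are still in flight.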
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 19:53 ` Trond Myklebust @ 2010-01-06 20:09 ` Jan Kara 2010-01-06 20:51 ` [PATCH 0/6] " Trond Myklebust 0 siblings, 1 reply; 96+ messages in thread From: Jan Kara @ 2010-01-06 20:09 UTC (permalink / raw) To: Trond Myklebust Cc: Peter Zijlstra, Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed 06-01-10 14:53:14, Trond Myklebust wrote: > ...and finally, this should convert the previous NFS patch to use the > per-bdi accounting. > > Cheers > Trond > > -------------------------------------------------------------------------------------- > VM: Use per-bdi unstable accounting to improve use of wbc->force_commit > > From: Trond Myklebust <Trond.Myklebust@netapp.com> > > Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> I like this. You can add Acked-by: Jan Kara <jack@suse.cz> to this patch as well as to the previous patches adding unstable pages accounting. Honza > --- > > mm/page-writeback.c | 13 +++++++------ > 1 files changed, 7 insertions(+), 6 deletions(-) > > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > index d90a0db..c537543 100644 > --- a/mm/page-writeback.c > +++ b/mm/page-writeback.c > @@ -487,7 +487,6 @@ static void balance_dirty_pages(struct address_space *mapping, > { > long nr_reclaimable, bdi_nr_reclaimable; > long nr_writeback, bdi_nr_writeback; > - long nr_unstable_nfs; > unsigned long background_thresh; > unsigned long dirty_thresh; > unsigned long bdi_thresh; > @@ -504,18 +503,20 @@ static void balance_dirty_pages(struct address_space *mapping, > .nr_to_write = write_chunk, > .range_cyclic = 1, > }; > + long bdi_nr_unstable = 0; > > get_dirty_limits(&background_thresh, &dirty_thresh, > &bdi_thresh, bdi); > > - nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); > nr_reclaimable = global_page_state(NR_FILE_DIRTY) + > - nr_unstable_nfs; > + global_page_state(NR_UNSTABLE_NFS); > nr_writeback = global_page_state(NR_WRITEBACK); > > bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); > - if (bdi_cap_account_unstable(bdi)) > - bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); > + if (bdi_cap_account_unstable(bdi)) { > + bdi_nr_unstable = bdi_stat(bdi, BDI_UNSTABLE); > + bdi_nr_reclaimable += bdi_nr_unstable; > + } > bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); > > if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) > @@ -545,7 +546,7 @@ static void balance_dirty_pages(struct address_space *mapping, > if (bdi_nr_reclaimable > bdi_thresh) { > wbc.force_commit = 0; > /* Force NFS to also free up unstable writes. */ > - if (nr_unstable_nfs > nr_reclaimable / 2) > + if (bdi_nr_unstable > bdi_nr_reclaimable / 2) > wbc.force_commit = 1; > > writeback_inodes_wbc(&wbc); > -- Jan Kara <jack@suse.cz> SUSE Labs, CR ^ permalink raw reply [flat|nested] 96+ messages in thread
* [PATCH 0/6] Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 20:09 ` Jan Kara @ 2010-01-06 20:51 ` Trond Myklebust [not found] ` <20100106205110.22547.85345.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 2010-01-07 8:16 ` Peter Zijlstra 0 siblings, 2 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org OK, here is the full series so far. I'm resending because I had to fix up a couple of BDI_UNSTABLE typos in Peter's patch... Cheers Trond --- Peter Zijlstra (1): VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE Trond Myklebust (5): NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set VM: Use per-bdi unstable accounting to improve use of wbc->force_commit VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices VM/NFS: The VM must tell the filesystem when to free reclaimable pages VFS: Ensure that writeback_single_inode() commits unstable writes fs/fs-writeback.c | 31 ++++++++++++++++++++++++++++++- fs/nfs/client.c | 1 + fs/nfs/file.c | 1 + fs/nfs/inode.c | 16 ---------------- fs/nfs/internal.h | 3 ++- fs/nfs/super.c | 2 -- fs/nfs/write.c | 39 +++++++++++++++++++++++++++++++++++---- include/linux/backing-dev.h | 9 ++++++++- include/linux/fs.h | 9 +++++++++ include/linux/writeback.h | 5 +++++ mm/backing-dev.c | 6 ++++-- mm/filemap.c | 2 +- mm/page-writeback.c | 30 ++++++++++++++++++++++++------ mm/truncate.c | 2 +- 14 files changed, 121 insertions(+), 35 deletions(-) ^ permalink raw reply [flat|nested] 96+ messages in thread
[parent not found: <20100106205110.22547.85345.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>]
* [PATCH 5/6] VM: Use per-bdi unstable accounting to improve use of wbc->force_commit [not found] ` <20100106205110.22547.85345.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> @ 2010-01-06 20:51 ` Trond Myklebust [not found] ` <20100106205110.22547.32584.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 2010-01-06 20:51 ` [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust ` (5 subsequent siblings) 6 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Acked-by: Jan Kara <jack@suse.cz> --- mm/page-writeback.c | 13 +++++++------ 1 files changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d90a0db..c537543 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -487,7 +487,6 @@ static void balance_dirty_pages(struct address_space *mapping, { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; - long nr_unstable_nfs; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; @@ -504,18 +503,20 @@ static void balance_dirty_pages(struct address_space *mapping, .nr_to_write = write_chunk, .range_cyclic = 1, }; + long bdi_nr_unstable = 0; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); - nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - nr_unstable_nfs; + global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); - if (bdi_cap_account_unstable(bdi)) - bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); + if (bdi_cap_account_unstable(bdi)) { + bdi_nr_unstable = bdi_stat(bdi, BDI_UNSTABLE); + bdi_nr_reclaimable += bdi_nr_unstable; + } bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -545,7 +546,7 @@ static void balance_dirty_pages(struct address_space *mapping, if (bdi_nr_reclaimable > bdi_thresh) { wbc.force_commit = 0; /* Force NFS to also free up unstable writes. */ - if (nr_unstable_nfs > nr_reclaimable / 2) + if (bdi_nr_unstable > bdi_nr_reclaimable / 2) wbc.force_commit = 1; writeback_inodes_wbc(&wbc); ^ permalink raw reply related [flat|nested] 96+ messages in thread
[parent not found: <20100106205110.22547.32584.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>]
* Re: [PATCH 5/6] VM: Use per-bdi unstable accounting to improve use of wbc->force_commit [not found] ` <20100106205110.22547.32584.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> @ 2010-01-07 2:34 ` Wu Fengguang 0 siblings, 0 replies; 96+ messages in thread From: Wu Fengguang @ 2010-01-07 2:34 UTC (permalink / raw) To: Trond Myklebust Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org Trond, I'm with Jan that this patch can be folded :) Thanks, Fengguang On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote: > Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> > Acked-by: Jan Kara <jack@suse.cz> > --- > > mm/page-writeback.c | 13 +++++++------ > 1 files changed, 7 insertions(+), 6 deletions(-) > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > index d90a0db..c537543 100644 > --- a/mm/page-writeback.c > +++ b/mm/page-writeback.c > @@ -487,7 +487,6 @@ static void balance_dirty_pages(struct address_space *mapping, > { > long nr_reclaimable, bdi_nr_reclaimable; > long nr_writeback, bdi_nr_writeback; > - long nr_unstable_nfs; > unsigned long background_thresh; > unsigned long dirty_thresh; > unsigned long bdi_thresh; > @@ -504,18 +503,20 @@ static void balance_dirty_pages(struct address_space *mapping, > .nr_to_write = write_chunk, > .range_cyclic = 1, > }; > + long bdi_nr_unstable = 0; > > get_dirty_limits(&background_thresh, &dirty_thresh, > &bdi_thresh, bdi); > > - nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); > nr_reclaimable = global_page_state(NR_FILE_DIRTY) + > - nr_unstable_nfs; > + global_page_state(NR_UNSTABLE_NFS); > nr_writeback = global_page_state(NR_WRITEBACK); > > bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); > - if (bdi_cap_account_unstable(bdi)) > - bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); > + if (bdi_cap_account_unstable(bdi)) { > + bdi_nr_unstable = bdi_stat(bdi, BDI_UNSTABLE); > + bdi_nr_reclaimable += bdi_nr_unstable; > + } > bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); > > if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) > @@ -545,7 +546,7 @@ static void balance_dirty_pages(struct address_space *mapping, > if (bdi_nr_reclaimable > bdi_thresh) { > wbc.force_commit = 0; > /* Force NFS to also free up unstable writes. */ > - if (nr_unstable_nfs > nr_reclaimable / 2) > + if (bdi_nr_unstable > bdi_nr_reclaimable / 2) > wbc.force_commit = 1; > > writeback_inodes_wbc(&wbc); > ^ permalink raw reply [flat|nested] 96+ messages in thread
* [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
[not found] ` <20100106205110.22547.85345.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-06 20:51 ` [PATCH 5/6] VM: Use per-bdi unstable accounting to improve use of wbc->force_commit Trond Myklebust
@ 2010-01-06 20:51 ` Trond Myklebust
2010-01-07 2:29 ` Wu Fengguang
2010-01-06 20:51 ` [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes Trond Myklebust
` (4 subsequent siblings)
6 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

balance_dirty_pages() should really tell the filesystem whether or not it
has an excess of actual dirty pages, or whether it would be more useful to
start freeing up the unstable writes.

Assume that if the number of unstable writes is more than 1/2 the number of
reclaimable pages, then we should force NFS to free up the former.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
---

 fs/nfs/write.c            |    2 +-
 include/linux/writeback.h |    5 +++++
 mm/page-writeback.c       |    9 ++++++++-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 910be28..ee3daf4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1417,7 +1417,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping,
     /* Don't commit yet if this is a non-blocking flush and there are
      * outstanding writes for this mapping.
      */
-    if (wbc->sync_mode != WB_SYNC_ALL &&
+    if (!wbc->force_commit && wbc->sync_mode != WB_SYNC_ALL &&
         radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
                 NFS_PAGE_TAG_LOCKED)) {
         mark_inode_unstable_pages(inode);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 76e8903..3fd5c3e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -62,6 +62,11 @@ struct writeback_control {
      * so we use a single control to update them
      */
     unsigned no_nrwrite_index_update:1;
+    /*
+     * The following is used by balance_dirty_pages() to
+     * force NFS to commit unstable pages.
+     */
+    unsigned force_commit:1;
 };

 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943..ede5356 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -485,6 +485,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 {
     long nr_reclaimable, bdi_nr_reclaimable;
     long nr_writeback, bdi_nr_writeback;
+    long nr_unstable_nfs;
     unsigned long background_thresh;
     unsigned long dirty_thresh;
     unsigned long bdi_thresh;
@@ -505,8 +506,9 @@ static void balance_dirty_pages(struct address_space *mapping,
         get_dirty_limits(&background_thresh, &dirty_thresh,
                 &bdi_thresh, bdi);

+        nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS);
         nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-                global_page_state(NR_UNSTABLE_NFS);
+                nr_unstable_nfs;
         nr_writeback = global_page_state(NR_WRITEBACK);

         bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
@@ -537,6 +539,11 @@ static void balance_dirty_pages(struct address_space *mapping,
          * up.
          */
         if (bdi_nr_reclaimable > bdi_thresh) {
+            wbc.force_commit = 0;
+            /* Force NFS to also free up unstable writes. */
+            if (nr_unstable_nfs > nr_reclaimable / 2)
+                wbc.force_commit = 1;
+
             writeback_inodes_wbc(&wbc);
             pages_written += write_chunk - wbc.nr_to_write;
             get_dirty_limits(&background_thresh, &dirty_thresh,
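To make the 1/2 threshold concrete, a worked example with made-up
numbers: if global_page_state() reports 400 dirty pages and 700 unstable
pages, then nr_reclaimable = 400 + 700 = 1100 and nr_unstable_nfs = 700 >
1100 / 2 = 550, so force_commit is set and the flusher issues COMMITs as
well as WRITEs. (Patch 5/6 above redoes this same comparison with the
per-bdi counters instead of the global ones.)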
* Re: [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
2010-01-06 20:51 ` [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust
@ 2010-01-07 2:29 ` Wu Fengguang
2010-01-07 4:49 ` Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2010-01-07 2:29 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote:
> balance_dirty_pages() should really tell the filesystem whether or not it
> has an excess of actual dirty pages, or whether it would be more useful to
> start freeing up the unstable writes.
>
> Assume that if the number of unstable writes is more than 1/2 the number of
> reclaimable pages, then we should force NFS to free up the former.
>
> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
> Acked-by: Jan Kara <jack@suse.cz>
...
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index 76e8903..3fd5c3e 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -62,6 +62,11 @@ struct writeback_control {
>       * so we use a single control to update them
>       */
>      unsigned no_nrwrite_index_update:1;
> +    /*
> +     * The following is used by balance_dirty_pages() to
> +     * force NFS to commit unstable pages.
> +     */

In fact it may be too late to force the commit at balance_dirty_pages()
time: a commit takes time, and the application has already been blocked.

If it is not convenient for now, I can make the change -- I'll remove the
writeback_inodes_wbc() call altogether from balance_dirty_pages().

> +    unsigned force_commit:1;
>  };

nfs_commit may be a more newbie-friendly name?

Thanks,
Fengguang
...
* Re: [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
2010-01-07 2:29 ` Wu Fengguang
@ 2010-01-07 4:49 ` Trond Myklebust
2010-01-07 5:03 ` Wu Fengguang
0 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 4:49 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, 2010-01-07 at 10:29 +0800, Wu Fengguang wrote:
> On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote:
...
> In fact it may be too late to force the commit at balance_dirty_pages()
> time: a commit takes time, and the application has already been blocked.
>
> If it is not convenient for now, I can make the change -- I'll remove the
> writeback_inodes_wbc() call altogether from balance_dirty_pages().

You could always set the 'for_background' flag instead.

> > +    unsigned force_commit:1;
> >  };
>
> nfs_commit may be a more newbie-friendly name?

We could possibly rename it to something like 'force_nfs_commit', but
the comment above the declaration should really be sufficient.

Trond
* Re: [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
2010-01-07 4:49 ` Trond Myklebust
@ 2010-01-07 5:03 ` Wu Fengguang
2010-01-07 5:30 ` Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2010-01-07 5:03 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 12:49:23PM +0800, Trond Myklebust wrote:
> On Thu, 2010-01-07 at 10:29 +0800, Wu Fengguang wrote:
...
> > If it is not convenient for now, I can make the change -- I'll remove the
> > writeback_inodes_wbc() call altogether from balance_dirty_pages().
>
> You could always set the 'for_background' flag instead.

Please don't -- that would be misusing ->for_background. Anyway, it's not
a big problem. I'll set the force_nfs_commit flag in background writeback.

> > nfs_commit may be a more newbie-friendly name?
>
> We could possibly rename it to something like 'force_nfs_commit', but
> the comment above the declaration should really be sufficient.

"commit" could also be misread as "commit a transaction"? Anyway, I think
adding an "nfs" limits the scope to NFS and thus makes the code somewhat
easier to read. Just a personal feeling.

Thanks,
Fengguang
* Re: [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
2010-01-07 5:03 ` Wu Fengguang
@ 2010-01-07 5:30 ` Trond Myklebust
2010-01-07 14:37 ` Wu Fengguang
0 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 5:30 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, 2010-01-07 at 13:03 +0800, Wu Fengguang wrote:

> "commit" could also be misread as "commit a transaction"? Anyway, I think
> adding an "nfs" limits the scope to NFS and thus makes the code somewhat
> easier to read. Just a personal feeling.

How about 'force_commit_unstable' instead? That ties it up to the
unstable writes rather than NFS.

Cheers
Trond
* Re: [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
2010-01-07 5:30 ` Trond Myklebust
@ 2010-01-07 14:37 ` Wu Fengguang
2010-01-07 14:41 ` [PATCH 0/5] Re: [PATCH] improve the performance of large sequential write NFS workloads Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2010-01-07 14:37 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 01:30:43PM +0800, Trond Myklebust wrote:
> On Thu, 2010-01-07 at 13:03 +0800, Wu Fengguang wrote:
...
> How about 'force_commit_unstable' instead? That ties it up to the
> unstable writes rather than NFS.

That would be good, thanks!

Fengguang
* [PATCH 0/5] Re: [PATCH] improve the performance of large sequential write NFS workloads
2010-01-07 14:37 ` Wu Fengguang
@ 2010-01-07 14:41 ` Trond Myklebust
[not found] ` <20100107144137.17158.53673.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
0 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 14:41 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

Take 3. Incorporate Fengguang's suggestion that we change the name of
the force_commit flag.

Cheers
Trond

---

Peter Zijlstra (1):
      VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE

Trond Myklebust (4):
      NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
      VM/NFS: The VM must tell the filesystem when to free reclaimable pages
      VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
      VFS: Ensure that writeback_single_inode() commits unstable writes

 fs/fs-writeback.c           |   31 ++++++++++++++++++++++++++++++-
 fs/nfs/client.c             |    1 +
 fs/nfs/file.c               |    1 +
 fs/nfs/inode.c              |   16 ----------------
 fs/nfs/internal.h           |    3 ++-
 fs/nfs/super.c              |    2 --
 fs/nfs/write.c              |   39 +++++++++++++++++++++++++++++++++++----
 include/linux/backing-dev.h |    9 ++++++++-
 include/linux/fs.h          |    9 +++++++++
 include/linux/writeback.h   |    5 +++++
 mm/backing-dev.c            |    6 ++++--
 mm/filemap.c                |    2 +-
 mm/page-writeback.c         |   30 ++++++++++++++++++++++++------
 mm/truncate.c               |    2 +-

 14 files changed, 121 insertions(+), 35 deletions(-)
* [PATCH 5/5] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
[not found] ` <20100107144137.17158.53673.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2010-01-07 14:41 ` Trond Myklebust
2010-01-07 14:41 ` [PATCH 3/5] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices Trond Myklebust
` (3 subsequent siblings)
4 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 14:41 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
---

 fs/nfs/write.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 43e47b4..7f1f2aa 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1423,7 +1423,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping,
         mark_inode_unstable_pages(inode);
         return 0;
     }
-    if (wbc->nonblocking)
+    if (wbc->nonblocking || wbc->for_background)
         flags = 0;
     ret = nfs_commit_inode(inode, flags);
     if (ret > 0)
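For readers new to the COMMIT path, the resulting decision is simple. A
sketch of the selection logic after this patch (assuming, as in the
hunk above, that FLUSH_SYNC is the only flag the caller cares about):

    /*
     * flags selection in nfs_commit_unstable_pages() after 5/5:
     *
     *   WB_SYNC_ALL / ordinary flush  -> FLUSH_SYNC (wait for COMMIT)
     *   wbc->nonblocking              -> 0 (asynchronous COMMIT RPC)
     *   wbc->for_background           -> 0 (asynchronous COMMIT RPC)
     */
    int flags = FLUSH_SYNC;

    if (wbc->nonblocking || wbc->for_background)
        flags = 0;    /* fire the COMMIT without waiting for it */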
* [PATCH 3/5] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
[not found] ` <20100107144137.17158.53673.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-07 14:41 ` [PATCH 5/5] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set Trond Myklebust
@ 2010-01-07 14:41 ` Trond Myklebust
2010-01-07 14:41 ` [PATCH 1/5] VFS: Ensure that writeback_single_inode() commits unstable writes Trond Myklebust
` (2 subsequent siblings)
4 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 14:41 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

Speeds up the accounting in balance_dirty_pages() for non-nfs devices.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
---

 fs/nfs/client.c             |    1 +
 include/linux/backing-dev.h |    6 ++++++
 mm/page-writeback.c         |   16 +++++++++++-----
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ee77713..d0b060a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -890,6 +890,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
     server->backing_dev_info.name = "nfs";
     server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
+    server->backing_dev_info.capabilities |= BDI_CAP_ACCT_UNSTABLE;

     if (server->wsize > max_rpc_payload)
         server->wsize = max_rpc_payload;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 42c3e2a..8b45166 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -232,6 +232,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 #define BDI_CAP_EXEC_MAP	0x00000040
 #define BDI_CAP_NO_ACCT_WB	0x00000080
 #define BDI_CAP_SWAP_BACKED	0x00000100
+#define BDI_CAP_ACCT_UNSTABLE	0x00000200

 #define BDI_CAP_VMFLAGS \
     (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
@@ -311,6 +312,11 @@ static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
     return bdi == &default_backing_dev_info;
 }

+static inline bool bdi_cap_account_unstable(struct backing_dev_info *bdi)
+{
+    return bdi->capabilities & BDI_CAP_ACCT_UNSTABLE;
+}
+
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
     return bdi_cap_writeback_dirty(mapping->backing_dev_info);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 23d3fc6..c06739b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -273,8 +273,9 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
         avail_dirty = 0;

     avail_dirty += bdi_stat(bdi, BDI_DIRTY) +
-            bdi_stat(bdi, BDI_UNSTABLE) +
             bdi_stat(bdi, BDI_WRITEBACK);
+    if (bdi_cap_account_unstable(bdi))
+        avail_dirty += bdi_stat(bdi, BDI_UNSTABLE);

     *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
 }
@@ -510,8 +511,9 @@ static void balance_dirty_pages(struct address_space *mapping,
                 global_page_state(NR_UNSTABLE_NFS);
         nr_writeback = global_page_state(NR_WRITEBACK);

-        bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
-                bdi_stat(bdi, BDI_UNSTABLE);
+        bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
+        if (bdi_cap_account_unstable(bdi))
+            bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE);
         bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);

         if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
@@ -556,11 +558,15 @@ static void balance_dirty_pages(struct address_space *mapping,
          * deltas.
          */
         if (bdi_thresh < 2*bdi_stat_error(bdi)) {
-            bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) +
-                    bdi_stat_sum(bdi, BDI_UNSTABLE);
+            bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY);
+            if (bdi_cap_account_unstable(bdi))
+                bdi_nr_reclaimable += bdi_stat_sum(bdi, BDI_UNSTABLE);
             bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
         } else if (bdi_nr_reclaimable) {
-            bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
-                    bdi_stat(bdi, BDI_UNSTABLE);
+            bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
+            if (bdi_cap_account_unstable(bdi))
+                bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE);
             bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
         }
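The capability bit makes the opt-in explicit. A hedged sketch of the
pattern a filesystem with unstable pages would follow (mirroring the NFS
hunk above; "foo_server" is a made-up name for illustration):

    /* At mount/server-setup time: declare that this bdi maintains
     * BDI_UNSTABLE statistics, so the VM will include them. */
    foo_server->backing_dev_info.capabilities |= BDI_CAP_ACCT_UNSTABLE;

    /* In VM accounting paths: only read the per-bdi unstable counter
     * when the capability is set, so ordinary block devices skip the
     * extra percpu-counter reads entirely. */
    if (bdi_cap_account_unstable(bdi))
        bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE);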
* [PATCH 1/5] VFS: Ensure that writeback_single_inode() commits unstable writes
[not found] ` <20100107144137.17158.53673.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-07 14:41 ` [PATCH 5/5] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set Trond Myklebust
2010-01-07 14:41 ` [PATCH 3/5] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices Trond Myklebust
@ 2010-01-07 14:41 ` Trond Myklebust
2010-01-07 14:41 ` [PATCH 2/5] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE Trond Myklebust
2010-01-07 14:41 ` [PATCH 4/5] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust
4 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 14:41 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

If the call to do_writepages() succeeded in starting writeback, we do not
know whether or not we will need to COMMIT any unstable writes until after
the write RPC calls are finished. Currently, we assume that at least one
write RPC call will have finished, and set I_DIRTY_DATASYNC by the time
do_writepages is done, so that write_inode() is triggered.

In order to ensure reliable operation (i.e. ensure that a single call to
writeback_single_inode() with WB_SYNC_ALL set suffices to ensure that pages
are on disk) we need to first wait for filemap_fdatawait() to complete,
then test for unstable pages.

Since NFS is currently the only filesystem that has unstable pages, we can
add a new inode state I_UNSTABLE_PAGES that NFS alone will set. When set,
this will trigger a callback to a new address_space_operation to call the
COMMIT.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Peter Zijlstra <peterz@infradead.org>
---

 fs/fs-writeback.c  |   31 ++++++++++++++++++++++++++++++-
 fs/nfs/file.c      |    1 +
 fs/nfs/inode.c     |   16 ----------------
 fs/nfs/internal.h  |    3 ++-
 fs/nfs/super.c     |    2 --
 fs/nfs/write.c     |   33 ++++++++++++++++++++++++++++++++-
 include/linux/fs.h |    9 +++++++++
 7 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1a7c42c..3640769 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -389,6 +389,17 @@ static int write_inode(struct inode *inode, int sync)
 }

 /*
+ * Commit the NFS unstable pages.
+ */
+static int commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc)
+{
+    if (mapping->a_ops && mapping->a_ops->commit_unstable_pages)
+        return mapping->a_ops->commit_unstable_pages(mapping, wbc);
+    return 0;
+}
+
+/*
  * Wait for writeback on an inode to complete.
  */
 static void inode_wait_for_writeback(struct inode *inode)
@@ -475,6 +486,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
     }

     spin_lock(&inode_lock);
+    /*
+     * Special state for cleaning NFS unstable pages
+     */
+    if (inode->i_state & I_UNSTABLE_PAGES) {
+        int err;
+        inode->i_state &= ~I_UNSTABLE_PAGES;
+        spin_unlock(&inode_lock);
+        err = commit_unstable_pages(mapping, wbc);
+        if (ret == 0)
+            ret = err;
+        spin_lock(&inode_lock);
+    }
     inode->i_state &= ~I_SYNC;
     if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
         if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
@@ -533,6 +556,12 @@ select_queue:
                 inode->i_state |= I_DIRTY_PAGES;
                 redirty_tail(inode);
             }
+        } else if (inode->i_state & I_UNSTABLE_PAGES) {
+            /*
+             * The inode has got yet more unstable pages to
+             * commit. Requeue...
+             */
+            redirty_tail(inode);
         } else if (atomic_read(&inode->i_count)) {
             /*
              * The inode is clean, inuse
@@ -1051,7 +1080,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)

     spin_lock(&inode_lock);
     if ((inode->i_state & flags) != flags) {
-        const int was_dirty = inode->i_state & I_DIRTY;
+        const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES);

         inode->i_state |= flags;

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6b89132..67e50ac 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = {
     .migratepage = nfs_migrate_page,
     .launder_page = nfs_launder_page,
     .error_remove_page = generic_error_remove_page,
+    .commit_unstable_pages = nfs_commit_unstable_pages,
 };

 /*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa0918..8341709 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
     return ino;
 }

-int nfs_write_inode(struct inode *inode, int sync)
-{
-    int ret;
-
-    if (sync) {
-        ret = filemap_fdatawait(inode->i_mapping);
-        if (ret == 0)
-            ret = nfs_commit_inode(inode, FLUSH_SYNC);
-    } else
-        ret = nfs_commit_inode(inode, 0);
-    if (ret >= 0)
-        return 0;
-    __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-    return ret;
-}
-
 void nfs_clear_inode(struct inode *inode)
 {
     /*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29e464d..7bb326f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
 extern struct workqueue_struct *nfsiod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_destroy_inode(struct inode *);
-extern int nfs_write_inode(struct inode *,int);
 extern void nfs_clear_inode(struct inode *);
 #ifdef CONFIG_NFS_V4
 extern void nfs4_clear_inode(struct inode *);
@@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server,
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);

 /* write.c */
+extern int nfs_commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ce907ef..805c1a0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = {
 static const struct super_operations nfs_sops = {
     .alloc_inode    = nfs_alloc_inode,
     .destroy_inode  = nfs_destroy_inode,
-    .write_inode    = nfs_write_inode,
     .statfs         = nfs_statfs,
     .clear_inode    = nfs_clear_inode,
     .umount_begin   = nfs_umount_begin,
@@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = {
 static const struct super_operations nfs4_sops = {
     .alloc_inode    = nfs_alloc_inode,
     .destroy_inode  = nfs_destroy_inode,
-    .write_inode    = nfs_write_inode,
     .statfs         = nfs_statfs,
     .clear_inode    = nfs4_clear_inode,
     .umount_begin   = nfs_umount_begin,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d171696..910be28 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req)
     spin_unlock(&inode->i_lock);
     inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
     inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-    __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+    mark_inode_unstable_pages(inode);
 }

 static int
@@ -1406,11 +1406,42 @@ int nfs_commit_inode(struct inode *inode, int how)
     }
     return res;
 }
+
+int nfs_commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc)
+{
+    struct inode *inode = mapping->host;
+    int flags = FLUSH_SYNC;
+    int ret;
+
+    /* Don't commit yet if this is a non-blocking flush and there are
+     * outstanding writes for this mapping.
+     */
+    if (wbc->sync_mode != WB_SYNC_ALL &&
+        radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
+                NFS_PAGE_TAG_LOCKED)) {
+        mark_inode_unstable_pages(inode);
+        return 0;
+    }
+    if (wbc->nonblocking)
+        flags = 0;
+    ret = nfs_commit_inode(inode, flags);
+    if (ret > 0)
+        ret = 0;
+    return ret;
+}
+
 #else
 static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 {
     return 0;
 }
+
+int nfs_commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc)
+{
+    return 0;
+}
 #endif

 long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9147ca8..de594b3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -602,6 +602,8 @@ struct address_space_operations {
     int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
                     unsigned long);
     int (*error_remove_page)(struct address_space *, struct page *);
+    int (*commit_unstable_pages)(struct address_space *,
+                    struct writeback_control *);
 };

 /*
@@ -1635,6 +1637,8 @@ struct super_operations {
 #define I_CLEAR			64
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define __I_UNSTABLE_PAGES	8
+#define I_UNSTABLE_PAGES	(1 << __I_UNSTABLE_PAGES)

 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)

@@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
     __mark_inode_dirty(inode, I_DIRTY_SYNC);
 }

+static inline void mark_inode_unstable_pages(struct inode *inode)
+{
+    __mark_inode_dirty(inode, I_UNSTABLE_PAGES);
+}
+
 /**
  * inc_nlink - directly increment an inode's link count
  * @inode: inode
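The contract of the new address_space_operation is worth spelling out.
A hedged sketch of a minimal implementation for a hypothetical
filesystem (the name "foofs" and its stub body are invented for
illustration; NFS's real implementation is nfs_commit_unstable_pages()
above):

    /* Called from writeback_single_inode() after the WRITEs have been
     * issued (and, for WB_SYNC_ALL, waited for). Should make the
     * previously written pages durable on the server. */
    static int foofs_commit_unstable_pages(struct address_space *mapping,
                                           struct writeback_control *wbc)
    {
        /* If the commit cannot run now, re-arm the state so the
         * flusher comes back: mark_inode_unstable_pages(mapping->host);
         * otherwise issue the commit and return its status. */
        return 0;
    }

    static const struct address_space_operations foofs_aops = {
        /* ... the usual readpage/writepage/etc. operations ... */
        .commit_unstable_pages = foofs_commit_unstable_pages,
    };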
* [PATCH 2/5] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE
[not found] ` <20100107144137.17158.53673.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
` (2 preceding siblings ...)
2010-01-07 14:41 ` [PATCH 1/5] VFS: Ensure that writeback_single_inode() commits unstable writes Trond Myklebust
@ 2010-01-07 14:41 ` Trond Myklebust
2010-01-07 14:41 ` [PATCH 4/5] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust
4 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 14:41 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

From: Peter Zijlstra <peterz@infradead.org>

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
---

 fs/nfs/write.c              |    6 +++---
 include/linux/backing-dev.h |    3 ++-
 mm/backing-dev.c            |    6 ++++--
 mm/filemap.c                |    2 +-
 mm/page-writeback.c         |   16 ++++++++++------
 mm/truncate.c               |    2 +-
 6 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 910be28..36549b1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -440,7 +440,7 @@ nfs_mark_request_commit(struct nfs_page *req)
             NFS_PAGE_TAG_COMMIT);
     spin_unlock(&inode->i_lock);
     inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-    inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+    inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE);
     mark_inode_unstable_pages(inode);
 }

@@ -451,7 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req)

     if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
         dec_zone_page_state(page, NR_UNSTABLE_NFS);
-        dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+        dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE);
         return 1;
     }
     return 0;
@@ -1322,7 +1322,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
         nfs_mark_request_commit(req);
         dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
         dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
-                BDI_RECLAIMABLE);
+                BDI_UNSTABLE);
         nfs_clear_page_tag_locked(req);
     }
     return -ENOMEM;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index fcbc26a..42c3e2a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -36,7 +36,8 @@ enum bdi_state {
 typedef int (congested_fn)(void *, int);

 enum bdi_stat_item {
-    BDI_RECLAIMABLE,
+    BDI_DIRTY,
+    BDI_UNSTABLE,
     BDI_WRITEBACK,
     NR_BDI_STAT_ITEMS
 };
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca03..88f3655 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -88,7 +88,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 #define K(x) ((x) << (PAGE_SHIFT - 10))
     seq_printf(m,
            "BdiWriteback:     %8lu kB\n"
-           "BdiReclaimable:   %8lu kB\n"
+           "BdiDirty:         %8lu kB\n"
+           "BdiUnstable:      %8lu kB\n"
            "BdiDirtyThresh:   %8lu kB\n"
            "DirtyThresh:      %8lu kB\n"
            "BackgroundThresh: %8lu kB\n"
@@ -102,7 +103,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
            "wb_list:          %8u\n"
            "wb_cnt:           %8u\n",
            (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
-           (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
+           (unsigned long) K(bdi_stat(bdi, BDI_DIRTY)),
+           (unsigned long) K(bdi_stat(bdi, BDI_UNSTABLE)),
            K(bdi_thresh), K(dirty_thresh),
            K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
            !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
diff --git a/mm/filemap.c b/mm/filemap.c
index 96ac6b0..458387d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -136,7 +136,7 @@ void __remove_from_page_cache(struct page *page)
      */
     if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
         dec_zone_page_state(page, NR_FILE_DIRTY);
-        dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+        dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
     }
 }

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943..23d3fc6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -272,7 +272,8 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
     else
         avail_dirty = 0;

-    avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+    avail_dirty += bdi_stat(bdi, BDI_DIRTY) +
+            bdi_stat(bdi, BDI_UNSTABLE) +
             bdi_stat(bdi, BDI_WRITEBACK);

     *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
@@ -509,7 +510,8 @@ static void balance_dirty_pages(struct address_space *mapping,
                 global_page_state(NR_UNSTABLE_NFS);
         nr_writeback = global_page_state(NR_WRITEBACK);

-        bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+        bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
+                bdi_stat(bdi, BDI_UNSTABLE);
         bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);

         if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
@@ -554,10 +556,12 @@ static void balance_dirty_pages(struct address_space *mapping,
          * deltas.
          */
         if (bdi_thresh < 2*bdi_stat_error(bdi)) {
-            bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+            bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) +
+                    bdi_stat_sum(bdi, BDI_UNSTABLE);
             bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
         } else if (bdi_nr_reclaimable) {
-            bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+            bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
+                    bdi_stat(bdi, BDI_UNSTABLE);
             bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
         }

@@ -1079,7 +1083,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 {
     if (mapping_cap_account_dirty(mapping)) {
         __inc_zone_page_state(page, NR_FILE_DIRTY);
-        __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+        __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
         task_dirty_inc(current);
         task_io_account_write(PAGE_CACHE_SIZE);
     }
@@ -1255,7 +1259,7 @@ int clear_page_dirty_for_io(struct page *page)
         if (TestClearPageDirty(page)) {
             dec_zone_page_state(page, NR_FILE_DIRTY);
             dec_bdi_stat(mapping->backing_dev_info,
-                    BDI_RECLAIMABLE);
+                    BDI_DIRTY);
             return 1;
         }
         return 0;
diff --git a/mm/truncate.c b/mm/truncate.c
index 342deee..b0ce8fb 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -75,7 +75,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
     if (mapping && mapping_cap_account_dirty(mapping)) {
         dec_zone_page_state(page, NR_FILE_DIRTY);
         dec_bdi_stat(mapping->backing_dev_info,
-                BDI_RECLAIMABLE);
+                BDI_DIRTY);
         if (account_size)
             task_io_account_cancelled_write(account_size);
     }
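The split is purely an accounting refactor; no pages change state. The
invariant, written out explicitly (illustrative, assuming no counter
updates are missed in the conversion):

    /*
     * For every bdi, after this patch:
     *
     *   old bdi_stat(bdi, BDI_RECLAIMABLE)
     *     == bdi_stat(bdi, BDI_DIRTY) + bdi_stat(bdi, BDI_UNSTABLE)
     *
     * so each former BDI_RECLAIMABLE site must either read the sum or
     * pick the single component it actually accounts: dirty page-cache
     * pages (BDI_DIRTY) vs. uncommitted NFS writes (BDI_UNSTABLE).
     */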
* [PATCH 4/5] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
[not found] ` <20100107144137.17158.53673.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
` (3 preceding siblings ...)
2010-01-07 14:41 ` [PATCH 2/5] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE Trond Myklebust
@ 2010-01-07 14:41 ` Trond Myklebust
4 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 14:41 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

balance_dirty_pages() should really tell the filesystem whether or not it
has an excess of actual dirty pages, or whether it would be more useful to
start freeing up the unstable writes.

Assume that if the number of unstable writes is more than 1/2 the number of
reclaimable pages, then we should force NFS to free up the former.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Peter Zijlstra <peterz@infradead.org>
---

 fs/nfs/write.c            |    2 +-
 include/linux/writeback.h |    5 +++++
 mm/page-writeback.c       |   12 ++++++++++--
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 36549b1..43e47b4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1417,7 +1417,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping,
     /* Don't commit yet if this is a non-blocking flush and there are
      * outstanding writes for this mapping.
      */
-    if (wbc->sync_mode != WB_SYNC_ALL &&
+    if (!wbc->force_commit_unstable && wbc->sync_mode != WB_SYNC_ALL &&
         radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
                 NFS_PAGE_TAG_LOCKED)) {
         mark_inode_unstable_pages(inode);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 76e8903..8229139 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -62,6 +62,11 @@ struct writeback_control {
      * so we use a single control to update them
      */
     unsigned no_nrwrite_index_update:1;
+    /*
+     * The following is used by balance_dirty_pages() to
+     * force NFS to commit unstable pages.
+     */
+    unsigned force_commit_unstable:1;
 };

 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c06739b..6a0aec7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -503,6 +503,7 @@ static void balance_dirty_pages(struct address_space *mapping,
             .nr_to_write    = write_chunk,
             .range_cyclic   = 1,
         };
+        long bdi_nr_unstable = 0;

         get_dirty_limits(&background_thresh, &dirty_thresh,
                 &bdi_thresh, bdi);
@@ -512,8 +513,10 @@ static void balance_dirty_pages(struct address_space *mapping,
         nr_writeback = global_page_state(NR_WRITEBACK);

         bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
-        if (bdi_cap_account_unstable(bdi))
-            bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE);
+        if (bdi_cap_account_unstable(bdi)) {
+            bdi_nr_unstable = bdi_stat(bdi, BDI_UNSTABLE);
+            bdi_nr_reclaimable += bdi_nr_unstable;
+        }
         bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);

         if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
@@ -541,6 +544,11 @@ static void balance_dirty_pages(struct address_space *mapping,
          * up.
          */
         if (bdi_nr_reclaimable > bdi_thresh) {
+            wbc.force_commit_unstable = 0;
+            /* Force NFS to also free up unstable writes. */
+            if (bdi_nr_unstable > bdi_nr_reclaimable / 2)
+                wbc.force_commit_unstable = 1;
+
             writeback_inodes_wbc(&wbc);
             pages_written += write_chunk - wbc.nr_to_write;
             get_dirty_limits(&background_thresh, &dirty_thresh,
* [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes
[not found] ` <20100106205110.22547.85345.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-06 20:51 ` [PATCH 5/6] VM: Use per-bdi unstable accounting to improve use of wbc->force_commit Trond Myklebust
2010-01-06 20:51 ` [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust
@ 2010-01-06 20:51 ` Trond Myklebust
[not found] ` <20100106205110.22547.17971.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-07 2:18 ` Wu Fengguang
2010-01-06 20:51 ` [PATCH 6/6] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set Trond Myklebust
` (3 subsequent siblings)
6 siblings, 2 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

If the call to do_writepages() succeeded in starting writeback, we do not
know whether or not we will need to COMMIT any unstable writes until after
the write RPC calls are finished. Currently, we assume that at least one
write RPC call will have finished, and set I_DIRTY_DATASYNC by the time
do_writepages is done, so that write_inode() is triggered.

In order to ensure reliable operation (i.e. ensure that a single call to
writeback_single_inode() with WB_SYNC_ALL set suffices to ensure that pages
are on disk) we need to first wait for filemap_fdatawait() to complete,
then test for unstable pages.

Since NFS is currently the only filesystem that has unstable pages, we can
add a new inode state I_UNSTABLE_PAGES that NFS alone will set. When set,
this will trigger a callback to a new address_space_operation to call the
COMMIT.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
---

 fs/fs-writeback.c  |   31 ++++++++++++++++++++++++++++++-
 fs/nfs/file.c      |    1 +
 fs/nfs/inode.c     |   16 ----------------
 fs/nfs/internal.h  |    3 ++-
 fs/nfs/super.c     |    2 --
 fs/nfs/write.c     |   33 ++++++++++++++++++++++++++++++++-
 include/linux/fs.h |    9 +++++++++
 7 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1a7c42c..3bc0a96 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -389,6 +389,17 @@ static int write_inode(struct inode *inode, int sync)
 }

 /*
+ * Commit the NFS unstable pages.
+ */
+static int commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc)
+{
+    if (mapping->a_ops && mapping->a_ops->commit_unstable_pages)
+        return mapping->a_ops->commit_unstable_pages(mapping, wbc);
+    return 0;
+}
+
+/*
  * Wait for writeback on an inode to complete.
  */
 static void inode_wait_for_writeback(struct inode *inode)
@@ -475,6 +486,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
     }

     spin_lock(&inode_lock);
+    /*
+     * Special state for cleaning NFS unstable pages
+     */
+    if (inode->i_state & I_UNSTABLE_PAGES) {
+        int err;
+        inode->i_state &= ~I_UNSTABLE_PAGES;
+        spin_unlock(&inode_lock);
+        err = commit_unstable_pages(mapping, wbc);
+        if (ret == 0)
+            ret = err;
+        spin_lock(&inode_lock);
+    }
     inode->i_state &= ~I_SYNC;
     if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
         if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
@@ -533,6 +556,12 @@ select_queue:
                 inode->i_state |= I_DIRTY_PAGES;
                 redirty_tail(inode);
             }
+        } else if (inode->i_state & I_UNSTABLE_PAGES) {
+            /*
+             * The inode has got yet more unstable pages to
+             * commit. Requeue on b_more_io
+             */
+            requeue_io(inode);
         } else if (atomic_read(&inode->i_count)) {
             /*
              * The inode is clean, inuse
@@ -1051,7 +1080,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)

     spin_lock(&inode_lock);
     if ((inode->i_state & flags) != flags) {
-        const int was_dirty = inode->i_state & I_DIRTY;
+        const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES);

         inode->i_state |= flags;

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6b89132..67e50ac 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = {
     .migratepage = nfs_migrate_page,
     .launder_page = nfs_launder_page,
     .error_remove_page = generic_error_remove_page,
+    .commit_unstable_pages = nfs_commit_unstable_pages,
 };

 /*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa0918..8341709 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
     return ino;
 }

-int nfs_write_inode(struct inode *inode, int sync)
-{
-    int ret;
-
-    if (sync) {
-        ret = filemap_fdatawait(inode->i_mapping);
-        if (ret == 0)
-            ret = nfs_commit_inode(inode, FLUSH_SYNC);
-    } else
-        ret = nfs_commit_inode(inode, 0);
-    if (ret >= 0)
-        return 0;
-    __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-    return ret;
-}
-
 void nfs_clear_inode(struct inode *inode)
 {
     /*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29e464d..7bb326f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
 extern struct workqueue_struct *nfsiod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_destroy_inode(struct inode *);
-extern int nfs_write_inode(struct inode *,int);
 extern void nfs_clear_inode(struct inode *);
 #ifdef CONFIG_NFS_V4
 extern void nfs4_clear_inode(struct inode *);
@@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server,
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);

 /* write.c */
+extern int nfs_commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ce907ef..805c1a0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = {
 static const struct super_operations nfs_sops = {
     .alloc_inode    = nfs_alloc_inode,
     .destroy_inode  = nfs_destroy_inode,
-    .write_inode    = nfs_write_inode,
     .statfs         = nfs_statfs,
     .clear_inode    = nfs_clear_inode,
     .umount_begin   = nfs_umount_begin,
@@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = {
 static const struct super_operations nfs4_sops = {
     .alloc_inode    = nfs_alloc_inode,
     .destroy_inode  = nfs_destroy_inode,
-    .write_inode    = nfs_write_inode,
     .statfs         = nfs_statfs,
     .clear_inode    = nfs4_clear_inode,
     .umount_begin   = nfs_umount_begin,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d171696..910be28 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req)
     spin_unlock(&inode->i_lock);
     inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
     inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-    __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+    mark_inode_unstable_pages(inode);
 }

 static int
@@ -1406,11 +1406,42 @@ int nfs_commit_inode(struct inode *inode, int how)
     }
     return res;
 }
+
+int nfs_commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc)
+{
+    struct inode *inode = mapping->host;
+    int flags = FLUSH_SYNC;
+    int ret;
+
+    /* Don't commit yet if this is a non-blocking flush and there are
+     * outstanding writes for this mapping.
+     */
+    if (wbc->sync_mode != WB_SYNC_ALL &&
+        radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
+                NFS_PAGE_TAG_LOCKED)) {
+        mark_inode_unstable_pages(inode);
+        return 0;
+    }
+    if (wbc->nonblocking)
+        flags = 0;
+    ret = nfs_commit_inode(inode, flags);
+    if (ret > 0)
+        ret = 0;
+    return ret;
+}
+
 #else
 static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 {
     return 0;
 }
+
+int nfs_commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc)
+{
+    return 0;
+}
 #endif

 long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9147ca8..ea0b7a3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -602,6 +602,8 @@ struct address_space_operations {
     int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
                     unsigned long);
     int (*error_remove_page)(struct address_space *, struct page *);
+    int (*commit_unstable_pages)(struct address_space *,
+                    struct writeback_control *);
 };

 /*
@@ -1635,6 +1637,8 @@ struct super_operations {
 #define I_CLEAR			64
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define __I_UNSTABLE_PAGES	9
+#define I_UNSTABLE_PAGES	(1 << __I_UNSTABLE_PAGES)

 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)

@@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
     __mark_inode_dirty(inode, I_DIRTY_SYNC);
 }

+static inline void mark_inode_unstable_pages(struct inode *inode)
+{
+    __mark_inode_dirty(inode, I_UNSTABLE_PAGES);
+}
+
 /**
  * inc_nlink - directly increment an inode's link count
  * @inode: inode
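The new state threads through writeback in a fixed order. A condensed
sketch of the flow in writeback_single_inode() after this patch
(paraphrasing the commit message and the hunks above; error handling and
locking elided):

    /*
     * 1. do_writepages() issues the WRITE RPCs for the dirty pages.
     * 2. Under WB_SYNC_ALL, filemap_fdatawait() waits for those WRITEs
     *    to finish; only then is it known what needs a COMMIT.
     * 3. If I_UNSTABLE_PAGES is now set, clear it and call the new
     *    a_op ->commit_unstable_pages() to issue the COMMIT; the a_op
     *    re-marks the inode if the commit has to be deferred.
     * 4. If the inode still carries I_UNSTABLE_PAGES afterwards, it is
     *    requeued so the flusher thread revisits it.
     */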
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes
[not found] ` <20100106205110.22547.17971.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2010-01-06 21:38 ` Jan Kara
[not found] ` <20100106213843.GD22781-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org>
0 siblings, 1 reply; 96+ messages in thread
From: Jan Kara @ 2010-01-06 21:38 UTC (permalink / raw)
To: Trond Myklebust
Cc: Wu Fengguang, Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Wed 06-01-10 15:51:10, Trond Myklebust wrote:
...
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 9147ca8..ea0b7a3 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1635,6 +1637,8 @@ struct super_operations {
>  #define I_CLEAR		64
>  #define __I_SYNC		7
>  #define I_SYNC			(1 << __I_SYNC)
> +#define __I_UNSTABLE_PAGES	9

Hum, why isn't this 8?

Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes
[not found] ` <20100106213843.GD22781-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org>
@ 2010-01-06 21:48 ` Trond Myklebust
0 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 21:48 UTC (permalink / raw)
To: Jan Kara
Cc: Wu Fengguang, Peter Zijlstra, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Wed, 2010-01-06 at 22:38 +0100, Jan Kara wrote:
> On Wed 06-01-10 15:51:10, Trond Myklebust wrote:
> ...
> > +#define __I_UNSTABLE_PAGES	9
>
> Hum, why isn't this 8?

I missed Christoph's patch that got rid of I_LOCK. I think that was
merged after I started work on these patches. I'd be quite OK with
changing the above value to 8 if that is preferable.

Cheers
Trond
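For context, the bit numbering at stake in this exchange, as a sketch
(values taken from the patches in this thread; whether bit 8 is free
depends on Christoph's I_LOCK removal being merged, as Trond notes, and
the reposted 1/5 does switch to 8):

    #define __I_SYNC		7
    #define I_SYNC		(1 << __I_SYNC)
    /* bit 8 was occupied by I_LOCK before its removal */
    #define __I_UNSTABLE_PAGES	8	/* 9 in the patch as first posted */
    #define I_UNSTABLE_PAGES	(1 << __I_UNSTABLE_PAGES)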
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes 2010-01-06 20:51 ` [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes Trond Myklebust [not found] ` <20100106205110.22547.17971.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> @ 2010-01-07 2:18 ` Wu Fengguang [not found] ` <1262839082.2185.15.camel@localhost> 1 sibling, 1 reply; 96+ messages in thread From: Wu Fengguang @ 2010-01-07 2:18 UTC (permalink / raw) To: Trond Myklebust Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote: > If the call to do_writepages() succeeded in starting writeback, we do not > know whether or not we will need to COMMIT any unstable writes until after > the write RPC calls are finished. Currently, we assume that at least one > write RPC call will have finished, and set I_DIRTY_DATASYNC by the time > do_writepages is done, so that write_inode() is triggered. > > In order to ensure reliable operation (i.e. ensure that a single call to > writeback_single_inode() with WB_SYNC_ALL set suffices to ensure that pages > are on disk) we need to first wait for filemap_fdatawait() to complete, > then test for unstable pages. > > Since NFS is currently the only filesystem that has unstable pages, we can > add a new inode state I_UNSTABLE_PAGES that NFS alone will set. When set, > this will trigger a callback to a new address_space_operation to call the > COMMIT. > > Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> > Acked-by: Jan Kara <jack@suse.cz> > --- > > fs/fs-writeback.c | 31 ++++++++++++++++++++++++++++++- > fs/nfs/file.c | 1 + > fs/nfs/inode.c | 16 ---------------- > fs/nfs/internal.h | 3 ++- > fs/nfs/super.c | 2 -- > fs/nfs/write.c | 33 ++++++++++++++++++++++++++++++++- > include/linux/fs.h | 9 +++++++++ > 7 files changed, 74 insertions(+), 21 deletions(-) > > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c > index 1a7c42c..3bc0a96 100644 > --- a/fs/fs-writeback.c > +++ b/fs/fs-writeback.c > @@ -389,6 +389,17 @@ static int write_inode(struct inode *inode, int sync) > } > > /* > + * Commit the NFS unstable pages. > + */ > +static int commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + if (mapping->a_ops && mapping->a_ops->commit_unstable_pages) > + return mapping->a_ops->commit_unstable_pages(mapping, wbc); > + return 0; > +} > + > +/* > * Wait for writeback on an inode to complete. > */ > static void inode_wait_for_writeback(struct inode *inode) > @@ -475,6 +486,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > } > > spin_lock(&inode_lock); > + /* > + * Special state for cleaning NFS unstable pages > + */ > + if (inode->i_state & I_UNSTABLE_PAGES) { > + int err; > + inode->i_state &= ~I_UNSTABLE_PAGES; > + spin_unlock(&inode_lock); > + err = commit_unstable_pages(mapping, wbc); > + if (ret == 0) > + ret = err; > + spin_lock(&inode_lock); > + } > inode->i_state &= ~I_SYNC; > if (!(inode->i_state & (I_FREEING | I_CLEAR))) { > if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { > @@ -533,6 +556,12 @@ select_queue: > inode->i_state |= I_DIRTY_PAGES; > redirty_tail(inode); > } > + } else if (inode->i_state & I_UNSTABLE_PAGES) { > + /* > + * The inode has got yet more unstable pages to > + * commit. 
Requeue on b_more_io
> +	 */
> +	requeue_io(inode);

This risks "busy retrying" inodes with unstable pages, when

- nfs_commit_unstable_pages() doesn't think it's time to commit
- the NFS server somehow responds slowly

The workaround is to use redirty_tail() for now. But that risks delaying
the COMMIT for up to 30s, which obviously might leave applications stuck
in balance_dirty_pages() for too long.

I have a patch to shorten the retry time to 1s (or some other constant)
by introducing b_more_io_wait. It currently sits in my writeback queue
series, whose main blocking issue is the constantly broken NFS pipeline..

> 	} else if (atomic_read(&inode->i_count)) {
> 		/*
> 		 * The inode is clean, inuse
> @@ -1051,7 +1080,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>
>  	spin_lock(&inode_lock);
>  	if ((inode->i_state & flags) != flags) {
> -		const int was_dirty = inode->i_state & I_DIRTY;
> +		const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES);
>
>  		inode->i_state |= flags;
>
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index 6b89132..67e50ac 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = {
>  	.migratepage = nfs_migrate_page,
>  	.launder_page = nfs_launder_page,
>  	.error_remove_page = generic_error_remove_page,
> +	.commit_unstable_pages = nfs_commit_unstable_pages,
>  };
>
>  /*
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index faa0918..8341709 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
>  	return ino;
>  }
>
> -int nfs_write_inode(struct inode *inode, int sync)
> -{
> -	int ret;
> -
> -	if (sync) {
> -		ret = filemap_fdatawait(inode->i_mapping);
> -		if (ret == 0)
> -			ret = nfs_commit_inode(inode, FLUSH_SYNC);
> -	} else
> -		ret = nfs_commit_inode(inode, 0);
> -	if (ret >= 0)
> -		return 0;
> -	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
> -	return ret;
> -}
> -
>  void nfs_clear_inode(struct inode *inode)
>  {
>  	/*
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index 29e464d..7bb326f 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
>  extern struct workqueue_struct *nfsiod_workqueue;
>  extern struct inode *nfs_alloc_inode(struct super_block *sb);
>  extern void nfs_destroy_inode(struct inode *);
> -extern int nfs_write_inode(struct inode *,int);
>  extern void nfs_clear_inode(struct inode *);
>  #ifdef CONFIG_NFS_V4
>  extern void nfs4_clear_inode(struct inode *);
> @@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server,
>  extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
>
>  /* write.c */
> +extern int nfs_commit_unstable_pages(struct address_space *mapping,
> +		struct writeback_control *wbc);
>  extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
>  #ifdef CONFIG_MIGRATION
>  extern int nfs_migrate_page(struct address_space *,
> diff --git a/fs/nfs/super.c b/fs/nfs/super.c
> index ce907ef..805c1a0 100644
> --- a/fs/nfs/super.c
> +++ b/fs/nfs/super.c
> @@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = {
>  static const struct super_operations nfs_sops = {
>  	.alloc_inode	= nfs_alloc_inode,
>  	.destroy_inode	= nfs_destroy_inode,
> -	.write_inode	= nfs_write_inode,
>  	.statfs		= nfs_statfs,
>  	.clear_inode	= nfs_clear_inode,
>  	.umount_begin	= nfs_umount_begin,
> @@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = {
>  static const struct super_operations nfs4_sops = {
.alloc_inode = nfs_alloc_inode, > .destroy_inode = nfs_destroy_inode, > - .write_inode = nfs_write_inode, > .statfs = nfs_statfs, > .clear_inode = nfs4_clear_inode, > .umount_begin = nfs_umount_begin, > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > index d171696..910be28 100644 > --- a/fs/nfs/write.c > +++ b/fs/nfs/write.c > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > spin_unlock(&inode->i_lock); > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > + mark_inode_unstable_pages(inode); Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size. > } > > static int > @@ -1406,11 +1406,42 @@ int nfs_commit_inode(struct inode *inode, int how) > } > return res; > } > + > +int nfs_commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + struct inode *inode = mapping->host; > + int flags = FLUSH_SYNC; > + int ret; > + > + /* Don't commit yet if this is a non-blocking flush and there are > + * outstanding writes for this mapping. > + */ > + if (wbc->sync_mode != WB_SYNC_ALL && > + radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, > + NFS_PAGE_TAG_LOCKED)) { > + mark_inode_unstable_pages(inode); > + return 0; > + } A dumb question: does NFS_PAGE_TAG_LOCKED means either flying COMMITs or WRITEs? As an NFS newbie, I'm only confident on the COMMIT part :) > + if (wbc->nonblocking) > + flags = 0; > + ret = nfs_commit_inode(inode, flags); > + if (ret > 0) > + ret = 0; > + return ret; > +} > + > #else > static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) > { > return 0; > } > + > +int nfs_commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + return 0; > +} > #endif > > long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 9147ca8..ea0b7a3 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -602,6 +602,8 @@ struct address_space_operations { > int (*is_partially_uptodate) (struct page *, read_descriptor_t *, > unsigned long); > int (*error_remove_page)(struct address_space *, struct page *); > + int (*commit_unstable_pages)(struct address_space *, > + struct writeback_control *); > }; > > /* > @@ -1635,6 +1637,8 @@ struct super_operations { > #define I_CLEAR 64 > #define __I_SYNC 7 > #define I_SYNC (1 << __I_SYNC) > +#define __I_UNSTABLE_PAGES 9 > +#define I_UNSTABLE_PAGES (1 << __I_UNSTABLE_PAGES) > > #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) > > @@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) > __mark_inode_dirty(inode, I_DIRTY_SYNC); > } > > +static inline void mark_inode_unstable_pages(struct inode *inode) > +{ > + __mark_inode_dirty(inode, I_UNSTABLE_PAGES); > +} > + > /** > * inc_nlink - directly increment an inode's link count > * @inode: inode > ^ permalink raw reply [flat|nested] 96+ messages in thread
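The requeue trade-off Fengguang describes reduces to three retry delays.
A toy userspace model of the choices follows; the millisecond figures are
illustrative constants from the discussion (immediate retry, the ~30s
kupdate interval, the proposed ~1s wait), not values read out of a kernel
tree:

    #include <stdio.h>

    /* Toy model of the three requeue policies for an inode that still
     * has unstable pages after one writeback pass. */
    enum policy { REQUEUE_IO, REDIRTY_TAIL, MORE_IO_WAIT };

    static int next_retry_ms(enum policy p)
    {
        switch (p) {
        case REQUEUE_IO:   return 0;      /* immediate: risks busy retry  */
        case REDIRTY_TAIL: return 30000;  /* next kupdate pass: up to 30s */
        case MORE_IO_WAIT: return 1000;   /* proposed b_more_io_wait: ~1s */
        }
        return -1;
    }

    int main(void)
    {
        const char *name[] = { "requeue_io", "redirty_tail", "b_more_io_wait" };
        for (int p = REQUEUE_IO; p <= MORE_IO_WAIT; p++)
            printf("%-15s -> retry COMMIT after %5d ms\n",
                   name[p], next_retry_ms(p));
        return 0;
    }

The middle ground is exactly what b_more_io_wait is meant to provide:
late enough to avoid busy-looping on a slow server, early enough not to
stall balance_dirty_pages().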
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes [not found] ` <1262839082.2185.15.camel@localhost> @ 2010-01-07 4:48 ` Wu Fengguang 2010-01-07 4:53 ` [PATCH 0/5] Re: [PATCH] improve the performance of large sequential write NFS workloads Trond Myklebust 2010-01-07 14:56 ` [PATCH 1/6] " Wu Fengguang 1 sibling, 1 reply; 96+ messages in thread From: Wu Fengguang @ 2010-01-07 4:48 UTC (permalink / raw) To: Myklebust, Trond Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, Jan 07, 2010 at 12:38:02PM +0800, Myklebust, Trond wrote: > On Thu, 2010-01-07 at 10:18 +0800, Wu Fengguang wrote: > > On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote: > > > @@ -533,6 +556,12 @@ select_queue: > > > inode->i_state |= I_DIRTY_PAGES; > > > redirty_tail(inode); > > > } > > > + } else if (inode->i_state & I_UNSTABLE_PAGES) { > > > + /* > > > + * The inode has got yet more unstable pages to > > > + * commit. Requeue on b_more_io > > > + */ > > > + requeue_io(inode); > > > > This risks "busy retrying" inodes with unstable pages, when > > > > - nfs_commit_unstable_pages() don't think it's time to commit > > - NFS server somehow response slowly > > > > The workaround is to use redirty_tail() for now. But that risks delay > > the COMMIT for up to 30s, which obviously might stuck applications in > > balance_dirty_pages() for too long. > > > > I have a patch to shorten the retry time to 1s (or other constant) > > by introducing b_more_io_wait. It currently sits in my writeback queue > > series whose main blocking issue is the constantly broken NFS pipeline.. > > > OK. Should I use redirty_tail() for the moment then, and assume you will > fix when you introduce the new state? OK, I'll change your redirty_tail() then :) > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > > > index d171696..910be28 100644 > > > --- a/fs/nfs/write.c > > > +++ b/fs/nfs/write.c > > > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > > > spin_unlock(&inode->i_lock); > > > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > > > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > > > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > > > + mark_inode_unstable_pages(inode); > > > > Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size. > > Why? The NFS client itself shouldn't ever set I_DIRTY_DATASYNC after > this patch is applied. We won't ever need it. > > If the VM or VFS is doing it, then they ought to be fixed: there is no > reason to assume that all filesystems need to sync their inodes on > i_size changes. Ah OK, I took it for certain.. > > > + /* Don't commit yet if this is a non-blocking flush and there are > > > + * outstanding writes for this mapping. > > > + */ > > > + if (wbc->sync_mode != WB_SYNC_ALL && > > > + radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, > > > + NFS_PAGE_TAG_LOCKED)) { > > > + mark_inode_unstable_pages(inode); > > > + return 0; > > > + } > > > > A dumb question: does NFS_PAGE_TAG_LOCKED means either flying COMMITs > > or WRITEs? As an NFS newbie, I'm only confident on the COMMIT part :) > > Both writebacks and commits will cause NFS_PAGE_TAG_LOCKED to be set, as > will attempts to change the page contents. See the calls to > nfs_set_page_tag_locked()... Thanks for the tip! 
> IOW: the above code will fail to trigger if there are outstanding WRITE
> RPC calls, or if there is a second process that happens to be writing to
> this inode's page cache...

IOW: for a busy "cp", the commit of the inode may be delayed until
"nr_unstable > nr_dirty / 2"? OK, that's what we want.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
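Stripped of the kernel plumbing, the bail-out being probed here is a
small predicate. A hedged sketch, where struct wbc and rpcs_in_flight
stand in for the real writeback_control and the NFS_PAGE_TAG_LOCKED
radix-tree test, and force_commit anticipates the flag added later in
this series:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for struct writeback_control; only the fields the
     * predicate needs. */
    struct wbc { bool sync_all; bool force_commit; };

    /* Mirror of the bail-out in nfs_commit_unstable_pages(): defer the
     * COMMIT on a non-blocking flush while RPCs are still in flight. */
    static bool should_commit(const struct wbc *wbc, bool rpcs_in_flight)
    {
        if (!wbc->sync_all && !wbc->force_commit && rpcs_in_flight)
            return false;   /* re-mark the inode and retry later */
        return true;
    }

    int main(void)
    {
        struct wbc bg     = { .sync_all = false, .force_commit = false };
        struct wbc forced = { .sync_all = false, .force_commit = true  };

        printf("background flush, RPCs in flight: commit=%d\n",
               should_commit(&bg, true));
        printf("forced flush,     RPCs in flight: commit=%d\n",
               should_commit(&forced, true));
        return 0;
    }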
* [PATCH 0/5] Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-07 4:48 ` Wu Fengguang @ 2010-01-07 4:53 ` Trond Myklebust [not found] ` <20100107045330.5986.55090.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 4:53 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org Take 2 of this series, incorporating the suggested changes from Jan and Fengguang... Cheers Trond --- Peter Zijlstra (1): VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE Trond Myklebust (4): NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set VM/NFS: The VM must tell the filesystem when to free reclaimable pages VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices VFS: Ensure that writeback_single_inode() commits unstable writes fs/fs-writeback.c | 31 ++++++++++++++++++++++++++++++- fs/nfs/client.c | 1 + fs/nfs/file.c | 1 + fs/nfs/inode.c | 16 ---------------- fs/nfs/internal.h | 3 ++- fs/nfs/super.c | 2 -- fs/nfs/write.c | 39 +++++++++++++++++++++++++++++++++++---- include/linux/backing-dev.h | 9 ++++++++- include/linux/fs.h | 9 +++++++++ include/linux/writeback.h | 5 +++++ mm/backing-dev.c | 6 ++++-- mm/filemap.c | 2 +- mm/page-writeback.c | 30 ++++++++++++++++++++++++------ mm/truncate.c | 2 +- 14 files changed, 121 insertions(+), 35 deletions(-) -- Signature ^ permalink raw reply [flat|nested] 96+ messages in thread
* [PATCH 3/5] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices [not found] ` <20100107045330.5986.55090.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> @ 2010-01-07 4:53 ` Trond Myklebust 2010-01-07 4:53 ` [PATCH 4/5] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust ` (3 subsequent siblings) 4 siblings, 0 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 4:53 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org Speeds up the accounting in balance_dirty_pages() for non-nfs devices. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Acked-by: Jan Kara <jack@suse.cz> --- fs/nfs/client.c | 1 + include/linux/backing-dev.h | 6 ++++++ mm/page-writeback.c | 16 +++++++++++----- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ee77713..d0b060a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -890,6 +890,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->backing_dev_info.name = "nfs"; server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; + server->backing_dev_info.capabilities |= BDI_CAP_ACCT_UNSTABLE; if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 42c3e2a..8b45166 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -232,6 +232,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_EXEC_MAP 0x00000040 #define BDI_CAP_NO_ACCT_WB 0x00000080 #define BDI_CAP_SWAP_BACKED 0x00000100 +#define BDI_CAP_ACCT_UNSTABLE 0x00000200 #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) @@ -311,6 +312,11 @@ static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) return bdi == &default_backing_dev_info; } +static inline bool bdi_cap_account_unstable(struct backing_dev_info *bdi) +{ + return bdi->capabilities & BDI_CAP_ACCT_UNSTABLE; +} + static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 23d3fc6..c06739b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -273,8 +273,9 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, avail_dirty = 0; avail_dirty += bdi_stat(bdi, BDI_DIRTY) + - bdi_stat(bdi, BDI_UNSTABLE) + bdi_stat(bdi, BDI_WRITEBACK); + if (bdi_cap_account_unstable(bdi)) + avail_dirty += bdi_stat(bdi, BDI_UNSTABLE); *pbdi_dirty = min(*pbdi_dirty, avail_dirty); } @@ -510,8 +511,9 @@ static void balance_dirty_pages(struct address_space *mapping, global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); - bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + - bdi_stat(bdi, BDI_UNSTABLE); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); + if (bdi_cap_account_unstable(bdi)) + bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -556,11 +558,15 @@ static void balance_dirty_pages(struct address_space *mapping, * deltas. 
*/ if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) + + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY); + if (bdi_cap_account_unstable(bdi)) + bdi_nr_reclaimable += bdi_stat_sum(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); } else if (bdi_nr_reclaimable) { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); + if (bdi_cap_account_unstable(bdi)) + bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); } ^ permalink raw reply related [flat|nested] 96+ messages in thread
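The shape of this optimization is easy to see outside the kernel: a
per-bdi capability bit gates the extra counter read. A minimal sketch
with stand-in types; only the BDI_CAP_ACCT_UNSTABLE constant mirrors the
patch hunk:

    #include <stdbool.h>
    #include <stdio.h>

    #define BDI_CAP_ACCT_UNSTABLE 0x00000200

    struct backing_dev_info { unsigned int capabilities; };

    /* Only bdi's that set the bit (i.e. NFS) pay for the extra
     * per-bdi unstable counter read. */
    static bool bdi_cap_account_unstable(const struct backing_dev_info *bdi)
    {
        return bdi->capabilities & BDI_CAP_ACCT_UNSTABLE;
    }

    int main(void)
    {
        struct backing_dev_info disk = { 0 };
        struct backing_dev_info nfs  = { BDI_CAP_ACCT_UNSTABLE };

        printf("disk accounts unstable: %d\n", bdi_cap_account_unstable(&disk));
        printf("nfs  accounts unstable: %d\n", bdi_cap_account_unstable(&nfs));
        return 0;
    }

Non-NFS devices never set the bit, so balance_dirty_pages() skips one
per-bdi counter sum per iteration for them, which is the speed-up the
changelog claims.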
* [PATCH 4/5] VM/NFS: The VM must tell the filesystem when to free reclaimable pages [not found] ` <20100107045330.5986.55090.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 2010-01-07 4:53 ` [PATCH 3/5] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices Trond Myklebust @ 2010-01-07 4:53 ` Trond Myklebust 2010-01-07 4:53 ` [PATCH 2/5] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE Trond Myklebust ` (2 subsequent siblings) 4 siblings, 0 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 4:53 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org balance_dirty_pages() should really tell the filesystem whether or not it has an excess of actual dirty pages, or whether it would be more useful to start freeing up the unstable writes. Assume that if the number of unstable writes is more than 1/2 the number of reclaimable pages, then we should force NFS to free up the former. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Acked-by: Jan Kara <jack@suse.cz> --- fs/nfs/write.c | 2 +- include/linux/writeback.h | 5 +++++ mm/page-writeback.c | 12 ++++++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 36549b1..978de7f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1417,7 +1417,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping, /* Don't commit yet if this is a non-blocking flush and there are * outstanding writes for this mapping. */ - if (wbc->sync_mode != WB_SYNC_ALL && + if (!wbc->force_commit && wbc->sync_mode != WB_SYNC_ALL && radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, NFS_PAGE_TAG_LOCKED)) { mark_inode_unstable_pages(inode); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 76e8903..3fd5c3e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -62,6 +62,11 @@ struct writeback_control { * so we use a single control to update them */ unsigned no_nrwrite_index_update:1; + /* + * The following is used by balance_dirty_pages() to + * force NFS to commit unstable pages. + */ + unsigned force_commit:1; }; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c06739b..c537543 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -503,6 +503,7 @@ static void balance_dirty_pages(struct address_space *mapping, .nr_to_write = write_chunk, .range_cyclic = 1, }; + long bdi_nr_unstable = 0; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -512,8 +513,10 @@ static void balance_dirty_pages(struct address_space *mapping, nr_writeback = global_page_state(NR_WRITEBACK); bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); - if (bdi_cap_account_unstable(bdi)) - bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); + if (bdi_cap_account_unstable(bdi)) { + bdi_nr_unstable = bdi_stat(bdi, BDI_UNSTABLE); + bdi_nr_reclaimable += bdi_nr_unstable; + } bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -541,6 +544,11 @@ static void balance_dirty_pages(struct address_space *mapping, * up. */ if (bdi_nr_reclaimable > bdi_thresh) { + wbc.force_commit = 0; + /* Force NFS to also free up unstable writes. 
*/ + if (bdi_nr_unstable > bdi_nr_reclaimable / 2) + wbc.force_commit = 1; + writeback_inodes_wbc(&wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, ^ permalink raw reply related [flat|nested] 96+ messages in thread
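Pulled out as a pure function, the heuristic this patch wires into
balance_dirty_pages() is just a ratio test. A sketch using the patch's
threshold; the page counts are made up for the illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /* Once unstable pages make up more than half of the bdi's
     * reclaimable total, force NFS to COMMIT them. */
    static bool want_force_commit(long bdi_nr_unstable, long bdi_nr_reclaimable)
    {
        return bdi_nr_unstable > bdi_nr_reclaimable / 2;
    }

    int main(void)
    {
        /* reclaimable = dirty + unstable, as computed in the patch */
        long dirty = 300, unstable = 500;
        long reclaimable = dirty + unstable;

        printf("force_commit = %d\n", want_force_commit(unstable, reclaimable));
        return 0;
    }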
* [PATCH 2/5] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE [not found] ` <20100107045330.5986.55090.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 2010-01-07 4:53 ` [PATCH 3/5] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices Trond Myklebust 2010-01-07 4:53 ` [PATCH 4/5] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust @ 2010-01-07 4:53 ` Trond Myklebust 2010-01-07 4:53 ` [PATCH 5/5] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set Trond Myklebust 2010-01-07 4:53 ` [PATCH 1/5] VFS: Ensure that writeback_single_inode() commits unstable writes Trond Myklebust 4 siblings, 0 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 4:53 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org From: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Acked-by: Jan Kara <jack@suse.cz> --- fs/nfs/write.c | 6 +++--- include/linux/backing-dev.h | 3 ++- mm/backing-dev.c | 6 ++++-- mm/filemap.c | 2 +- mm/page-writeback.c | 16 ++++++++++------ mm/truncate.c | 2 +- 6 files changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 910be28..36549b1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -440,7 +440,7 @@ nfs_mark_request_commit(struct nfs_page *req) NFS_PAGE_TAG_COMMIT); spin_unlock(&inode->i_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); + inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE); mark_inode_unstable_pages(inode); } @@ -451,7 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req) if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE); return 1; } return 0; @@ -1322,7 +1322,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) nfs_mark_request_commit(req); dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); + BDI_UNSTABLE); nfs_clear_page_tag_locked(req); } return -ENOMEM; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fcbc26a..42c3e2a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -36,7 +36,8 @@ enum bdi_state { typedef int (congested_fn)(void *, int); enum bdi_stat_item { - BDI_RECLAIMABLE, + BDI_DIRTY, + BDI_UNSTABLE, BDI_WRITEBACK, NR_BDI_STAT_ITEMS }; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0e8ca03..88f3655 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -88,7 +88,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, "BdiWriteback: %8lu kB\n" - "BdiReclaimable: %8lu kB\n" + "BdiDirty: %8lu kB\n" + "BdiUnstable: %8lu kB\n" "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" "BackgroundThresh: %8lu kB\n" @@ -102,7 +103,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "wb_list: %8u\n" "wb_cnt: %8u\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), - (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), + (unsigned long) K(bdi_stat(bdi, BDI_DIRTY)), + (unsigned 
long) K(bdi_stat(bdi, BDI_UNSTABLE)), K(bdi_thresh), K(dirty_thresh), K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, diff --git a/mm/filemap.c b/mm/filemap.c index 96ac6b0..458387d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -136,7 +136,7 @@ void __remove_from_page_cache(struct page *page) */ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b19943..23d3fc6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -272,7 +272,8 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, else avail_dirty = 0; - avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + + avail_dirty += bdi_stat(bdi, BDI_DIRTY) + + bdi_stat(bdi, BDI_UNSTABLE) + bdi_stat(bdi, BDI_WRITEBACK); *pbdi_dirty = min(*pbdi_dirty, avail_dirty); @@ -509,7 +510,8 @@ static void balance_dirty_pages(struct address_space *mapping, global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + + bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -554,10 +556,12 @@ static void balance_dirty_pages(struct address_space *mapping, * deltas. */ if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) + + bdi_stat_sum(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); } else if (bdi_nr_reclaimable) { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + + bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); } @@ -1079,7 +1083,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } @@ -1255,7 +1259,7 @@ int clear_page_dirty_for_io(struct page *page) if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); + BDI_DIRTY); return 1; } return 0; diff --git a/mm/truncate.c b/mm/truncate.c index 342deee..b0ce8fb 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -75,7 +75,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) if (mapping && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); + BDI_DIRTY); if (account_size) task_io_account_cancelled_write(account_size); } ^ permalink raw reply related [flat|nested] 96+ messages in thread
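The accounting split can be modelled in a few lines: where
BDI_RECLAIMABLE used to be a single bucket, the patch keeps dirty
page-cache pages and unstable NFS writes in separate counters and lets
callers sum them when they want the old value. Stand-in types; the enum
mirrors the patch:

    #include <stdio.h>

    enum bdi_stat_item { BDI_DIRTY, BDI_UNSTABLE, BDI_WRITEBACK,
                         NR_BDI_STAT_ITEMS };

    struct backing_dev_info { long stat[NR_BDI_STAT_ITEMS]; };

    /* old BDI_RECLAIMABLE == new BDI_DIRTY + BDI_UNSTABLE */
    static long bdi_reclaimable(const struct backing_dev_info *bdi)
    {
        return bdi->stat[BDI_DIRTY] + bdi->stat[BDI_UNSTABLE];
    }

    int main(void)
    {
        struct backing_dev_info bdi = { .stat = { 120, 80, 40 } };
        printf("reclaimable = %ld\n", bdi_reclaimable(&bdi));
        return 0;
    }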
* [PATCH 5/5] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set [not found] ` <20100107045330.5986.55090.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> ` (2 preceding siblings ...) 2010-01-07 4:53 ` [PATCH 2/5] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE Trond Myklebust @ 2010-01-07 4:53 ` Trond Myklebust 2010-01-07 4:53 ` [PATCH 1/5] VFS: Ensure that writeback_single_inode() commits unstable writes Trond Myklebust 4 siblings, 0 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 4:53 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/write.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 978de7f..d6d8048 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1423,7 +1423,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping, mark_inode_unstable_pages(inode); return 0; } - if (wbc->nonblocking) + if (wbc->nonblocking || wbc->for_background) flags = 0; ret = nfs_commit_inode(inode, flags); if (ret > 0) ^ permalink raw reply related [flat|nested] 96+ messages in thread
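The one-liner's effect on the COMMIT flags, modelled standalone;
FLUSH_SYNC's numeric value here is a stand-in, not the NFS constant:

    #include <stdio.h>

    #define FLUSH_SYNC 1   /* stand-in value */

    struct wbc { int nonblocking; int for_background; };

    /* Background and non-blocking writeback send COMMIT as an async
     * RPC (flags == 0) instead of waiting on FLUSH_SYNC. */
    static int commit_flags(const struct wbc *wbc)
    {
        if (wbc->nonblocking || wbc->for_background)
            return 0;            /* fire-and-forget COMMIT */
        return FLUSH_SYNC;       /* wait for the COMMIT reply */
    }

    int main(void)
    {
        struct wbc bg = { .nonblocking = 0, .for_background = 1 };
        printf("background commit flags = %d\n", commit_flags(&bg));
        return 0;
    }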
* [PATCH 1/5] VFS: Ensure that writeback_single_inode() commits unstable writes [not found] ` <20100107045330.5986.55090.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> ` (3 preceding siblings ...) 2010-01-07 4:53 ` [PATCH 5/5] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set Trond Myklebust @ 2010-01-07 4:53 ` Trond Myklebust 4 siblings, 0 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 4:53 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org If the call to do_writepages() succeeded in starting writeback, we do not know whether or not we will need to COMMIT any unstable writes until after the write RPC calls are finished. Currently, we assume that at least one write RPC call will have finished, and set I_DIRTY_DATASYNC by the time do_writepages is done, so that write_inode() is triggered. In order to ensure reliable operation (i.e. ensure that a single call to writeback_single_inode() with WB_SYNC_ALL set suffices to ensure that pages are on disk) we need to first wait for filemap_fdatawait() to complete, then test for unstable pages. Since NFS is currently the only filesystem that has unstable pages, we can add a new inode state I_UNSTABLE_PAGES that NFS alone will set. When set, this will trigger a callback to a new address_space_operation to call the COMMIT. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Acked-by: Jan Kara <jack@suse.cz> --- fs/fs-writeback.c | 31 ++++++++++++++++++++++++++++++- fs/nfs/file.c | 1 + fs/nfs/inode.c | 16 ---------------- fs/nfs/internal.h | 3 ++- fs/nfs/super.c | 2 -- fs/nfs/write.c | 33 ++++++++++++++++++++++++++++++++- include/linux/fs.h | 9 +++++++++ 7 files changed, 74 insertions(+), 21 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1a7c42c..3640769 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -389,6 +389,17 @@ static int write_inode(struct inode *inode, int sync) } /* + * Commit the NFS unstable pages. + */ +static int commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + if (mapping->a_ops && mapping->a_ops->commit_unstable_pages) + return mapping->a_ops->commit_unstable_pages(mapping, wbc); + return 0; +} + +/* * Wait for writeback on an inode to complete. */ static void inode_wait_for_writeback(struct inode *inode) @@ -475,6 +486,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } spin_lock(&inode_lock); + /* + * Special state for cleaning NFS unstable pages + */ + if (inode->i_state & I_UNSTABLE_PAGES) { + int err; + inode->i_state &= ~I_UNSTABLE_PAGES; + spin_unlock(&inode_lock); + err = commit_unstable_pages(mapping, wbc); + if (ret == 0) + ret = err; + spin_lock(&inode_lock); + } inode->i_state &= ~I_SYNC; if (!(inode->i_state & (I_FREEING | I_CLEAR))) { if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { @@ -533,6 +556,12 @@ select_queue: inode->i_state |= I_DIRTY_PAGES; redirty_tail(inode); } + } else if (inode->i_state & I_UNSTABLE_PAGES) { + /* + * The inode has got yet more unstable pages to + * commit. Requeue... 
+ */ + redirty_tail(inode); } else if (atomic_read(&inode->i_count)) { /* * The inode is clean, inuse @@ -1051,7 +1080,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) spin_lock(&inode_lock); if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; + const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES); inode->i_state |= flags; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6b89132..67e50ac 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = { .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, .error_remove_page = generic_error_remove_page, + .commit_unstable_pages = nfs_commit_unstable_pages, }; /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index faa0918..8341709 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid) return ino; } -int nfs_write_inode(struct inode *inode, int sync) -{ - int ret; - - if (sync) { - ret = filemap_fdatawait(inode->i_mapping); - if (ret == 0) - ret = nfs_commit_inode(inode, FLUSH_SYNC); - } else - ret = nfs_commit_inode(inode, 0); - if (ret >= 0) - return 0; - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; -} - void nfs_clear_inode(struct inode *inode) { /* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 29e464d..7bb326f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); -extern int nfs_write_inode(struct inode *,int); extern void nfs_clear_inode(struct inode *); #ifdef CONFIG_NFS_V4 extern void nfs4_clear_inode(struct inode *); @@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server, extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ +extern int nfs_commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ce907ef..805c1a0 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = { static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, - .write_inode = nfs_write_inode, .statfs = nfs_statfs, .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, @@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = { static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, - .write_inode = nfs_write_inode, .statfs = nfs_statfs, .clear_inode = nfs4_clear_inode, .umount_begin = nfs_umount_begin, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d171696..910be28 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) spin_unlock(&inode->i_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + mark_inode_unstable_pages(inode); } static int @@ -1406,11 +1406,42 @@ int nfs_commit_inode(struct inode *inode, int how) } return res; } + +int 
nfs_commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + int flags = FLUSH_SYNC; + int ret; + + /* Don't commit yet if this is a non-blocking flush and there are + * outstanding writes for this mapping. + */ + if (wbc->sync_mode != WB_SYNC_ALL && + radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, + NFS_PAGE_TAG_LOCKED)) { + mark_inode_unstable_pages(inode); + return 0; + } + if (wbc->nonblocking) + flags = 0; + ret = nfs_commit_inode(inode, flags); + if (ret > 0) + ret = 0; + return ret; +} + #else static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) { return 0; } + +int nfs_commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return 0; +} #endif long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9147ca8..de594b3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -602,6 +602,8 @@ struct address_space_operations { int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); int (*error_remove_page)(struct address_space *, struct page *); + int (*commit_unstable_pages)(struct address_space *, + struct writeback_control *); }; /* @@ -1635,6 +1637,8 @@ struct super_operations { #define I_CLEAR 64 #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) +#define __I_UNSTABLE_PAGES 8 +#define I_UNSTABLE_PAGES (1 << __I_UNSTABLE_PAGES) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) @@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +static inline void mark_inode_unstable_pages(struct inode *inode) +{ + __mark_inode_dirty(inode, I_UNSTABLE_PAGES); +} + /** * inc_nlink - directly increment an inode's link count * @inode: inode ^ permalink raw reply related [flat|nested] 96+ messages in thread
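The control flow this patch gives writeback_single_inode() is worth
seeing in isolation. A userspace sketch with stub functions standing in
for the kernel ones; only the ordering (write, optionally wait, then
commit) is taken from the patch:

    #include <stdbool.h>
    #include <stdio.h>

    struct inode { unsigned state; };
    #define I_UNSTABLE_PAGES 0x100   /* stand-in value */

    /* starting WRITE RPCs leaves pages unstable on an NFS server */
    static void do_writepages(struct inode *i)     { i->state |= I_UNSTABLE_PAGES; }
    static void filemap_fdatawait(struct inode *i) { (void)i; /* wait for WRITEs */ }
    static int  commit_unstable_pages(struct inode *i)
    {
        i->state &= ~I_UNSTABLE_PAGES;
        return 0;
    }

    static int writeback_single_inode(struct inode *inode, bool wait)
    {
        do_writepages(inode);
        if (wait)
            filemap_fdatawait(inode);          /* WB_SYNC_ALL only */
        if (inode->state & I_UNSTABLE_PAGES)   /* tested after the wait */
            return commit_unstable_pages(inode);
        return 0;
    }

    int main(void)
    {
        struct inode ino = { 0 };
        writeback_single_inode(&ino, true);
        printf("state after sync = 0x%x\n", ino.state);
        return 0;
    }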
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes [not found] ` <1262839082.2185.15.camel@localhost> 2010-01-07 4:48 ` Wu Fengguang @ 2010-01-07 14:56 ` Wu Fengguang 2010-01-07 15:10 ` Trond Myklebust 1 sibling, 1 reply; 96+ messages in thread From: Wu Fengguang @ 2010-01-07 14:56 UTC (permalink / raw) To: Myklebust, Trond Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, Jan 07, 2010 at 12:38:02PM +0800, Myklebust, Trond wrote: > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > > > index d171696..910be28 100644 > > > --- a/fs/nfs/write.c > > > +++ b/fs/nfs/write.c > > > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > > > spin_unlock(&inode->i_lock); > > > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > > > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > > > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > > > + mark_inode_unstable_pages(inode); > > > > Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size. > > Why? The NFS client itself shouldn't ever set I_DIRTY_DATASYNC after > this patch is applied. We won't ever need it. > > If the VM or VFS is doing it, then they ought to be fixed: there is no > reason to assume that all filesystems need to sync their inodes on > i_size changes. Sorry, one more question. It seems to me that you are replacing I_DIRTY_DATASYNC => write_inode() with I_UNSTABLE_PAGES => commit_unstable_pages() Is that change for the sake of clarity? Or to fix some problem? (This patch does fix some problems, but do they inherently require the above change?) Thanks, Fengguang ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes 2010-01-07 14:56 ` [PATCH 1/6] " Wu Fengguang @ 2010-01-07 15:10 ` Trond Myklebust 2010-01-08 1:17 ` Wu Fengguang 2010-01-08 9:25 ` Christoph Hellwig 0 siblings, 2 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 15:10 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, 2010-01-07 at 22:56 +0800, Wu Fengguang wrote: > On Thu, Jan 07, 2010 at 12:38:02PM +0800, Myklebust, Trond wrote: > > > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > > > > index d171696..910be28 100644 > > > > --- a/fs/nfs/write.c > > > > +++ b/fs/nfs/write.c > > > > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > > > > spin_unlock(&inode->i_lock); > > > > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > > > > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > > > > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > > > > + mark_inode_unstable_pages(inode); > > > > > > Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size. > > > > Why? The NFS client itself shouldn't ever set I_DIRTY_DATASYNC after > > this patch is applied. We won't ever need it. > > > > If the VM or VFS is doing it, then they ought to be fixed: there is no > > reason to assume that all filesystems need to sync their inodes on > > i_size changes. > > Sorry, one more question. > > It seems to me that you are replacing > > I_DIRTY_DATASYNC => write_inode() > with > I_UNSTABLE_PAGES => commit_unstable_pages() > > Is that change for the sake of clarity? Or to fix some problem? > (This patch does fix some problems, but do they inherently require > the above change?) As I said previously, the write_inode() call is done _before_ you sync the dirty pages to the server, whereas commit_unstable_pages() wants to be done _after_ syncing. So the two are not the same, and we cannot replace commit_unstable_pages() with write_inode(). Replacing I_DIRTY_DATASYNC with I_UNSTABLE_PAGES is more for the sake of clarity. The difference between the two is that in the I_UNSTABLE_PAGES case, the inode itself isn't actually dirty; it just contains pages that are not guaranteed to be on permanent storage until we commit. Cheers Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes
  2010-01-07 15:10 ` Trond Myklebust
@ 2010-01-08  1:17 ` Wu Fengguang
  2010-01-08  1:37 ` Trond Myklebust
  0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2010-01-08 1:17 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 11:10:22PM +0800, Trond Myklebust wrote:
> On Thu, 2010-01-07 at 22:56 +0800, Wu Fengguang wrote:
> > On Thu, Jan 07, 2010 at 12:38:02PM +0800, Myklebust, Trond wrote:
> > > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> > > > > index d171696..910be28 100644
> > > > > --- a/fs/nfs/write.c
> > > > > +++ b/fs/nfs/write.c
> > > > > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req)
> > > > >  	spin_unlock(&inode->i_lock);
> > > > >  	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
> > > > >  	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
> > > > > -	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
> > > > > +	mark_inode_unstable_pages(inode);
> > > >
> > > > Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size.
> > >
> > > Why? The NFS client itself shouldn't ever set I_DIRTY_DATASYNC after
> > > this patch is applied. We won't ever need it.
> > >
> > > If the VM or VFS is doing it, then they ought to be fixed: there is no
> > > reason to assume that all filesystems need to sync their inodes on
> > > i_size changes.
> >
> > Sorry, one more question.
> >
> > It seems to me that you are replacing
> >
> > 	I_DIRTY_DATASYNC => write_inode()
> > with
> > 	I_UNSTABLE_PAGES => commit_unstable_pages()
> >
> > Is that change for the sake of clarity? Or to fix some problem?
> > (This patch does fix some problems, but do they inherently require
> > the above change?)
>
> As I said previously, the write_inode() call is done _before_ you sync
> the dirty pages to the server, whereas commit_unstable_pages() wants to
> be done _after_ syncing. So the two are not the same, and we cannot
> replace commit_unstable_pages() with write_inode().

This is the ordering:

	0 do_writepages()
	1 if (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
	2 	write_inode()
	3 if (wait)
	4 	filemap_fdatawait()
	5 if (I_UNSTABLE_PAGES)
	6 	commit_unstable_pages()

The page is synced to the NFS server in line 0. The only difference is
that write_inode() is called before filemap_fdatawait(), while
commit_unstable_pages() is called after it.

Note that filemap_fdatawait() will only be called on WB_SYNC_ALL, so I
still cannot understand the difference..

> Replacing I_DIRTY_DATASYNC with I_UNSTABLE_PAGES is more for the sake of
> clarity. The difference between the two is that in the I_UNSTABLE_PAGES
> case, the inode itself isn't actually dirty; it just contains pages that
> are not guaranteed to be on permanent storage until we commit.

And I_UNSTABLE_PAGES is necessary for calling commit_unstable_pages() :)

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes 2010-01-08 1:17 ` Wu Fengguang @ 2010-01-08 1:37 ` Trond Myklebust 2010-01-08 1:53 ` Wu Fengguang 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-08 1:37 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Fri, 2010-01-08 at 09:17 +0800, Wu Fengguang wrote: > On Thu, Jan 07, 2010 at 11:10:22PM +0800, Trond Myklebust wrote: > > On Thu, 2010-01-07 at 22:56 +0800, Wu Fengguang wrote: > > > On Thu, Jan 07, 2010 at 12:38:02PM +0800, Myklebust, Trond wrote: > > > > > > > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > > > > > > index d171696..910be28 100644 > > > > > > --- a/fs/nfs/write.c > > > > > > +++ b/fs/nfs/write.c > > > > > > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > > > > > > spin_unlock(&inode->i_lock); > > > > > > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > > > > > > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > > > > > > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > > > > > > + mark_inode_unstable_pages(inode); > > > > > > > > > > Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size. > > > > > > > > Why? The NFS client itself shouldn't ever set I_DIRTY_DATASYNC after > > > > this patch is applied. We won't ever need it. > > > > > > > > If the VM or VFS is doing it, then they ought to be fixed: there is no > > > > reason to assume that all filesystems need to sync their inodes on > > > > i_size changes. > > > > > > Sorry, one more question. > > > > > > It seems to me that you are replacing > > > > > > I_DIRTY_DATASYNC => write_inode() > > > with > > > I_UNSTABLE_PAGES => commit_unstable_pages() > > > > > > Is that change for the sake of clarity? Or to fix some problem? > > > (This patch does fix some problems, but do they inherently require > > > the above change?) > > > > As I said previously, the write_inode() call is done _before_ you sync > > the dirty pages to the server, whereas commit_unstable_pages() wants to > > be done _after_ syncing. So the two are not the same, and we cannot > > replace commit_unstable_pages() with write_inode(). > > This is the ordering: > > 0 do_writepages() > 1 if (I_DIRTY_SYNC | I_DIRTY_DATASYNC) > 2 write_inode() > 3 if (wait) > 4 filemap_fdatawait() > 5 if (I_UNSTABLE_PAGES) > 6 commit_unstable_pages() > > The page is synced to NFS server in line 0. > > The only difference is write_inode() is called before filemap_fdatawait(), > while commit_unstable_pages() is called after it. > > Note that filemap_fdatawait() will only be called on WB_SYNC_ALL, so I > still cannot understand the difference.. The difference is precisely that... In the case of WB_SYNC_ALL we want the call to filemap_fdatawait() to occur before we call commit_unstable_pages(), so that we know that all the in-flight write rpc calls are done before we ask that they be committed to stable storage. In the case of WB_SYNC_NONE, there is no wait, and so we are forced to play games with heuristics and/or add the force_commit_unstable flag because we don't wait for the dirty pages to be cleaned. I don't like this, but those are the semantics that we've defined for WB_SYNC_NONE. Cheers Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes 2010-01-08 1:37 ` Trond Myklebust @ 2010-01-08 1:53 ` Wu Fengguang 0 siblings, 0 replies; 96+ messages in thread From: Wu Fengguang @ 2010-01-08 1:53 UTC (permalink / raw) To: Trond Myklebust Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Fri, Jan 08, 2010 at 09:37:31AM +0800, Trond Myklebust wrote: > On Fri, 2010-01-08 at 09:17 +0800, Wu Fengguang wrote: > > On Thu, Jan 07, 2010 at 11:10:22PM +0800, Trond Myklebust wrote: > > > On Thu, 2010-01-07 at 22:56 +0800, Wu Fengguang wrote: > > > > On Thu, Jan 07, 2010 at 12:38:02PM +0800, Myklebust, Trond wrote: > > > > > > > > > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > > > > > > > index d171696..910be28 100644 > > > > > > > --- a/fs/nfs/write.c > > > > > > > +++ b/fs/nfs/write.c > > > > > > > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > > > > > > > spin_unlock(&inode->i_lock); > > > > > > > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > > > > > > > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > > > > > > > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > > > > > > > + mark_inode_unstable_pages(inode); > > > > > > > > > > > > Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size. > > > > > > > > > > Why? The NFS client itself shouldn't ever set I_DIRTY_DATASYNC after > > > > > this patch is applied. We won't ever need it. > > > > > > > > > > If the VM or VFS is doing it, then they ought to be fixed: there is no > > > > > reason to assume that all filesystems need to sync their inodes on > > > > > i_size changes. > > > > > > > > Sorry, one more question. > > > > > > > > It seems to me that you are replacing > > > > > > > > I_DIRTY_DATASYNC => write_inode() > > > > with > > > > I_UNSTABLE_PAGES => commit_unstable_pages() > > > > > > > > Is that change for the sake of clarity? Or to fix some problem? > > > > (This patch does fix some problems, but do they inherently require > > > > the above change?) > > > > > > As I said previously, the write_inode() call is done _before_ you sync > > > the dirty pages to the server, whereas commit_unstable_pages() wants to > > > be done _after_ syncing. So the two are not the same, and we cannot > > > replace commit_unstable_pages() with write_inode(). > > > > This is the ordering: > > > > 0 do_writepages() > > 1 if (I_DIRTY_SYNC | I_DIRTY_DATASYNC) > > 2 write_inode() > > 3 if (wait) > > 4 filemap_fdatawait() > > 5 if (I_UNSTABLE_PAGES) > > 6 commit_unstable_pages() > > > > The page is synced to NFS server in line 0. > > > > The only difference is write_inode() is called before filemap_fdatawait(), > > while commit_unstable_pages() is called after it. > > > > Note that filemap_fdatawait() will only be called on WB_SYNC_ALL, so I > > still cannot understand the difference.. > > The difference is precisely that... Thanks, I got it. > In the case of WB_SYNC_ALL we want the call to filemap_fdatawait() to > occur before we call commit_unstable_pages(), so that we know that all > the in-flight write rpc calls are done before we ask that they be > committed to stable storage. That's good order for WB_SYNC_ALL. 
However this is optimizing a minor case, and what I care about is
WB_SYNC_NONE :)

> In the case of WB_SYNC_NONE, there is no wait, and so we are forced to
> play games with heuristics and/or add the force_commit_unstable flag
> because we don't wait for the dirty pages to be cleaned. I don't like
> this, but those are the semantics that we've defined for WB_SYNC_NONE.

For WB_SYNC_NONE we will now also wait for WRITE completion, with the
combination of the NFS_PAGE_TAG_LOCKED-based bail-out and redirty_tail().
This is retry based, so less elegant.

But that's not the whole story. The I_UNSTABLE_PAGES +
commit_unstable_pages() scheme seems elegant for WB_SYNC_ALL, however it
may break the pipeline for big files in a perfect loop:

	loop {
		WRITE 4MB
		COMMIT 4MB
	}

while the retry based WB_SYNC_ALL will keep backing off COMMITs because
do_writepages() keeps submitting new WRITEs. So its loop would be

	loop {
		WRITE 4MB
		<skip COMMIT>
		WRITE 4MB
		<skip COMMIT>
		WRITE 4MB
		<skip COMMIT>
		WRITE 4MB
		<skip COMMIT>
		...
		<redirty_tail timeout>
		COMMIT 400MB
	}

That can be improved by lifting the writeback chunk size from 4MB to
>=128MB.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
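The two COMMIT patterns contrasted above, as a toy model; 4MB and 400MB
are the example figures from the message, and the "batched" arm simply
assumes every per-chunk COMMIT gets skipped until the redirty_tail
timeout:

    #include <stdio.h>

    int main(void)
    {
        const int total_mb = 400, chunk_mb = 4;

        /* pipelined: COMMIT after every writeback chunk */
        int commits = 0;
        for (int done = 0; done < total_mb; done += chunk_mb)
            commits++;
        printf("pipelined : %d COMMITs of %d MB each\n", commits, chunk_mb);

        /* retry-based: COMMITs skipped while WRITEs keep arriving */
        printf("batched   : 1 COMMIT of %d MB after redirty_tail timeout\n",
               total_mb);
        return 0;
    }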
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes
  2010-01-07 15:10 ` Trond Myklebust
  2010-01-08  1:17 ` Wu Fengguang
@ 2010-01-08  9:25 ` Christoph Hellwig
  2010-01-08 13:46 ` Trond Myklebust
  1 sibling, 1 reply; 96+ messages in thread
From: Christoph Hellwig @ 2010-01-08 9:25 UTC (permalink / raw)
To: Trond Myklebust
Cc: Wu Fengguang, Peter Zijlstra, Jan Kara, Steve Rago,
    linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach,
    Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 10:10:22AM -0500, Trond Myklebust wrote:
> As I said previously, the write_inode() call is done _before_ you sync
> the dirty pages to the server, whereas commit_unstable_pages() wants to
> be done _after_ syncing. So the two are not the same, and we cannot
> replace commit_unstable_pages() with write_inode().

But that's more an accident of how this code was written. The right
order needs to be to write the pages first, then call write_inode. Most
modern filesystems have to work around this in their write_inode method
by waiting for the pages themselves. I already fixed the same ordering
issue in fsync, and the writeback code is next on the agenda.

^ permalink raw reply	[flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes 2010-01-08 9:25 ` Christoph Hellwig @ 2010-01-08 13:46 ` Trond Myklebust 2010-01-08 13:54 ` Christoph Hellwig 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-08 13:46 UTC (permalink / raw) To: Christoph Hellwig Cc: Wu Fengguang, Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Fri, 2010-01-08 at 04:25 -0500, Christoph Hellwig wrote: > On Thu, Jan 07, 2010 at 10:10:22AM -0500, Trond Myklebust wrote: > > As I said previously, the write_inode() call is done _before_ you sync > > the dirty pages to the server, whereas commit_unstable_pages() wants to > > be done _after_ syncing. So the two are not the same, and we cannot > > replace commit_unstable_pages() with write_inode(). > > But that's more an accident of how this code was written. The right > order nees to be to write the pages first, then call write_inode. Most > modern filesystems have to work around this in their write_inode method > by waiting for the pages themselves. I already fixed the same ordering > issue in fsync, and the writeback code is next on the agenda. > Could we in that case replace write_inode() with something that takes a struct writeback_control? It is very useful to have full information about the write range and flags as it allows us to tweak the COMMIT RPC call. Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes
  2010-01-08 13:46 ` Trond Myklebust
@ 2010-01-08 13:54 ` Christoph Hellwig
  2010-01-08 14:15 ` Trond Myklebust
  0 siblings, 1 reply; 96+ messages in thread
From: Christoph Hellwig @ 2010-01-08 13:54 UTC (permalink / raw)
To: Trond Myklebust
Cc: Christoph Hellwig, Wu Fengguang, Peter Zijlstra, Jan Kara,
    Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach,
    Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Fri, Jan 08, 2010 at 08:46:46AM -0500, Trond Myklebust wrote:
> Could we in that case replace write_inode() with something that takes a
> struct writeback_control? It is very useful to have full information
> about the write range and flags as it allows us to tweak the COMMIT RPC
> call.

At this point I do not plan to change the write_inode interface. But
changing the ->write_inode operation to take a writeback control instead
of the sync flag should be a pretty easy change if you want to do it.

^ permalink raw reply	[flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes 2010-01-08 13:54 ` Christoph Hellwig @ 2010-01-08 14:15 ` Trond Myklebust 0 siblings, 0 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-08 14:15 UTC (permalink / raw) To: Christoph Hellwig Cc: Wu Fengguang, Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Fri, 2010-01-08 at 08:54 -0500, Christoph Hellwig wrote: > On Fri, Jan 08, 2010 at 08:46:46AM -0500, Trond Myklebust wrote: > > Could we in that case replace write_inode() with something that takes a > > struct writeback_control? It is very useful to have full information > > about the write range and flags as it allows us to tweak the COMMIT RPC > > call. > > At this point I do not plan to change the write_inode interface. But > changing the ->write_inode operation to take a writeback control instead > of the sync flag should be a prety easy change if you want to do it. > OK. Please could you let me know when you're done with your changes so that I can adapt this patch? Cheers Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
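What the floated interface change would look like from the filesystem's
side: ->write_inode() receiving a writeback_control rather than a bare
sync flag, so a COMMIT could be shaped by the write range. A sketch of
the proposal as discussed, not a merged signature; the field names follow
the wbc fields quoted elsewhere in the thread:

    #include <stdio.h>

    enum { WB_SYNC_NONE, WB_SYNC_ALL };

    struct writeback_control {
        int       sync_mode;
        long long range_start;
        long long range_end;
    };

    /* stand-in for what an NFS ->write_inode(inode, wbc) could do */
    static void nfs_write_inode_wbc(const struct writeback_control *wbc)
    {
        printf("COMMIT range [%lld, %lld], %s\n",
               wbc->range_start, wbc->range_end,
               wbc->sync_mode == WB_SYNC_ALL ? "waiting" : "async");
    }

    int main(void)
    {
        struct writeback_control wbc = {
            .sync_mode = WB_SYNC_ALL, .range_start = 0, .range_end = 4 << 20,
        };
        nfs_write_inode_wbc(&wbc);
        return 0;
    }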
* [PATCH 6/6] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
@ 2010-01-06 20:51 ` Trond Myklebust
6 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 978de7f..d6d8048 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1423,7 +1423,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping,
 		mark_inode_unstable_pages(inode);
 		return 0;
 	}
-	if (wbc->nonblocking)
+	if (wbc->nonblocking || wbc->for_background)
 		flags = 0;
 	ret = nfs_commit_inode(inode, flags);
 	if (ret > 0)
* Re: [PATCH 6/6] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
@ 2010-01-07  2:32 ` Wu Fengguang
0 siblings, 0 replies; 96+ messages in thread
From: Wu Fengguang @ 2010-01-07 2:32 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote:
> @@ -1423,7 +1423,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping,
>  		mark_inode_unstable_pages(inode);
>  		return 0;
>  	}
> -	if (wbc->nonblocking)
> +	if (wbc->nonblocking || wbc->for_background)
>  		flags = 0;
>  	ret = nfs_commit_inode(inode, flags);
>  	if (ret > 0)

Acked-by: Wu Fengguang <fengguang.wu@intel.com>
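To make the one-line change above concrete: a simplified sketch of the
control flow it modifies. Only the nonblocking/for_background condition is
from the actual patch; the rest of the function body here is an
illustration that assumes FLUSH_SYNC is the default commit flag, so that
clearing it turns the COMMIT into a fire-and-forget RPC call.

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/nfs_fs.h>

/* Illustrative sketch, not the real nfs_commit_unstable_pages() body. */
static int example_commit_unstable_pages(struct address_space *mapping,
					 struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	int flags = FLUSH_SYNC;	/* default: wait for the COMMIT reply */

	if (wbc->nonblocking || wbc->for_background)
		flags = 0;	/* asynchronous COMMIT for background writeback */
	return nfs_commit_inode(inode, flags);
}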
* [PATCH 4/6] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
@ 2010-01-06 20:51 ` Trond Myklebust
2010-01-07 1:56 ` Wu Fengguang
6 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

Speeds up the accounting in balance_dirty_pages() for non-nfs devices.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
---
 fs/nfs/client.c             |    1 +
 include/linux/backing-dev.h |    6 ++++++
 mm/page-writeback.c         |   16 +++++++++++-----
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ee77713..d0b060a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -890,6 +890,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
 	server->backing_dev_info.name = "nfs";
 	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
+	server->backing_dev_info.capabilities |= BDI_CAP_ACCT_UNSTABLE;
 
 	if (server->wsize > max_rpc_payload)
 		server->wsize = max_rpc_payload;

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 42c3e2a..8b45166 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -232,6 +232,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 #define BDI_CAP_EXEC_MAP	0x00000040
 #define BDI_CAP_NO_ACCT_WB	0x00000080
 #define BDI_CAP_SWAP_BACKED	0x00000100
+#define BDI_CAP_ACCT_UNSTABLE	0x00000200
 
 #define BDI_CAP_VMFLAGS \
 	(BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
@@ -311,6 +312,11 @@ static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
 	return bdi == &default_backing_dev_info;
 }
 
+static inline bool bdi_cap_account_unstable(struct backing_dev_info *bdi)
+{
+	return bdi->capabilities & BDI_CAP_ACCT_UNSTABLE;
+}
+
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
 	return bdi_cap_writeback_dirty(mapping->backing_dev_info);

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index aa26b0f..d90a0db 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -273,8 +273,9 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
 		avail_dirty = 0;
 
 	avail_dirty += bdi_stat(bdi, BDI_DIRTY) +
-		bdi_stat(bdi, BDI_UNSTABLE) +
 		bdi_stat(bdi, BDI_WRITEBACK);
+	if (bdi_cap_account_unstable(bdi))
+		avail_dirty += bdi_stat(bdi, BDI_UNSTABLE);
 
 	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
 }
@@ -512,8 +513,9 @@ static void balance_dirty_pages(struct address_space *mapping,
 					nr_unstable_nfs;
 		nr_writeback = global_page_state(NR_WRITEBACK);
 
-		bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
-				     bdi_stat(bdi, BDI_UNSTABLE);
+		bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
+		if (bdi_cap_account_unstable(bdi))
+			bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE);
 		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 
 		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
@@ -563,11 +565,15 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * deltas.
 		 */
 		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
-			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) +
-					     bdi_stat_sum(bdi, BDI_UNSTABLE);
+			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY);
+			if (bdi_cap_account_unstable(bdi))
+				bdi_nr_reclaimable += bdi_stat_sum(bdi, BDI_UNSTABLE);
 			bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
 		} else if (bdi_nr_reclaimable) {
-			bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
-					     bdi_stat(bdi, BDI_UNSTABLE);
+			bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
+			if (bdi_cap_account_unstable(bdi))
+				bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE);
 			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 		}
* Re: [PATCH 4/6] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
2010-01-06 20:51 ` [PATCH 4/6] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices Trond Myklebust
@ 2010-01-07 1:56 ` Wu Fengguang
0 siblings, 0 replies; 96+ messages in thread
From: Wu Fengguang @ 2010-01-07 1:56 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote:
> 	avail_dirty += bdi_stat(bdi, BDI_DIRTY) +
> -		bdi_stat(bdi, BDI_UNSTABLE) +
> 		bdi_stat(bdi, BDI_WRITEBACK);
> +	if (bdi_cap_account_unstable(bdi))
> +		avail_dirty += bdi_stat(bdi, BDI_UNSTABLE);

It seems that not changing the bdi_stat()s makes more readable code,
otherwise looks OK to me.

Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
* [PATCH 3/6] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE
@ 2010-01-06 20:51 ` Trond Myklebust
6 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

From: Peter Zijlstra <peterz@infradead.org>

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
---
 fs/nfs/write.c              |    6 +++---
 include/linux/backing-dev.h |    3 ++-
 mm/backing-dev.c            |    6 ++++--
 mm/filemap.c                |    2 +-
 mm/page-writeback.c         |   16 ++++++++++------
 mm/truncate.c               |    2 +-
 6 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ee3daf4..978de7f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -440,7 +440,7 @@ nfs_mark_request_commit(struct nfs_page *req)
 			NFS_PAGE_TAG_COMMIT);
 	spin_unlock(&inode->i_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE);
 	mark_inode_unstable_pages(inode);
 }
 
@@ -451,7 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req)
 
 	if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
 		dec_zone_page_state(page, NR_UNSTABLE_NFS);
-		dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+		dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE);
 		return 1;
 	}
 	return 0;
@@ -1322,7 +1322,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 		nfs_mark_request_commit(req);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
-				BDI_RECLAIMABLE);
+				BDI_UNSTABLE);
 		nfs_clear_page_tag_locked(req);
 	}
 	return -ENOMEM;

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index fcbc26a..42c3e2a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -36,7 +36,8 @@ enum bdi_state {
 typedef int (congested_fn)(void *, int);
 
 enum bdi_stat_item {
-	BDI_RECLAIMABLE,
+	BDI_DIRTY,
+	BDI_UNSTABLE,
 	BDI_WRITEBACK,
 	NR_BDI_STAT_ITEMS
 };

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca03..88f3655 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -88,7 +88,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 #define K(x) ((x) << (PAGE_SHIFT - 10))
 	seq_printf(m,
 		   "BdiWriteback:     %8lu kB\n"
-		   "BdiReclaimable:   %8lu kB\n"
+		   "BdiDirty:         %8lu kB\n"
+		   "BdiUnstable:      %8lu kB\n"
 		   "BdiDirtyThresh:   %8lu kB\n"
 		   "DirtyThresh:      %8lu kB\n"
 		   "BackgroundThresh: %8lu kB\n"
@@ -102,7 +103,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "wb_list:          %8u\n"
 		   "wb_cnt:           %8u\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
-		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
+		   (unsigned long) K(bdi_stat(bdi, BDI_DIRTY)),
+		   (unsigned long) K(bdi_stat(bdi, BDI_UNSTABLE)),
 		   K(bdi_thresh), K(dirty_thresh), K(background_thresh),
 		   nr_wb, nr_dirty, nr_io, nr_more_io,
 		   !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,

diff --git a/mm/filemap.c b/mm/filemap.c
index 96ac6b0..458387d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -136,7 +136,7 @@ void __remove_from_page_cache(struct page *page)
 	 */
 	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
 		dec_zone_page_state(page, NR_FILE_DIRTY);
-		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+		dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
 	}
 }

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ede5356..aa26b0f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -272,7 +272,8 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
 	else
 		avail_dirty = 0;
 
-	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+	avail_dirty += bdi_stat(bdi, BDI_DIRTY) +
+		bdi_stat(bdi, BDI_UNSTABLE) +
 		bdi_stat(bdi, BDI_WRITEBACK);
 
 	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
@@ -511,7 +512,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 					nr_unstable_nfs;
 		nr_writeback = global_page_state(NR_WRITEBACK);
 
-		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+		bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
+				     bdi_stat(bdi, BDI_UNSTABLE);
 		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 
 		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
@@ -561,10 +563,12 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * deltas.
 		 */
 		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
-			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) +
+					     bdi_stat_sum(bdi, BDI_UNSTABLE);
 			bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
 		} else if (bdi_nr_reclaimable) {
-			bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+			bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
+					     bdi_stat(bdi, BDI_UNSTABLE);
 			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 		}
@@ -1086,7 +1090,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 {
 	if (mapping_cap_account_dirty(mapping)) {
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
-		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+		__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
 		task_dirty_inc(current);
 		task_io_account_write(PAGE_CACHE_SIZE);
 	}
@@ -1262,7 +1266,7 @@ int clear_page_dirty_for_io(struct page *page)
 		if (TestClearPageDirty(page)) {
 			dec_zone_page_state(page, NR_FILE_DIRTY);
 			dec_bdi_stat(mapping->backing_dev_info,
-					BDI_RECLAIMABLE);
+					BDI_DIRTY);
 			return 1;
 		}
 		return 0;

diff --git a/mm/truncate.c b/mm/truncate.c
index 342deee..b0ce8fb 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -75,7 +75,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
 	if (mapping && mapping_cap_account_dirty(mapping)) {
 		dec_zone_page_state(page, NR_FILE_DIRTY);
 		dec_bdi_stat(mapping->backing_dev_info,
-				BDI_RECLAIMABLE);
+				BDI_DIRTY);
 		if (account_size)
 			task_io_account_cancelled_write(account_size);
 	}
* Re: [PATCH 3/6] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE
@ 2010-01-07  1:48 ` Wu Fengguang
0 siblings, 0 replies; 96+ messages in thread
From: Wu Fengguang @ 2010-01-07 1:48 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

> 	seq_printf(m,
> 		   "BdiWriteback:     %8lu kB\n"
> -		   "BdiReclaimable:   %8lu kB\n"
> +		   "BdiDirty:         %8lu kB\n"
> +		   "BdiUnstable:      %8lu kB\n"
> 		   "BdiDirtyThresh:   %8lu kB\n"
> 		   "DirtyThresh:      %8lu kB\n"
> 		   "BackgroundThresh: %8lu kB\n"

This also removes one synthetic concept ;) Thanks!

Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
* Re: [PATCH 0/6] Re: [PATCH] improve the performance of large sequential write NFS workloads
@ 2010-01-06 21:44 ` Jan Kara
2010-01-06 22:03 ` Trond Myklebust
6 siblings, 1 reply; 96+ messages in thread
From: Jan Kara @ 2010-01-06 21:44 UTC (permalink / raw)
To: Trond Myklebust
Cc: Wu Fengguang, Peter Zijlstra, Jan Kara, Steve Rago,
    linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach,
    Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Wed 06-01-10 15:51:10, Trond Myklebust wrote:
> Peter Zijlstra (1):
>       VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE
>
> Trond Myklebust (5):
>       NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
>       VM: Use per-bdi unstable accounting to improve use of wbc->force_commit
>       VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
>       VM/NFS: The VM must tell the filesystem when to free reclaimable pages
>       VFS: Ensure that writeback_single_inode() commits unstable writes

I think the series would be nicer if you made Peter's patch #2 and joined
your patches
  "VM/NFS: The VM must tell the filesystem when to free reclaimable pages"
and
  "VM: Use per-bdi unstable accounting to improve use of wbc->force_commit"

								Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH 0/6] Re: [PATCH] improve the performance of large sequential write NFS workloads
2010-01-06 21:44 ` [PATCH 0/6] Re: [PATCH] improve the performance of large sequential write NFS workloads Jan Kara
@ 2010-01-06 22:03 ` Trond Myklebust
0 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 22:03 UTC (permalink / raw)
To: Jan Kara
Cc: Wu Fengguang, Peter Zijlstra, Steve Rago,
    linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach,
    Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Wed, 2010-01-06 at 22:44 +0100, Jan Kara wrote:
> On Wed 06-01-10 15:51:10, Trond Myklebust wrote:
> > Peter Zijlstra (1):
> >       VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE
> >
> > Trond Myklebust (5):
> >       NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
> >       VM: Use per-bdi unstable accounting to improve use of wbc->force_commit
> >       VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
> >       VM/NFS: The VM must tell the filesystem when to free reclaimable pages
> >       VFS: Ensure that writeback_single_inode() commits unstable writes
> I think the series would be nicer if you made Peter's patch #2 and joined
> your patches
>   "VM/NFS: The VM must tell the filesystem when to free reclaimable pages"
> and
>   "VM: Use per-bdi unstable accounting to improve use of wbc->force_commit"
>
> Honza

Indeed, and if everyone is OK with Peter's patch, I'll do that.

Cheers
Trond
* Re: [PATCH 0/6] Re: [PATCH] improve the performance of large sequential write NFS workloads
2010-01-06 20:51 ` [PATCH 0/6] " Trond Myklebust
@ 2010-01-07  8:16 ` Peter Zijlstra
1 sibling, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2010-01-07 8:16 UTC (permalink / raw)
To: Trond Myklebust
Cc: Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Wed, 2010-01-06 at 15:51 -0500, Trond Myklebust wrote:
> OK, here is the full series so far. I'm resending because I had to fix
> up a couple of BDI_UNSTABLE typos in Peter's patch...

Looks good and thanks for fixing things up!

Acked-by: Peter Zijlstra <peterz@infradead.org>

> Peter Zijlstra (1):
>       VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE
>
> Trond Myklebust (5):
>       NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
>       VM: Use per-bdi unstable accounting to improve use of wbc->force_commit
>       VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
>       VM/NFS: The VM must tell the filesystem when to free reclaimable pages
>       VFS: Ensure that writeback_single_inode() commits unstable writes
>
>  fs/fs-writeback.c           |   31 ++++++++++++++++++++++++++++++-
>  fs/nfs/client.c             |    1 +
>  fs/nfs/file.c               |    1 +
>  fs/nfs/inode.c              |   16 ----------------
>  fs/nfs/internal.h           |    3 ++-
>  fs/nfs/super.c              |    2 --
>  fs/nfs/write.c              |   39 +++++++++++++++++++++++++++++++++++----
>  include/linux/backing-dev.h |    9 ++++++++-
>  include/linux/fs.h          |    9 +++++++++
>  include/linux/writeback.h   |    5 +++++
>  mm/backing-dev.c            |    6 ++++--
>  mm/filemap.c                |    2 +-
>  mm/page-writeback.c         |   30 ++++++++++++++++++++++++------
>  mm/truncate.c               |    2 +-
>  14 files changed, 121 insertions(+), 35 deletions(-)
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-19 14:25 ` Steve Rago
2009-12-22 1:59 ` Wu Fengguang
@ 2009-12-22 12:25 ` Jan Kara
2009-12-22 16:20 ` Steve Rago
1 sibling, 2 replies; 96+ messages in thread
From: Jan Kara @ 2009-12-22 12:25 UTC (permalink / raw)
To: Steve Rago
Cc: Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
    jens.axboe, Peter Staubach

Hi,

> On Sat, 2009-12-19 at 20:20 +0800, Wu Fengguang wrote:
> > Hi Steve,
> >
> > // I should really read the NFS code, but maybe you can help us better
> > // understand the problem :)
> >
> > On Thu, Dec 17, 2009 at 04:17:57PM +0800, Peter Zijlstra wrote:
> > > On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote:
> > > > Eager Writeback for NFS Clients
> > > > -------------------------------
> > > > Prevent applications that write large sequential streams of data (like backup, for example)
> > > > from entering into a memory pressure state, which degrades performance by falling back to
> > > > synchronous operations (both synchronous writes and additional commits).
> >
> > What exactly is the "memory pressure state" condition? What's the
> > code to do the "synchronous writes and additional commits" and maybe
> > how they are triggered?
>
> Memory pressure occurs when most of the client pages have been dirtied
> by an application (think backup server writing multi-gigabyte files that
> exceed the size of main memory). The system works harder to be able to
> free dirty pages so that they can be reused. For a local file system,
> this means writing the pages to disk. For NFS, however, the writes
> leave the pages in an "unstable" state until the server responds to a
> commit request. Generally speaking, commit processing is far more
> expensive than write processing on the server; both are done with the
> inode locked, but since the commit takes so long, all writes are
> blocked, which stalls the pipeline.

  I'm not sure I understand the problem you are trying to solve. So we
generate dirty pages on an NFS filesystem. At some point we reach the
dirty_threshold, so the writing process is throttled and forced to do some
writes. For NFS it takes a longer time before we can really free the pages
because the server has to acknowledge the write (BTW e.g. for optical
media like DVD-RW it also takes a long time to really write the data). Now
the problem you are trying to solve is that the system basically gets out
of free memory (so that it has to start doing synchronous writeback from
the allocator) because the writer manages to dirty the remaining free
memory before the submitted writes are acknowledged by the server?
  If that is so, then it might make sense to introduce also a per-bdi
equivalent of dirty_background_ratio so that we can start background
writeback of dirty data at different times for different backing devices -
that would make sense to me in general, not only for NFS.
  Another complementary piece to my above proposal would be something like
Fengguang's patches that actually don't let the process dirty too much
memory, by throttling it in balance_dirty_pages until the number of
unstable pages gets lower.

> > > > This is accomplished by preventing the client application from
> > > > dirtying pages faster than they can be written to the server:
> > > > clients write pages eagerly instead of lazily.
> >
> > We already have the balance_dirty_pages() based global throttling.
> > So what makes the performance difference in your proposed "per-inode" throttling?
> > balance_dirty_pages() does have much larger threshold than yours.
>
> I originally spent several months playing with the balance_dirty_pages
> algorithm. The main drawback is that it affects more than the inodes
> that the caller is writing and that the control of what to do is too
  Can you be more specific here please?

> coarse. My final changes (which worked well for 1Gb connections) were
> more heuristic than the changes in the patch -- I basically had to come
> up with alternate ways to write pages without generating commits on
> inodes. Doing this was distasteful, as I was adjusting generic system
> behavior for an NFS-only problem. Then a colleague found Peter
> Staubach's patch, which worked just as well in less code, and isolated
> the change to the NFS component, which is where it belongs.
  As I said above, the problem of slow writes happens also in other cases.
What's specific to NFS is that pages aren't in writeback state for long
but instead stay in the unstable state. But balance_dirty_pages should
handle that, and if it does not, it should be fixed.

> > > > The eager writeback is controlled by a sysctl: fs.nfs.nfs_max_woutstanding set to 0 disables
> > > > the feature. Otherwise it contains the maximum number of outstanding NFS writes that can be
> > > > in flight for a given file. This is used to block the application from dirtying more pages
> > > > until the writes are complete.
> >
> > What if we do heuristic write-behind for sequential NFS writes?
>
> Part of the patch does implement a heuristic write-behind. See where
> nfs_wb_eager() is called.
  I believe that if we had per-bdi dirty_background_ratio and set it low
for NFS's bdi, then the write-behind logic would not be needed
(essentially the flusher thread should submit the writes to the server
early).

								Honza
--
Jan Kara <jack@suse.cz>
SuSE CR Labs
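The throttling half of Jan's proposal can be pictured roughly as follows.
This is an illustration only, not code from the thread: it assumes the
per-bdi BDI_UNSTABLE counter that appears later in the series, and uses
congestion_wait() as the simplest way to wait for writeback progress.

#include <linux/backing-dev.h>
#include <linux/blkdev.h>

/*
 * Hypothetical throttle: stall a dirtying task until the server has
 * acknowledged (committed) enough unstable pages that the bdi is back
 * under its threshold.
 */
static void example_wait_for_unstable(struct backing_dev_info *bdi,
				      unsigned long bdi_thresh)
{
	while (bdi_stat(bdi, BDI_UNSTABLE) +
	       bdi_stat(bdi, BDI_WRITEBACK) > bdi_thresh)
		congestion_wait(BLK_RW_ASYNC, HZ / 10);
}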
* Re: [PATCH] improve the performance of large sequential write NFS workloads
@ 2009-12-22 12:38 ` Peter Zijlstra
2009-12-22 12:55 ` Jan Kara
0 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2009-12-22 12:38 UTC (permalink / raw)
To: Jan Kara
Cc: Steve Rago, Wu Fengguang, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
    jens.axboe, Peter Staubach

On Tue, 2009-12-22 at 13:25 +0100, Jan Kara wrote:
> I believe that if we had per-bdi dirty_background_ratio and set it low
> for NFS's bdi,

There are two things there, I think:
 1) bdi_background
 2) different background ratios per bdi

1) could be 'trivially' done much like we do bdi_dirty in
get_dirty_limits().

2) I'm not at all convinced we want to go there.
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-22 12:38 ` Peter Zijlstra
@ 2009-12-22 12:55 ` Jan Kara
0 siblings, 0 replies; 96+ messages in thread
From: Jan Kara @ 2009-12-22 12:55 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Jan Kara, Steve Rago, Wu Fengguang, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
    jens.axboe, Peter Staubach

> On Tue, 2009-12-22 at 13:25 +0100, Jan Kara wrote:
> > I believe that if we had per-bdi dirty_background_ratio and set it low
> > for NFS's bdi,
>
> There are two things there, I think:
>  1) bdi_background
>  2) different background ratios per bdi
  Right.

> 1) could be 'trivially' done much like we do bdi_dirty in
> get_dirty_limits().
>
> 2) I'm not at all convinced we want to go there.
  Yeah. Doing 1) and playing with bdi->min_ratio and bdi->max_ratio
should be the first thing we should try...

								Honza
--
Jan Kara <jack@suse.cz>
SuSE CR Labs
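A sketch of what Peter's point 1) might look like: scale the global
background threshold by the bdi's writeout fraction, the same way
get_dirty_limits() scales the per-bdi dirty limit. bdi_writeout_fraction()
and do_div() exist in the 2.6.32-era tree (the former is static in
mm/page-writeback.c); the wrapping function itself is hypothetical, not a
posted patch.

#include <linux/backing-dev.h>
#include <asm/div64.h>

/*
 * Hypothetical per-bdi background threshold: take the global
 * background_thresh and scale it by this bdi's share of recent
 * writeout activity, as get_dirty_limits() does for the dirty limit.
 */
static unsigned long example_bdi_background(struct backing_dev_info *bdi,
					    unsigned long background_thresh)
{
	u64 bdi_bg = background_thresh;
	long numerator, denominator;

	bdi_writeout_fraction(bdi, &numerator, &denominator);
	bdi_bg *= numerator;
	do_div(bdi_bg, denominator);

	return (unsigned long)bdi_bg;
}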
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-22 12:25 ` Jan Kara
@ 2009-12-22 16:20 ` Steve Rago
2009-12-23 18:39 ` Jan Kara
1 sibling, 1 reply; 96+ messages in thread
From: Steve Rago @ 2009-12-22 16:20 UTC (permalink / raw)
To: Jan Kara
Cc: Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
    jens.axboe, Peter Staubach

On Tue, 2009-12-22 at 13:25 +0100, Jan Kara wrote:
> > I originally spent several months playing with the balance_dirty_pages
> > algorithm. The main drawback is that it affects more than the inodes
> > that the caller is writing and that the control of what to do is too
> Can you be more specific here please?

Sure; balance_dirty_pages() will schedule writeback by the flusher
thread once the number of dirty pages exceeds dirty_background_ratio.
The flusher thread calls writeback_inodes_wb() to flush all dirty inodes
associated with the bdi. Similarly, the process dirtying the pages will
call writeback_inodes_wbc() when its bdi threshold has been exceeded.
The first problem is that these functions process all dirty inodes with
the same backing device, which can lead to excess (duplicate) flushing
of the same inode. Second, there is no distinction between pages that
need to be committed and pages that have commits pending in
NR_UNSTABLE_NFS/BDI_RECLAIMABLE (a page that has a commit pending won't
be cleaned any faster by sending more commits). This tends to overstate
the amount of memory that can be cleaned, leading to additional commit
requests. Third, these functions generate a commit for each set of
writes they do, which might not be appropriate. For background writing,
you'd like to delay the commit as long as possible.

[snip]

> > Part of the patch does implement a heuristic write-behind. See where
> > nfs_wb_eager() is called.
> I believe that if we had per-bdi dirty_background_ratio and set it low
> for NFS's bdi, then the write-behind logic would not be needed
> (essentially the flusher thread should submit the writes to the server
> early).
>
> Honza

Maybe so, but you still need something to prevent the process that is
dirtying pages from continuing, because a process can always write to
memory faster than writing to disk/network, so the flusher won't be able
to keep up.

Steve
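Steve's second problem (commits needed vs. commits already pending) could
be tracked with a per-inode flag, along these lines. Purely illustrative:
the NFS_INO_COMMIT_PENDING bit and the helper are invented for this
sketch and appear in no posted patch.

#include <linux/nfs_fs.h>

/*
 * Hypothetical guard against redundant COMMITs: a page whose commit
 * is already in flight will not be cleaned any faster by sending
 * another one. The bit would be cleared in the commit completion path.
 */
#define NFS_INO_COMMIT_PENDING	12	/* invented bit number */

static int example_start_commit(struct inode *inode)
{
	struct nfs_inode *nfsi = NFS_I(inode);

	if (test_and_set_bit(NFS_INO_COMMIT_PENDING, &nfsi->flags))
		return 0;	/* a COMMIT is already outstanding */
	return nfs_commit_inode(inode, 0);
}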
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-22 16:20 ` Steve Rago
@ 2009-12-23 18:39 ` Jan Kara
0 siblings, 1 reply; 96+ messages in thread
From: Jan Kara @ 2009-12-23 18:39 UTC (permalink / raw)
To: Steve Rago
Cc: Jan Kara, Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
    jens.axboe, Peter Staubach

On Tue 22-12-09 11:20:15, Steve Rago wrote:
> On Tue, 2009-12-22 at 13:25 +0100, Jan Kara wrote:
> > > I originally spent several months playing with the balance_dirty_pages
> > > algorithm. The main drawback is that it affects more than the inodes
> > > that the caller is writing and that the control of what to do is too
> > Can you be more specific here please?
>
> Sure; balance_dirty_pages() will schedule writeback by the flusher
> thread once the number of dirty pages exceeds dirty_background_ratio.
> The flusher thread calls writeback_inodes_wb() to flush all dirty inodes
> associated with the bdi. Similarly, the process dirtying the pages will
> call writeback_inodes_wbc() when its bdi threshold has been exceeded.
> The first problem is that these functions process all dirty inodes with
> the same backing device, which can lead to excess (duplicate) flushing
> of the same inode. Second, there is no distinction between pages that
> need to be committed and pages that have commits pending in
> NR_UNSTABLE_NFS/BDI_RECLAIMABLE (a page that has a commit pending won't
> be cleaned any faster by sending more commits). This tends to overstate
> the amount of memory that can be cleaned, leading to additional commit
> requests. Third, these functions generate a commit for each set of
> writes they do, which might not be appropriate. For background writing,
> you'd like to delay the commit as long as possible.
  Ok, I get it. Thanks for the explanation. The problem with more writing
threads bites us also for ordinary SATA drives (the IO pattern and thus
throughput gets worse and worse the more threads do writes). The plan is
to let only the flusher thread do the IO, while a throttled thread in
balance_dirty_pages just waits for the flusher thread to do the work.
There were even patches for this floating around but I'm not sure what's
happened to them. So that part of the problem should be easy to solve.
  Another part is about sending commits - if we have just one thread doing
flushing, we have no problems with excessive commits for one inode. You're
right that we may want to avoid sending commits for background writeback
but until we send the commit, pages are just accumulating in the unstable
state, aren't they? So we might want to periodically send the commit for
the inode anyway to get rid of those pages. So from this point of view,
sending a commit after each writepages call does not seem like such a bad
idea - although it might be more appropriate to send it some time after
the writepages call, when we are not close to the dirty limit, so that the
server has more time to do more natural "unforced" writeback...

> > > Part of the patch does implement a heuristic write-behind. See where
> > > nfs_wb_eager() is called.
> > I believe that if we had per-bdi dirty_background_ratio and set it low
> > for NFS's bdi, then the write-behind logic would not be needed
> > (essentially the flusher thread should submit the writes to the server
> > early).
>
> Maybe so, but you still need something to prevent the process that is
> dirtying pages from continuing, because a process can always write to
> memory faster than writing to disk/network, so the flusher won't be able
> to keep up.
  Yes, I agree that part is needed. But Fengguang already had patches in
that direction if my memory serves me well.
  So to recap: If we block tasks in balance_dirty_pages until unstable
pages are committed and make just one thread do the writing, what else is
missing to make you happy? :)

								Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
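Jan's "send the commit some time after the writepages call" idea can be
sketched as a simple per-inode rate limit. Illustrative only: the
example_last_commit field (which would live in struct nfs_inode) and the
interval are hypothetical.

#include <linux/jiffies.h>
#include <linux/nfs_fs.h>

#define EXAMPLE_COMMIT_INTERVAL	(5 * HZ)	/* arbitrary */

static unsigned long example_last_commit;	/* per-inode in practice */

/*
 * Rate-limit COMMITs so unstable pages still get flushed periodically
 * without forcing the server on every writepages call.
 */
static void example_periodic_commit(struct inode *inode)
{
	if (time_after(jiffies,
		       example_last_commit + EXAMPLE_COMMIT_INTERVAL)) {
		example_last_commit = jiffies;
		nfs_commit_inode(inode, 0);	/* asynchronous COMMIT */
	}
}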
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-23 18:39 ` Jan Kara
@ 2009-12-23 20:16 ` Steve Rago
2009-12-23 21:49 ` Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Steve Rago @ 2009-12-23 20:16 UTC (permalink / raw)
To: Jan Kara
Cc: Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
    jens.axboe, Peter Staubach

On Wed, 2009-12-23 at 19:39 +0100, Jan Kara wrote:
> On Tue 22-12-09 11:20:15, Steve Rago wrote:
> > Sure; balance_dirty_pages() will schedule writeback by the flusher
> > thread once the number of dirty pages exceeds dirty_background_ratio.
> > The flusher thread calls writeback_inodes_wb() to flush all dirty inodes
> > associated with the bdi. Similarly, the process dirtying the pages will
> > call writeback_inodes_wbc() when its bdi threshold has been exceeded.
> > The first problem is that these functions process all dirty inodes with
> > the same backing device, which can lead to excess (duplicate) flushing
> > of the same inode. Second, there is no distinction between pages that
> > need to be committed and pages that have commits pending in
> > NR_UNSTABLE_NFS/BDI_RECLAIMABLE (a page that has a commit pending won't
> > be cleaned any faster by sending more commits). This tends to overstate
> > the amount of memory that can be cleaned, leading to additional commit
> > requests. Third, these functions generate a commit for each set of
> > writes they do, which might not be appropriate. For background writing,
> > you'd like to delay the commit as long as possible.
> Ok, I get it. Thanks for the explanation. The problem with more writing
> threads bites us also for ordinary SATA drives (the IO pattern and thus
> throughput gets worse and worse the more threads do writes). The plan is
> to let only the flusher thread do the IO, while a throttled thread in
> balance_dirty_pages just waits for the flusher thread to do the work.
> There were even patches for this floating around but I'm not sure what's
> happened to them. So that part of the problem should be easy to solve.
> Another part is about sending commits - if we have just one thread doing
> flushing, we have no problems with excessive commits for one inode. You're
> right that we may want to avoid sending commits for background writeback
> but until we send the commit, pages are just accumulating in the unstable
> state, aren't they? So we might want to periodically send the commit for
> the inode anyway to get rid of those pages. So from this point of view,
> sending a commit after each writepages call does not seem like such a bad
> idea - although it might be more appropriate to send it some time after
> the writepages call, when we are not close to the dirty limit, so that the
> server has more time to do more natural "unforced" writeback...

When to send the commit is a complex question to answer. If you delay
it long enough, the server's flusher threads will have already done most
of the work for you, so commits can be cheap, but you don't have access
to the necessary information to figure this out. You can't delay it too
long, though, because the unstable pages on the client will grow too
large, creating memory pressure. I have a second patch, which I haven't
posted yet, that adds feedback piggy-backed on the NFS write response,
which allows the NFS client to free pages proactively. This greatly
reduces the need to send commit messages, but it extends the protocol
(in a backward-compatible manner), so it could be hard to convince
people to accept.

> > > > Part of the patch does implement a heuristic write-behind. See where
> > > > nfs_wb_eager() is called.
> > > I believe that if we had per-bdi dirty_background_ratio and set it low
> > > for NFS's bdi, then the write-behind logic would not be needed
> > > (essentially the flusher thread should submit the writes to the server
> > > early).
> >
> > Maybe so, but you still need something to prevent the process that is
> > dirtying pages from continuing, because a process can always write to
> > memory faster than writing to disk/network, so the flusher won't be able
> > to keep up.
> Yes, I agree that part is needed. But Fengguang already had patches in
> that direction if my memory serves me well.
> So to recap: If we block tasks in balance_dirty_pages until unstable
> pages are committed and make just one thread do the writing, what else is
> missing to make you happy? :)
> Honza

As long as the performance improves substantially, I'll be happy. Part
of the problem that isn't addressed by your summary is the synchronous
writes. With the eager writeback patch, these are removed [see the
short-circuit in wb_priority()]. I would have expected that change to be
controversial, but I've not heard any complaints (yet). If the process
or the bdi flusher is writing and committing regularly, then pages
should be recycled quickly and the change shouldn't matter, but I'd need
to run my systemtap scripts to make sure.

Steve
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-23 20:16 ` Steve Rago
@ 2009-12-23 21:49 ` Trond Myklebust
2009-12-23 23:13 ` Steve Rago
0 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2009-12-23 21:49 UTC (permalink / raw)
To: Steve Rago
Cc: Jan Kara, Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach

On Wed, 2009-12-23 at 15:16 -0500, Steve Rago wrote:
> When to send the commit is a complex question to answer. If you delay
> it long enough, the server's flusher threads will have already done most
> of the work for you, so commits can be cheap, but you don't have access
> to the necessary information to figure this out. You can't delay it too
> long, though, because the unstable pages on the client will grow too
> large, creating memory pressure. I have a second patch, which I haven't
> posted yet, that adds feedback piggy-backed on the NFS write response,
> which allows the NFS client to free pages proactively. This greatly
> reduces the need to send commit messages, but it extends the protocol
> (in a backward-compatible manner), so it could be hard to convince
> people to accept.

There are only 2 cases when the client should send a COMMIT:
	1. When it hits a synchronisation point (i.e. when the user calls
	   f/sync(), or close(), or when the user sets/clears a file
	   lock).
	2. When memory pressure causes the VM to want to free up those
	   pages that are marked as clean but unstable.

We should never be sending COMMIT in any other situation, since that
would imply that the client somehow has better information on how to
manage dirty pages on the server than the server's own VM.

Cheers
Trond
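Trond's two-case rule can be written down directly; a sketch, with the
memory-pressure signal left abstract since how the VM communicates it is
exactly what the later patch series in this thread works out:

#include <linux/writeback.h>

/*
 * Sketch of the two-case rule. WB_SYNC_ALL covers the explicit
 * synchronisation points (f/sync(), close(), file locks); the
 * vm_wants_reclaim argument stands in for whatever mechanism the VM
 * uses to ask for clean-but-unstable pages back.
 */
static int example_should_send_commit(struct writeback_control *wbc,
				      int vm_wants_reclaim)
{
	if (wbc->sync_mode == WB_SYNC_ALL)	/* case 1: sync point */
		return 1;
	if (vm_wants_reclaim)			/* case 2: memory pressure */
		return 1;
	return 0;	/* otherwise leave it to the server's own VM */
}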
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-23 21:49 ` Trond Myklebust
@ 2009-12-23 23:13 ` Steve Rago
2009-12-23 23:44 ` Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Steve Rago @ 2009-12-23 23:13 UTC (permalink / raw)
To: Trond Myklebust
Cc: Jan Kara, Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach

On Wed, 2009-12-23 at 22:49 +0100, Trond Myklebust wrote:
> > When to send the commit is a complex question to answer. If you delay
> > it long enough, the server's flusher threads will have already done most
> > of the work for you, so commits can be cheap, but you don't have access
> > to the necessary information to figure this out. You can't delay it too
> > long, though, because the unstable pages on the client will grow too
> > large, creating memory pressure. I have a second patch, which I haven't
> > posted yet, that adds feedback piggy-backed on the NFS write response,
> > which allows the NFS client to free pages proactively. This greatly
> > reduces the need to send commit messages, but it extends the protocol
> > (in a backward-compatible manner), so it could be hard to convince
> > people to accept.
>
> There are only 2 cases when the client should send a COMMIT:
> 	1. When it hits a synchronisation point (i.e. when the user calls
> 	   f/sync(), or close(), or when the user sets/clears a file
> 	   lock).
> 	2. When memory pressure causes the VM to want to free up those
> 	   pages that are marked as clean but unstable.
>
> We should never be sending COMMIT in any other situation, since that
> would imply that the client somehow has better information on how to
> manage dirty pages on the server than the server's own VM.
>
> Cheers
> Trond

#2 is the difficult one. If you wait for memory pressure, you could
have waited too long, because depending on the latency of the commit,
you could run into low-memory situations. Then mayhem ensues, the
oom-killer gets cranky (if you haven't disabled it), and stuff starts
failing and/or hanging. So you need to be careful about setting the
threshold for generating a commit so that the client doesn't run out of
memory before the server can respond.

Steve
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-23 23:13 ` Steve Rago
@ 2009-12-23 23:44 ` Trond Myklebust
2009-12-24 4:30 ` Steve Rago
0 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2009-12-23 23:44 UTC (permalink / raw)
To: Steve Rago
Cc: Jan Kara, Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach

On Wed, 2009-12-23 at 18:13 -0500, Steve Rago wrote:
> On Wed, 2009-12-23 at 22:49 +0100, Trond Myklebust wrote:
> > There are only 2 cases when the client should send a COMMIT:
> > 	1. When it hits a synchronisation point (i.e. when the user calls
> > 	   f/sync(), or close(), or when the user sets/clears a file
> > 	   lock).
> > 	2. When memory pressure causes the VM to want to free up those
> > 	   pages that are marked as clean but unstable.
> >
> > We should never be sending COMMIT in any other situation, since that
> > would imply that the client somehow has better information on how to
> > manage dirty pages on the server than the server's own VM.
> >
> > Cheers
> > Trond
>
> #2 is the difficult one. If you wait for memory pressure, you could
> have waited too long, because depending on the latency of the commit,
> you could run into low-memory situations. Then mayhem ensues, the
> oom-killer gets cranky (if you haven't disabled it), and stuff starts
> failing and/or hanging. So you need to be careful about setting the
> threshold for generating a commit so that the client doesn't run out of
> memory before the server can respond.

Right, but this is why we have limits on the total number of dirty pages
that can be kept in memory. The NFS unstable writes don't significantly
change that model, they just add an extra step: once all the dirty data
has been transmitted to the server, your COMMIT defines a
synchronisation point after which you know that the data you just sent
is all on disk. Given a reasonable NFS server implementation, it will
already have started the write out of that data, and so hopefully the
COMMIT operation itself will run reasonably quickly.

Any userland application with basic data integrity requirements will
have the same expectations. It will write out the data and then fsync()
at regular intervals. I've never heard of any expectations from
filesystem and VM designers that applications should be required to
fine-tune the length of those intervals in order to achieve decent
performance.

Cheers
Trond
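The userland pattern Trond describes looks like this in practice: stream
data and fsync() at regular intervals, so each COMMIT on the wire covers a
bounded amount of unstable data. The chunk size and interval below are
arbitrary illustrations.

#include <fcntl.h>
#include <unistd.h>

#define CHUNK		(1 << 20)	/* 1 MiB per write() */
#define SYNC_EVERY	256		/* fsync() every 256 MiB */

/* Write nchunks * CHUNK bytes, with a sync point every SYNC_EVERY chunks. */
static int stream_out(int fd, const char *buf, long nchunks)
{
	long i;

	for (i = 0; i < nchunks; i++) {
		if (write(fd, buf, CHUNK) != CHUNK)
			return -1;
		if ((i + 1) % SYNC_EVERY == 0 && fsync(fd) != 0)
			return -1;	/* synchronisation point */
	}
	return fsync(fd);		/* final commit before close() */
}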
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-23 23:44 ` Trond Myklebust
@ 2009-12-24  4:30 ` Steve Rago
0 siblings, 0 replies; 96+ messages in thread
From: Steve Rago @ 2009-12-24 4:30 UTC (permalink / raw)
To: Trond Myklebust
Cc: Jan Kara, Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach

On Thu, 2009-12-24 at 00:44 +0100, Trond Myklebust wrote:
> > #2 is the difficult one. If you wait for memory pressure, you could
> > have waited too long, because depending on the latency of the commit,
> > you could run into low-memory situations. Then mayhem ensues, the
> > oom-killer gets cranky (if you haven't disabled it), and stuff starts
> > failing and/or hanging. So you need to be careful about setting the
> > threshold for generating a commit so that the client doesn't run out of
> > memory before the server can respond.
>
> Right, but this is why we have limits on the total number of dirty pages
> that can be kept in memory. The NFS unstable writes don't significantly
> change that model, they just add an extra step: once all the dirty data
> has been transmitted to the server, your COMMIT defines a
> synchronisation point after which you know that the data you just sent
> is all on disk. Given a reasonable NFS server implementation, it will
> already have started the write out of that data, and so hopefully the
> COMMIT operation itself will run reasonably quickly.

Right. The trick is to do this with the best performance possible.

> Any userland application with basic data integrity requirements will
> have the same expectations. It will write out the data and then fsync()
> at regular intervals. I've never heard of any expectations from
> filesystem and VM designers that applications should be required to
> fine-tune the length of those intervals in order to achieve decent
> performance.

Agreed, except that the more you call fsync(), the more you are stalling
the writing, so application designers must use fsync() judiciously.
Otherwise they'd just use synchronous writes. (Apologies if I sound like
Captain Obvious.)

Thanks,
Steve