From: Jan Kara <jack@suse.cz>
To: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Wu Fengguang <fengguang.wu@intel.com>,
linux-fsdevel@vger.kernel.org, Jan Kara <jack@suse.cz>
Subject: Re: [PATCH] writeback: Do not sync data dirtied after sync start
Date: Thu, 26 Sep 2013 21:23:53 +0200 [thread overview]
Message-ID: <20130926192353.GC21811@quack.suse.cz> (raw)
In-Reply-To: <1380222306-25951-1-git-send-email-jack@suse.cz>
On Thu 26-09-13 21:05:06, Jan Kara wrote:
> When there are processes heavily creating small files while sync(2) is
> running, it can easily happen that quite some new files are created
> between WB_SYNC_NONE and WB_SYNC_ALL pass of sync(2). That can happen
> especially if there are several busy filesystems (remember that sync
> traverses filesystems sequentially and waits in WB_SYNC_ALL phase on one
> fs before starting it on another fs). Because WB_SYNC_ALL pass is slow
> (e.g. causes a transaction commit and cache flush for each inode in
> ext3), resulting sync(2) times are rather large.
>
> The following script reproduces the problem:
> ---
I've realized this '---' is going to screw up git-am. So I'll resend with
a different separator. Sorry for the noise.
Honza
> function run_writers
> {
> for (( i = 0; i < 10; i++ )); do
> mkdir $1/dir$i
> for (( j = 0; j < 40000; j++ )); do
> dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null
> done &
> done
> }
>
> for dir in "$@"; do
> run_writers $dir
> done
>
> sleep 40
> time sync
> ----
>
> Fix the problem by disregarding inodes dirtied after sync(2) was called
> in the WB_SYNC_ALL pass. To allow for this, sync_inodes_sb() now takes a
> time stamp when sync has started which is used for setting up work for
> flusher threads.
>
> To give some numbers, when above script is run on two ext4 filesystems on
> simple SATA drive, the average sync time from 10 runs is 267.549 seconds
> with standard deviation 104.799426. With the patched kernel, the average
> sync time from 10 runs is 2.995 seconds with standard deviation 0.096.
>
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
> fs/fs-writeback.c | 17 ++++++++---------
> fs/sync.c | 15 +++++++++------
> fs/xfs/xfs_super.c | 2 +-
> include/linux/writeback.h | 2 +-
> 4 files changed, 19 insertions(+), 17 deletions(-)
>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 9f4935b..70837da 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -39,7 +39,7 @@
> struct wb_writeback_work {
> long nr_pages;
> struct super_block *sb;
> - unsigned long *older_than_this;
> + unsigned long older_than_this;
> enum writeback_sync_modes sync_mode;
> unsigned int tagged_writepages:1;
> unsigned int for_kupdate:1;
> @@ -248,8 +248,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
>
> while (!list_empty(delaying_queue)) {
> inode = wb_inode(delaying_queue->prev);
> - if (work->older_than_this &&
> - inode_dirtied_after(inode, *work->older_than_this))
> + if (inode_dirtied_after(inode, work->older_than_this))
> break;
> list_move(&inode->i_wb_list, &tmp);
> moved++;
> @@ -791,12 +790,11 @@ static long wb_writeback(struct bdi_writeback *wb,
> {
> unsigned long wb_start = jiffies;
> long nr_pages = work->nr_pages;
> - unsigned long oldest_jif;
> struct inode *inode;
> long progress;
>
> - oldest_jif = jiffies;
> - work->older_than_this = &oldest_jif;
> + if (!work->older_than_this)
> + work->older_than_this = jiffies;
>
> spin_lock(&wb->list_lock);
> for (;;) {
> @@ -830,10 +828,10 @@ static long wb_writeback(struct bdi_writeback *wb,
> * safe.
> */
> if (work->for_kupdate) {
> - oldest_jif = jiffies -
> + work->older_than_this = jiffies -
> msecs_to_jiffies(dirty_expire_interval * 10);
> } else if (work->for_background)
> - oldest_jif = jiffies;
> + work->older_than_this = jiffies;
>
> trace_writeback_start(wb->bdi, work);
> if (list_empty(&wb->b_io))
> @@ -1350,13 +1348,14 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
> * This function writes and waits on any dirty inode belonging to this
> * super_block.
> */
> -void sync_inodes_sb(struct super_block *sb)
> +void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this)
> {
> DECLARE_COMPLETION_ONSTACK(done);
> struct wb_writeback_work work = {
> .sb = sb,
> .sync_mode = WB_SYNC_ALL,
> .nr_pages = LONG_MAX,
> + .older_than_this = older_than_this,
> .range_cyclic = 0,
> .done = &done,
> .reason = WB_REASON_SYNC,
> diff --git a/fs/sync.c b/fs/sync.c
> index 905f3f6..ff96f99 100644
> --- a/fs/sync.c
> +++ b/fs/sync.c
> @@ -27,10 +27,11 @@
> * wait == 1 case since in that case write_inode() functions do
> * sync_dirty_buffer() and thus effectively write one block at a time.
> */
> -static int __sync_filesystem(struct super_block *sb, int wait)
> +static int __sync_filesystem(struct super_block *sb, int wait,
> + unsigned long start)
> {
> if (wait)
> - sync_inodes_sb(sb);
> + sync_inodes_sb(sb, start);
> else
> writeback_inodes_sb(sb, WB_REASON_SYNC);
>
> @@ -47,6 +48,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
> int sync_filesystem(struct super_block *sb)
> {
> int ret;
> + unsigned long start = jiffies;
>
> /*
> * We need to be protected against the filesystem going from
> @@ -60,17 +62,17 @@ int sync_filesystem(struct super_block *sb)
> if (sb->s_flags & MS_RDONLY)
> return 0;
>
> - ret = __sync_filesystem(sb, 0);
> + ret = __sync_filesystem(sb, 0, start);
> if (ret < 0)
> return ret;
> - return __sync_filesystem(sb, 1);
> + return __sync_filesystem(sb, 1, start);
> }
> EXPORT_SYMBOL_GPL(sync_filesystem);
>
> static void sync_inodes_one_sb(struct super_block *sb, void *arg)
> {
> if (!(sb->s_flags & MS_RDONLY))
> - sync_inodes_sb(sb);
> + sync_inodes_sb(sb, *((unsigned long *)arg));
> }
>
> static void sync_fs_one_sb(struct super_block *sb, void *arg)
> @@ -102,9 +104,10 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
> SYSCALL_DEFINE0(sync)
> {
> int nowait = 0, wait = 1;
> + unsigned long start = jiffies;
>
> wakeup_flusher_threads(0, WB_REASON_SYNC);
> - iterate_supers(sync_inodes_one_sb, NULL);
> + iterate_supers(sync_inodes_one_sb, &start);
> iterate_supers(sync_fs_one_sb, &nowait);
> iterate_supers(sync_fs_one_sb, &wait);
> iterate_bdevs(fdatawrite_one_bdev, NULL);
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 15188cc..8968f50 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -918,7 +918,7 @@ xfs_flush_inodes(
> struct super_block *sb = mp->m_super;
>
> if (down_read_trylock(&sb->s_umount)) {
> - sync_inodes_sb(sb);
> + sync_inodes_sb(sb, jiffies);
> up_read(&sb->s_umount);
> }
> }
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index 021b8a3..fc0e432 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -97,7 +97,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
> int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
> int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
> enum wb_reason reason);
> -void sync_inodes_sb(struct super_block *);
> +void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this);
> void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
> void inode_wait_for_writeback(struct inode *inode);
>
> --
> 1.8.1.4
>
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
prev parent reply other threads:[~2013-09-26 19:23 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-09-26 19:05 [PATCH] writeback: Do not sync data dirtied after sync start Jan Kara
2013-09-26 19:23 ` Jan Kara [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20130926192353.GC21811@quack.suse.cz \
--to=jack@suse.cz \
--cc=fengguang.wu@intel.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=viro@ZenIV.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).