[PATCH v3] writeback: Do not sync data dirtied after sync start

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v3] writeback: Do not sync data dirtied after sync start
@ 2013-09-27  9:44 Jan Kara
  2013-09-29 23:12 ` Dave Chinner
  2013-10-08 22:14 ` Andrew Morton
  0 siblings, 2 replies; 7+ messages in thread
From: Jan Kara @ 2013-09-27  9:44 UTC (permalink / raw)
  To: Al Viro; +Cc: Wu Fengguang, linux-fsdevel, dchinner, Jan Kara

When there are processes heavily creating small files while sync(2) is
running, it can easily happen that quite some new files are created
between WB_SYNC_NONE and WB_SYNC_ALL pass of sync(2). That can happen
especially if there are several busy filesystems (remember that sync
traverses filesystems sequentially and waits in WB_SYNC_ALL phase on one
fs before starting it on another fs). Because WB_SYNC_ALL pass is slow
(e.g. causes a transaction commit and cache flush for each inode in
ext3), resulting sync(2) times are rather large.

The following script reproduces the problem:

function run_writers
{
  for (( i = 0; i < 10; i++ )); do
    mkdir $1/dir$i
    for (( j = 0; j < 40000; j++ )); do
      dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null
    done &
  done
}

for dir in "$@"; do
  run_writers $dir
done

sleep 40
time sync
======

Fix the problem by disregarding inodes dirtied after sync(2) was called
in the WB_SYNC_ALL pass. To allow for this, sync_inodes_sb() now takes a
time stamp when sync has started which is used for setting up work for
flusher threads.

To give some numbers, when above script is run on two ext4 filesystems on
simple SATA drive, the average sync time from 10 runs is 267.549 seconds
with standard deviation 104.799426. With the patched kernel, the average
sync time from 10 runs is 2.995 seconds with standard deviation 0.096.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/fs-writeback.c                | 17 ++++++++---------
 fs/sync.c                        | 15 +++++++++------
 fs/xfs/xfs_super.c               |  2 +-
 include/linux/writeback.h        |  2 +-
 include/trace/events/writeback.h |  6 +++---
 5 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9f4935b..70837da 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -39,7 +39,7 @@
 struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
-	unsigned long *older_than_this;
+	unsigned long older_than_this;
 	enum writeback_sync_modes sync_mode;
 	unsigned int tagged_writepages:1;
 	unsigned int for_kupdate:1;
@@ -248,8 +248,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 
 	while (!list_empty(delaying_queue)) {
 		inode = wb_inode(delaying_queue->prev);
-		if (work->older_than_this &&
-		    inode_dirtied_after(inode, *work->older_than_this))
+		if (inode_dirtied_after(inode, work->older_than_this))
 			break;
 		list_move(&inode->i_wb_list, &tmp);
 		moved++;
@@ -791,12 +790,11 @@ static long wb_writeback(struct bdi_writeback *wb,
 {
 	unsigned long wb_start = jiffies;
 	long nr_pages = work->nr_pages;
-	unsigned long oldest_jif;
 	struct inode *inode;
 	long progress;
 
-	oldest_jif = jiffies;
-	work->older_than_this = &oldest_jif;
+	if (!work->older_than_this)
+		work->older_than_this = jiffies;
 
 	spin_lock(&wb->list_lock);
 	for (;;) {
@@ -830,10 +828,10 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * safe.
 		 */
 		if (work->for_kupdate) {
-			oldest_jif = jiffies -
+			work->older_than_this = jiffies -
 				msecs_to_jiffies(dirty_expire_interval * 10);
 		} else if (work->for_background)
-			oldest_jif = jiffies;
+			work->older_than_this = jiffies;
 
 		trace_writeback_start(wb->bdi, work);
 		if (list_empty(&wb->b_io))
@@ -1350,13 +1348,14 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  * This function writes and waits on any dirty inode belonging to this
  * super_block.
  */
-void sync_inodes_sb(struct super_block *sb)
+void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
+		.older_than_this = older_than_this,
 		.range_cyclic	= 0,
 		.done		= &done,
 		.reason		= WB_REASON_SYNC,
diff --git a/fs/sync.c b/fs/sync.c
index 905f3f6..ff96f99 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,10 +27,11 @@
  * wait == 1 case since in that case write_inode() functions do
  * sync_dirty_buffer() and thus effectively write one block at a time.
  */
-static int __sync_filesystem(struct super_block *sb, int wait)
+static int __sync_filesystem(struct super_block *sb, int wait,
+			     unsigned long start)
 {
 	if (wait)
-		sync_inodes_sb(sb);
+		sync_inodes_sb(sb, start);
 	else
 		writeback_inodes_sb(sb, WB_REASON_SYNC);
 
@@ -47,6 +48,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 int sync_filesystem(struct super_block *sb)
 {
 	int ret;
+	unsigned long start = jiffies;
 
 	/*
 	 * We need to be protected against the filesystem going from
@@ -60,17 +62,17 @@ int sync_filesystem(struct super_block *sb)
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
-	ret = __sync_filesystem(sb, 0);
+	ret = __sync_filesystem(sb, 0, start);
 	if (ret < 0)
 		return ret;
-	return __sync_filesystem(sb, 1);
+	return __sync_filesystem(sb, 1, start);
 }
 EXPORT_SYMBOL_GPL(sync_filesystem);
 
 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 {
 	if (!(sb->s_flags & MS_RDONLY))
-		sync_inodes_sb(sb);
+		sync_inodes_sb(sb, *((unsigned long *)arg));
 }
 
 static void sync_fs_one_sb(struct super_block *sb, void *arg)
@@ -102,9 +104,10 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 SYSCALL_DEFINE0(sync)
 {
 	int nowait = 0, wait = 1;
+	unsigned long start = jiffies;
 
 	wakeup_flusher_threads(0, WB_REASON_SYNC);
-	iterate_supers(sync_inodes_one_sb, NULL);
+	iterate_supers(sync_inodes_one_sb, &start);
 	iterate_supers(sync_fs_one_sb, &nowait);
 	iterate_supers(sync_fs_one_sb, &wait);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 15188cc..8968f50 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -918,7 +918,7 @@ xfs_flush_inodes(
 	struct super_block	*sb = mp->m_super;
 
 	if (down_read_trylock(&sb->s_umount)) {
-		sync_inodes_sb(sb);
+		sync_inodes_sb(sb, jiffies);
 		up_read(&sb->s_umount);
 	}
 }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 021b8a3..fc0e432 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -97,7 +97,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
 int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 				  enum wb_reason reason);
-void sync_inodes_sb(struct super_block *);
+void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this);
 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
 
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 464ea82..c7bbbe7 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -287,11 +287,11 @@ TRACE_EVENT(writeback_queue_io,
 		__field(int,		reason)
 	),
 	TP_fast_assign(
-		unsigned long *older_than_this = work->older_than_this;
+		unsigned long older_than_this = work->older_than_this;
 		strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
-		__entry->older	= older_than_this ?  *older_than_this : 0;
+		__entry->older	= older_than_this;
 		__entry->age	= older_than_this ?
-				  (jiffies - *older_than_this) * 1000 / HZ : -1;
+				  (jiffies - older_than_this) * 1000 / HZ : -1;
 		__entry->moved	= moved;
 		__entry->reason	= work->reason;
 	),
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] writeback: Do not sync data dirtied after sync start
  2013-09-27  9:44 [PATCH v3] writeback: Do not sync data dirtied after sync start Jan Kara
@ 2013-09-29 23:12 ` Dave Chinner
  2013-10-08 22:14 ` Andrew Morton
  1 sibling, 0 replies; 7+ messages in thread
From: Dave Chinner @ 2013-09-29 23:12 UTC (permalink / raw)
  To: Jan Kara; +Cc: Al Viro, Wu Fengguang, linux-fsdevel, dchinner

On Fri, Sep 27, 2013 at 11:44:40AM +0200, Jan Kara wrote:
> When there are processes heavily creating small files while sync(2) is
> running, it can easily happen that quite some new files are created
> between WB_SYNC_NONE and WB_SYNC_ALL pass of sync(2). That can happen
> especially if there are several busy filesystems (remember that sync
> traverses filesystems sequentially and waits in WB_SYNC_ALL phase on one
> fs before starting it on another fs). Because WB_SYNC_ALL pass is slow
> (e.g. causes a transaction commit and cache flush for each inode in
> ext3), resulting sync(2) times are rather large.
> 
> The following script reproduces the problem:
> 
> function run_writers
> {
>   for (( i = 0; i < 10; i++ )); do
>     mkdir $1/dir$i
>     for (( j = 0; j < 40000; j++ )); do
>       dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null
>     done &
>   done
> }
> 
> for dir in "$@"; do
>   run_writers $dir
> done
> 
> sleep 40
> time sync
> ======
> 
> Fix the problem by disregarding inodes dirtied after sync(2) was called
> in the WB_SYNC_ALL pass. To allow for this, sync_inodes_sb() now takes a
> time stamp when sync has started which is used for setting up work for
> flusher threads.
> 
> To give some numbers, when above script is run on two ext4 filesystems on
> simple SATA drive, the average sync time from 10 runs is 267.549 seconds
> with standard deviation 104.799426. With the patched kernel, the average
> sync time from 10 runs is 2.995 seconds with standard deviation 0.096.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>

This version looks good.

Reviewed-by: Dave Chinner <dchinner@redhat.com>
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] writeback: Do not sync data dirtied after sync start
  2013-09-27  9:44 [PATCH v3] writeback: Do not sync data dirtied after sync start Jan Kara
  2013-09-29 23:12 ` Dave Chinner
@ 2013-10-08 22:14 ` Andrew Morton
  2013-10-09 14:02   ` Jan Kara
  1 sibling, 1 reply; 7+ messages in thread
From: Andrew Morton @ 2013-10-08 22:14 UTC (permalink / raw)
  To: Jan Kara; +Cc: Al Viro, Wu Fengguang, linux-fsdevel, dchinner

On Fri, 27 Sep 2013 11:44:40 +0200 Jan Kara <jack@suse.cz> wrote:

> When there are processes heavily creating small files while sync(2) is
> running, it can easily happen that quite some new files are created
> between WB_SYNC_NONE and WB_SYNC_ALL pass of sync(2). That can happen
> especially if there are several busy filesystems (remember that sync
> traverses filesystems sequentially and waits in WB_SYNC_ALL phase on one
> fs before starting it on another fs). Because WB_SYNC_ALL pass is slow
> (e.g. causes a transaction commit and cache flush for each inode in
> ext3), resulting sync(2) times are rather large.
> 
> The following script reproduces the problem:
> 
> function run_writers
> {
>   for (( i = 0; i < 10; i++ )); do
>     mkdir $1/dir$i
>     for (( j = 0; j < 40000; j++ )); do
>       dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null
>     done &
>   done
> }
> 
> for dir in "$@"; do
>   run_writers $dir
> done
> 
> sleep 40
> time sync
> ======
> 
> Fix the problem by disregarding inodes dirtied after sync(2) was called
> in the WB_SYNC_ALL pass. To allow for this, sync_inodes_sb() now takes a
> time stamp when sync has started which is used for setting up work for
> flusher threads.
> 
> To give some numbers, when above script is run on two ext4 filesystems on
> simple SATA drive, the average sync time from 10 runs is 267.549 seconds
> with standard deviation 104.799426. With the patched kernel, the average
> sync time from 10 runs is 2.995 seconds with standard deviation 0.096.

We need to be really careful about this - it's easy to make mistakes
and the consequences are nasty.

> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -39,7 +39,7 @@
>  struct wb_writeback_work {
>  	long nr_pages;
>  	struct super_block *sb;
> -	unsigned long *older_than_this;
> +	unsigned long older_than_this;
>  	enum writeback_sync_modes sync_mode;
>  	unsigned int tagged_writepages:1;
>  	unsigned int for_kupdate:1;
> @@ -248,8 +248,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
>  
>  	while (!list_empty(delaying_queue)) {
>  		inode = wb_inode(delaying_queue->prev);
> -		if (work->older_than_this &&
> -		    inode_dirtied_after(inode, *work->older_than_this))
> +		if (inode_dirtied_after(inode, work->older_than_this))
>  			break;
>  		list_move(&inode->i_wb_list, &tmp);
>  		moved++;
> @@ -791,12 +790,11 @@ static long wb_writeback(struct bdi_writeback *wb,
>  {
>  	unsigned long wb_start = jiffies;
>  	long nr_pages = work->nr_pages;
> -	unsigned long oldest_jif;
>  	struct inode *inode;
>  	long progress;
>  
> -	oldest_jif = jiffies;
> -	work->older_than_this = &oldest_jif;
> +	if (!work->older_than_this)
> +		work->older_than_this = jiffies;

So wb_writeback_work.older_than_this==0 has special (and undocumented!)
meaning.  But 0 is a valid jiffies value (it occurs 5 minutes after
boot, too).  What happens?

If the caller passed in "jiffies" at that time, things will presumably
work, by luck, because we'll overwrite the caller's zero with another
zero.  Most of the time - things might go wrong if jiffies increments
to 1.

But what happens if the caller was kupdate, exactly 330 seconds after
boot?  Won't we overwrite the caller's "older than 330 seconds" with
"older than 300 seconds" (or something like that)?

If this has all been thought through then let's explain how it works,
please.

Perhaps it would be better to just stop using the
wb_writeback_work.older_than_this==0 magic sentinel and add a new
older_than_this_is_set:1 to the wb_writeback_work.


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] writeback: Do not sync data dirtied after sync start
  2013-10-08 22:14 ` Andrew Morton
@ 2013-10-09 14:02   ` Jan Kara
  2013-10-09 15:03     ` Jan Kara
  0 siblings, 1 reply; 7+ messages in thread
From: Jan Kara @ 2013-10-09 14:02 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jan Kara, Al Viro, Wu Fengguang, linux-fsdevel, dchinner

[-- Attachment #1: Type: text/plain, Size: 4694 bytes --]

On Tue 08-10-13 15:14:09, Andrew Morton wrote:
> On Fri, 27 Sep 2013 11:44:40 +0200 Jan Kara <jack@suse.cz> wrote:
> 
> > When there are processes heavily creating small files while sync(2) is
> > running, it can easily happen that quite some new files are created
> > between WB_SYNC_NONE and WB_SYNC_ALL pass of sync(2). That can happen
> > especially if there are several busy filesystems (remember that sync
> > traverses filesystems sequentially and waits in WB_SYNC_ALL phase on one
> > fs before starting it on another fs). Because WB_SYNC_ALL pass is slow
> > (e.g. causes a transaction commit and cache flush for each inode in
> > ext3), resulting sync(2) times are rather large.
> > 
> > The following script reproduces the problem:
> > 
> > function run_writers
> > {
> >   for (( i = 0; i < 10; i++ )); do
> >     mkdir $1/dir$i
> >     for (( j = 0; j < 40000; j++ )); do
> >       dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null
> >     done &
> >   done
> > }
> > 
> > for dir in "$@"; do
> >   run_writers $dir
> > done
> > 
> > sleep 40
> > time sync
> > ======
> > 
> > Fix the problem by disregarding inodes dirtied after sync(2) was called
> > in the WB_SYNC_ALL pass. To allow for this, sync_inodes_sb() now takes a
> > time stamp when sync has started which is used for setting up work for
> > flusher threads.
> > 
> > To give some numbers, when above script is run on two ext4 filesystems on
> > simple SATA drive, the average sync time from 10 runs is 267.549 seconds
> > with standard deviation 104.799426. With the patched kernel, the average
> > sync time from 10 runs is 2.995 seconds with standard deviation 0.096.
> 
> We need to be really careful about this - it's easy to make mistakes
> and the consequences are nasty.
  Agreed. Nasty and hard to notice.

> > --- a/fs/fs-writeback.c
> > +++ b/fs/fs-writeback.c
> > @@ -39,7 +39,7 @@
> >  struct wb_writeback_work {
> >  	long nr_pages;
> >  	struct super_block *sb;
> > -	unsigned long *older_than_this;
> > +	unsigned long older_than_this;
> >  	enum writeback_sync_modes sync_mode;
> >  	unsigned int tagged_writepages:1;
> >  	unsigned int for_kupdate:1;
> > @@ -248,8 +248,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
> >  
> >  	while (!list_empty(delaying_queue)) {
> >  		inode = wb_inode(delaying_queue->prev);
> > -		if (work->older_than_this &&
> > -		    inode_dirtied_after(inode, *work->older_than_this))
> > +		if (inode_dirtied_after(inode, work->older_than_this))
> >  			break;
> >  		list_move(&inode->i_wb_list, &tmp);
> >  		moved++;
> > @@ -791,12 +790,11 @@ static long wb_writeback(struct bdi_writeback *wb,
> >  {
> >  	unsigned long wb_start = jiffies;
> >  	long nr_pages = work->nr_pages;
> > -	unsigned long oldest_jif;
> >  	struct inode *inode;
> >  	long progress;
> >  
> > -	oldest_jif = jiffies;
> > -	work->older_than_this = &oldest_jif;
> > +	if (!work->older_than_this)
> > +		work->older_than_this = jiffies;
> 
> So wb_writeback_work.older_than_this==0 has special (and undocumented!)
> meaning.  But 0 is a valid jiffies value (it occurs 5 minutes after
> boot, too).  What happens?
> 
> If the caller passed in "jiffies" at that time, things will presumably
> work, by luck, because we'll overwrite the caller's zero with another
> zero.  Most of the time - things might go wrong if jiffies increments
> to 1.
> 
> But what happens if the caller was kupdate, exactly 330 seconds after
> boot?  Won't we overwrite the caller's "older than 330 seconds" with
> "older than 300 seconds" (or something like that)?
> 
> If this has all been thought through then let's explain how it works,
> please.
  Yes, I was thinking about this and consequences seemed harmless to me. If
the submitter of 'work' sets older_than_this but by coincidence it ends up
being 0, we reset older_than_this to 'jiffies' in wb_writeback(). That can
result in writing more than we strictly need but that doesn't cause any
harm (except for possibly slower execution of that work item).

> Perhaps it would be better to just stop using the
> wb_writeback_work.older_than_this==0 magic sentinel and add a new
> older_than_this_is_set:1 to the wb_writeback_work.
  You are right it would be a cleaner solution. OTOH it creates a
possibility for someone to set older_than_this but forget to set
older_than_this_is_set. But all users are localized in fs/fs-writeback.c
and we don't create new users that often. So I guess it's OK.

Attached is a patch which implements what you've asked for. Either fold it
into my previous patch or carry it separately. I don't really mind. Thanks
for having a look at my patch.

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

[-- Attachment #2: 0001-writeback-Use-older_than_this_is_set-instead-of-magi.patch --]
[-- Type: text/x-patch, Size: 1935 bytes --]

>From c1c60bd8e655a7db872473ad7436d957e3a7d5fd Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 9 Oct 2013 15:41:50 +0200
Subject: [PATCH] writeback: Use older_than_this_is_set instead of magic
 older_than_this == 0

Currently we use 0 as a special value of work->older_than_this to
indicate that wb_writeback() should set work->older_that_this to current
time. This works but it is a bit magic. So use a special flag in
work_struct for that.

Also fixup writeback from workqueue rescuer to include all inodes.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/fs-writeback.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 70837dadad72..f3871e5c61aa 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -46,6 +46,7 @@ struct wb_writeback_work {
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
 	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
+	unsigned int older_than_this_is_set:1;
 	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
@@ -732,6 +733,8 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 		.sync_mode	= WB_SYNC_NONE,
 		.range_cyclic	= 1,
 		.reason		= reason,
+		.older_than_this = jiffies,
+		.older_than_this_is_set = 1,
 	};
 
 	spin_lock(&wb->list_lock);
@@ -793,7 +796,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 	struct inode *inode;
 	long progress;
 
-	if (!work->older_than_this)
+	if (!work->older_than_this_is_set)
 		work->older_than_this = jiffies;
 
 	spin_lock(&wb->list_lock);
@@ -1356,6 +1359,7 @@ void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this)
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
 		.older_than_this = older_than_this,
+		.older_than_this_is_set = 1,
 		.range_cyclic	= 0,
 		.done		= &done,
 		.reason		= WB_REASON_SYNC,
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] writeback: Do not sync data dirtied after sync start
  2013-10-09 14:02   ` Jan Kara
@ 2013-10-09 15:03     ` Jan Kara
  2013-10-09 21:21       ` Andrew Morton
  0 siblings, 1 reply; 7+ messages in thread
From: Jan Kara @ 2013-10-09 15:03 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jan Kara, Al Viro, Wu Fengguang, linux-fsdevel, dchinner

[-- Attachment #1: Type: text/plain, Size: 5093 bytes --]

On Wed 09-10-13 16:02:11, Jan Kara wrote:
> On Tue 08-10-13 15:14:09, Andrew Morton wrote:
> > On Fri, 27 Sep 2013 11:44:40 +0200 Jan Kara <jack@suse.cz> wrote:
> > 
> > > When there are processes heavily creating small files while sync(2) is
> > > running, it can easily happen that quite some new files are created
> > > between WB_SYNC_NONE and WB_SYNC_ALL pass of sync(2). That can happen
> > > especially if there are several busy filesystems (remember that sync
> > > traverses filesystems sequentially and waits in WB_SYNC_ALL phase on one
> > > fs before starting it on another fs). Because WB_SYNC_ALL pass is slow
> > > (e.g. causes a transaction commit and cache flush for each inode in
> > > ext3), resulting sync(2) times are rather large.
> > > 
> > > The following script reproduces the problem:
> > > 
> > > function run_writers
> > > {
> > >   for (( i = 0; i < 10; i++ )); do
> > >     mkdir $1/dir$i
> > >     for (( j = 0; j < 40000; j++ )); do
> > >       dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null
> > >     done &
> > >   done
> > > }
> > > 
> > > for dir in "$@"; do
> > >   run_writers $dir
> > > done
> > > 
> > > sleep 40
> > > time sync
> > > ======
> > > 
> > > Fix the problem by disregarding inodes dirtied after sync(2) was called
> > > in the WB_SYNC_ALL pass. To allow for this, sync_inodes_sb() now takes a
> > > time stamp when sync has started which is used for setting up work for
> > > flusher threads.
> > > 
> > > To give some numbers, when above script is run on two ext4 filesystems on
> > > simple SATA drive, the average sync time from 10 runs is 267.549 seconds
> > > with standard deviation 104.799426. With the patched kernel, the average
> > > sync time from 10 runs is 2.995 seconds with standard deviation 0.096.
> > 
> > We need to be really careful about this - it's easy to make mistakes
> > and the consequences are nasty.
>   Agreed. Nasty and hard to notice.
> 
> > > --- a/fs/fs-writeback.c
> > > +++ b/fs/fs-writeback.c
> > > @@ -39,7 +39,7 @@
> > >  struct wb_writeback_work {
> > >  	long nr_pages;
> > >  	struct super_block *sb;
> > > -	unsigned long *older_than_this;
> > > +	unsigned long older_than_this;
> > >  	enum writeback_sync_modes sync_mode;
> > >  	unsigned int tagged_writepages:1;
> > >  	unsigned int for_kupdate:1;
> > > @@ -248,8 +248,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
> > >  
> > >  	while (!list_empty(delaying_queue)) {
> > >  		inode = wb_inode(delaying_queue->prev);
> > > -		if (work->older_than_this &&
> > > -		    inode_dirtied_after(inode, *work->older_than_this))
> > > +		if (inode_dirtied_after(inode, work->older_than_this))
> > >  			break;
> > >  		list_move(&inode->i_wb_list, &tmp);
> > >  		moved++;
> > > @@ -791,12 +790,11 @@ static long wb_writeback(struct bdi_writeback *wb,
> > >  {
> > >  	unsigned long wb_start = jiffies;
> > >  	long nr_pages = work->nr_pages;
> > > -	unsigned long oldest_jif;
> > >  	struct inode *inode;
> > >  	long progress;
> > >  
> > > -	oldest_jif = jiffies;
> > > -	work->older_than_this = &oldest_jif;
> > > +	if (!work->older_than_this)
> > > +		work->older_than_this = jiffies;
> > 
> > So wb_writeback_work.older_than_this==0 has special (and undocumented!)
> > meaning.  But 0 is a valid jiffies value (it occurs 5 minutes after
> > boot, too).  What happens?
> > 
> > If the caller passed in "jiffies" at that time, things will presumably
> > work, by luck, because we'll overwrite the caller's zero with another
> > zero.  Most of the time - things might go wrong if jiffies increments
> > to 1.
> > 
> > But what happens if the caller was kupdate, exactly 330 seconds after
> > boot?  Won't we overwrite the caller's "older than 330 seconds" with
> > "older than 300 seconds" (or something like that)?
> > 
> > If this has all been thought through then let's explain how it works,
> > please.
>   Yes, I was thinking about this and consequences seemed harmless to me. If
> the submitter of 'work' sets older_than_this but by coincidence it ends up
> being 0, we reset older_than_this to 'jiffies' in wb_writeback(). That can
> result in writing more than we strictly need but that doesn't cause any
> harm (except for possibly slower execution of that work item).
> 
> > Perhaps it would be better to just stop using the
> > wb_writeback_work.older_than_this==0 magic sentinel and add a new
> > older_than_this_is_set:1 to the wb_writeback_work.
>   You are right it would be a cleaner solution. OTOH it creates a
> possibility for someone to set older_than_this but forget to set
> older_than_this_is_set. But all users are localized in fs/fs-writeback.c
> and we don't create new users that often. So I guess it's OK.
> 
> Attached is a patch which implements what you've asked for. Either fold it
> into my previous patch or carry it separately. I don't really mind. Thanks
> for having a look at my patch.
  I found out I've sent out an older version of the patch without a comment
in struct wb_writeback_work. So here's a newer version.

								Honza

-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

[-- Attachment #2: 0001-writeback-Use-older_than_this_is_set-instead-of-magi.patch --]
[-- Type: text/x-patch, Size: 2265 bytes --]

>From 27d1017e2f7dd0e4a40f9ff38926a6a11cdd5cfb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 9 Oct 2013 15:41:50 +0200
Subject: [PATCH] writeback: Use older_than_this_is_set instead of magic
 older_than_this == 0

Currently we use 0 as a special value of work->older_than_this to
indicate that wb_writeback() should set work->older_that_this to current
time. This works but it is a bit magic. So use a special flag in
work_struct for that.

Also fixup writeback from workqueue rescuer to include all inodes.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/fs-writeback.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 70837dadad72..65e66caec76f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -39,6 +39,10 @@
 struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
+	/*
+	 * Write only inodes dirtied before this time. Don't forget to set
+	 * older_than_this_is_set when you set this.
+	 */
 	unsigned long older_than_this;
 	enum writeback_sync_modes sync_mode;
 	unsigned int tagged_writepages:1;
@@ -46,6 +50,7 @@ struct wb_writeback_work {
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
 	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
+	unsigned int older_than_this_is_set:1;
 	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
@@ -732,6 +737,8 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 		.sync_mode	= WB_SYNC_NONE,
 		.range_cyclic	= 1,
 		.reason		= reason,
+		.older_than_this = jiffies,
+		.older_than_this_is_set = 1,
 	};
 
 	spin_lock(&wb->list_lock);
@@ -793,7 +800,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 	struct inode *inode;
 	long progress;
 
-	if (!work->older_than_this)
+	if (!work->older_than_this_is_set)
 		work->older_than_this = jiffies;
 
 	spin_lock(&wb->list_lock);
@@ -1356,6 +1363,7 @@ void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this)
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
 		.older_than_this = older_than_this,
+		.older_than_this_is_set = 1,
 		.range_cyclic	= 0,
 		.done		= &done,
 		.reason		= WB_REASON_SYNC,
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] writeback: Do not sync data dirtied after sync start
  2013-10-09 15:03     ` Jan Kara
@ 2013-10-09 21:21       ` Andrew Morton
  2013-10-09 22:14         ` Jan Kara
  0 siblings, 1 reply; 7+ messages in thread
From: Andrew Morton @ 2013-10-09 21:21 UTC (permalink / raw)
  To: Jan Kara; +Cc: Al Viro, Wu Fengguang, linux-fsdevel, dchinner

On Wed, 9 Oct 2013 17:03:25 +0200 Jan Kara <jack@suse.cz> wrote:

> From: Jan Kara <jack@suse.cz>
> Date: Wed, 9 Oct 2013 15:41:50 +0200
> Subject: [PATCH] writeback: Use older_than_this_is_set instead of magic
>  older_than_this == 0
> 
> Currently we use 0 as a special value of work->older_than_this to
> indicate that wb_writeback() should set work->older_that_this to current
> time. This works but it is a bit magic. So use a special flag in
> work_struct for that.

OK.

> -	if (!work->older_than_this)
> +	if (!work->older_than_this_is_set)
> 		work->older_than_this = jiffies;

It would be logical although presumably unneeded to set
older_than_this_is_set here?

> Also fixup writeback from workqueue rescuer to include all inodes.

There's nothing in the patch which matches this sentence?


From: Jan Kara <jack@suse.cz>
Subject: writeback: use older_than_this_is_set instead of magic older_than_this == 0

Currently we use 0 as a special value of work->older_than_this to
indicate that wb_writeback() should set work->older_that_this to current
time. This works but it is a bit magic. So use a special flag in
work_struct for that.

Also fixup writeback from workqueue rescuer to include all inodes.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 fs/fs-writeback.c |   10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff -puN fs/fs-writeback.c~writeback-do-not-sync-data-dirtied-after-sync-start-fix fs/fs-writeback.c
--- a/fs/fs-writeback.c~writeback-do-not-sync-data-dirtied-after-sync-start-fix
+++ a/fs/fs-writeback.c
@@ -39,6 +39,10 @@
 struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
+	/*
+	 * Write only inodes dirtied before this time. Don't forget to set
+	 * older_than_this_is_set when you set this.
+	 */
 	unsigned long older_than_this;
 	enum writeback_sync_modes sync_mode;
 	unsigned int tagged_writepages:1;
@@ -46,6 +50,7 @@ struct wb_writeback_work {
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
 	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
+	unsigned int older_than_this_is_set:1;
 	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
@@ -732,6 +737,8 @@ static long writeback_inodes_wb(struct b
 		.sync_mode	= WB_SYNC_NONE,
 		.range_cyclic	= 1,
 		.reason		= reason,
+		.older_than_this = jiffies,
+		.older_than_this_is_set = 1,
 	};
 
 	spin_lock(&wb->list_lock);
@@ -793,7 +800,7 @@ static long wb_writeback(struct bdi_writ
 	struct inode *inode;
 	long progress;
 
-	if (!work->older_than_this)
+	if (!work->older_than_this_is_set)
 		work->older_than_this = jiffies;
 
 	spin_lock(&wb->list_lock);
@@ -1356,6 +1363,7 @@ void sync_inodes_sb(struct super_block *
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
 		.older_than_this = older_than_this,
+		.older_than_this_is_set = 1,
 		.range_cyclic	= 0,
 		.done		= &done,
 		.reason		= WB_REASON_SYNC,
_


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] writeback: Do not sync data dirtied after sync start
  2013-10-09 21:21       ` Andrew Morton
@ 2013-10-09 22:14         ` Jan Kara
  0 siblings, 0 replies; 7+ messages in thread
From: Jan Kara @ 2013-10-09 22:14 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jan Kara, Al Viro, Wu Fengguang, linux-fsdevel, dchinner

[-- Attachment #1: Type: text/plain, Size: 1767 bytes --]

On Wed 09-10-13 14:21:25, Andrew Morton wrote:
> On Wed, 9 Oct 2013 17:03:25 +0200 Jan Kara <jack@suse.cz> wrote:
> 
> > From: Jan Kara <jack@suse.cz>
> > Date: Wed, 9 Oct 2013 15:41:50 +0200
> > Subject: [PATCH] writeback: Use older_than_this_is_set instead of magic
> >  older_than_this == 0
> > 
> > Currently we use 0 as a special value of work->older_than_this to
> > indicate that wb_writeback() should set work->older_that_this to current
> > time. This works but it is a bit magic. So use a special flag in
> > work_struct for that.
> 
> OK.
> 
> > -	if (!work->older_than_this)
> > +	if (!work->older_than_this_is_set)
> > 		work->older_than_this = jiffies;
> 
> It would be logical although presumably unneeded to set
> older_than_this_is_set here?
  Yes. Updated.

> > Also fixup writeback from workqueue rescuer to include all inodes.
> 
> There's nothing in the patch which matches this sentence?
  The sentence is about the hunk below. writeback_inodes_wb() is special in
that it directly calls queue_io() (everything else goes through
wb_writeback()) and my previous patch thus resulted in using 0 as an
older_than_this value => likely we wouldn't queue any inodes for writeback.

I've added WARN_ON_ONCE into move_expired_inodes() to increase a chance of
catching such mistakes in future (although in this particular case it
wouldn't really help because writeback_inodes_wb() gets hardly ever
called).

> @@ -732,6 +737,8 @@ static long writeback_inodes_wb(struct b
>  		.sync_mode	= WB_SYNC_NONE,
>  		.range_cyclic	= 1,
>  		.reason		= reason,
> +		.older_than_this = jiffies,
> +		.older_than_this_is_set = 1,
>  	};
>  
>  	spin_lock(&wb->list_lock);

Updated version of the patch attached.

							Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

[-- Attachment #2: 0001-writeback-Use-older_than_this_is_set-instead-of-magi.patch --]
[-- Type: text/x-patch, Size: 2782 bytes --]

>From a872d2374364ac1d8f0a9a6fa529afdc06f9d1a1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 9 Oct 2013 15:41:50 +0200
Subject: [PATCH] writeback: Use older_than_this_is_set instead of magic
 older_than_this == 0

Currently we use 0 as a special value of work->older_than_this to
indicate that wb_writeback() should set work->older_that_this to current
time. This works but it is a bit magic. So use a special flag in
work_struct for that.

Also fixup writeback from workqueue rescuer (writeback_inodes_wb()) to
include all inodes. Currently it would use 0 as an older_than_this value
thus queue_io() would likely not queue any inodes for writeback.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/fs-writeback.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 70837dadad72..ba332d2c596a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -39,6 +39,10 @@
 struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
+	/*
+	 * Write only inodes dirtied before this time. Don't forget to set
+	 * older_than_this_is_set when you set this.
+	 */
 	unsigned long older_than_this;
 	enum writeback_sync_modes sync_mode;
 	unsigned int tagged_writepages:1;
@@ -46,6 +50,7 @@ struct wb_writeback_work {
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
 	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
+	unsigned int older_than_this_is_set:1;
 	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
@@ -246,6 +251,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 	int do_sb_sort = 0;
 	int moved = 0;
 
+	WARN_ON_ONCE(!work->older_than_this_is_set);
 	while (!list_empty(delaying_queue)) {
 		inode = wb_inode(delaying_queue->prev);
 		if (inode_dirtied_after(inode, work->older_than_this))
@@ -732,6 +738,8 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 		.sync_mode	= WB_SYNC_NONE,
 		.range_cyclic	= 1,
 		.reason		= reason,
+		.older_than_this = jiffies,
+		.older_than_this_is_set = 1,
 	};
 
 	spin_lock(&wb->list_lock);
@@ -793,8 +801,10 @@ static long wb_writeback(struct bdi_writeback *wb,
 	struct inode *inode;
 	long progress;
 
-	if (!work->older_than_this)
+	if (!work->older_than_this_is_set) {
 		work->older_than_this = jiffies;
+		work->older_than_this_is_set = 1;
+	}
 
 	spin_lock(&wb->list_lock);
 	for (;;) {
@@ -1356,6 +1366,7 @@ void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this)
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
 		.older_than_this = older_than_this,
+		.older_than_this_is_set = 1,
 		.range_cyclic	= 0,
 		.done		= &done,
 		.reason		= WB_REASON_SYNC,
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2013-10-09 22:14 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-09-27  9:44 [PATCH v3] writeback: Do not sync data dirtied after sync start Jan Kara
2013-09-29 23:12 ` Dave Chinner
2013-10-08 22:14 ` Andrew Morton
2013-10-09 14:02   ` Jan Kara
2013-10-09 15:03     ` Jan Kara
2013-10-09 21:21       ` Andrew Morton
2013-10-09 22:14         ` Jan Kara

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).