[PATCH] writeback: fix race that cause writeback hung

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] writeback: fix race that cause writeback hung
@ 2013-08-16  8:57 Junxiao Bi
  2013-08-16 13:53 ` Jan Kara
  0 siblings, 1 reply; 3+ messages in thread
From: Junxiao Bi @ 2013-08-16  8:57 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: fengguang.wu

There is a race between mark inode dirty and writeback thread,
see the following scenario. In this case, writeback thread will
not run though there is dirty_io.

__mark_inode_dirty()                                          bdi_writeback_workfn()
	...                                                       	...
	spin_lock(&inode->i_lock);
	...
	if (bdi_cap_writeback_dirty(bdi)) {
	    <<< assume wb has dirty_io, so wakeup_bdi is false.
	    <<< the following inode_dirty also have wakeup_bdi false.
	    if (!wb_has_dirty_io(&bdi->wb))
		    wakeup_bdi = true;
	}
	spin_unlock(&inode->i_lock);
	                                                            <<< assume last dirty_io is removed here.
	                                                            pages_written = wb_do_writeback(wb);
	                                                            ...
	                                                            <<< work_list empty and wb has no dirty_io,
	                                                            <<< delayed_work will not be queued.
	                                                            if (!list_empty(&bdi->work_list) ||
	                                                                (wb_has_dirty_io(wb) && dirty_writeback_interval))
	                                                                queue_delayed_work(bdi_wq, &wb->dwork,
	                                                                    msecs_to_jiffies(dirty_writeback_interval * 10));
	spin_lock(&bdi->wb.list_lock);
	inode->dirtied_when = jiffies;
	<<< new dirty_io is added.
	list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
	spin_unlock(&bdi->wb.list_lock);

	<<< though there is dirty_io, but wakeup_bdi is false,
	<<< so writeback thread will not be waked up and
	<<< the new dirty_io will not be flushed.
	if (wakeup_bdi)
	    bdi_wakeup_thread_delayed(bdi);

Writeback will run until there is a new flush work queued.
This may cause a lot of dirty pages stay in memory for a long time.

Cc: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
---
 fs/fs-writeback.c |    7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 68851ff..72e6275 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1140,6 +1140,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if (unlikely(block_dump))
 		block_dump___mark_inode_dirty(inode);
 
+	bdi = inode_to_bdi(inode);
+	spin_lock(&bdi->wb.list_lock);
 	spin_lock(&inode->i_lock);
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
@@ -1171,7 +1173,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 */
 		if (!was_dirty) {
 			bool wakeup_bdi = false;
-			bdi = inode_to_bdi(inode);
 
 			if (bdi_cap_writeback_dirty(bdi)) {
 				WARN(!test_bit(BDI_registered, &bdi->state),
@@ -1187,10 +1188,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 					wakeup_bdi = true;
 			}
 
-			spin_unlock(&inode->i_lock);
-			spin_lock(&bdi->wb.list_lock);
 			inode->dirtied_when = jiffies;
 			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+			spin_unlock(&inode->i_lock);
 			spin_unlock(&bdi->wb.list_lock);
 
 			if (wakeup_bdi)
@@ -1200,6 +1200,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	}
 out_unlock_inode:
 	spin_unlock(&inode->i_lock);
+	spin_unlock(&bdi->wb.list_lock);
 
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] writeback: fix race that cause writeback hung
  2013-08-16  8:57 [PATCH] writeback: fix race that cause writeback hung Junxiao Bi
@ 2013-08-16 13:53 ` Jan Kara
  0 siblings, 0 replies; 3+ messages in thread
From: Jan Kara @ 2013-08-16 13:53 UTC (permalink / raw)
  To: Junxiao Bi; +Cc: linux-fsdevel, fengguang.wu

On Fri 16-08-13 16:57:39, Junxiao Bi wrote:
> There is a race between mark inode dirty and writeback thread,
> see the following scenario. In this case, writeback thread will
> not run though there is dirty_io.
> 
> __mark_inode_dirty()                                          bdi_writeback_workfn()
> 	...                                                       	...
> 	spin_lock(&inode->i_lock);
> 	...
> 	if (bdi_cap_writeback_dirty(bdi)) {
> 	    <<< assume wb has dirty_io, so wakeup_bdi is false.
> 	    <<< the following inode_dirty also have wakeup_bdi false.
> 	    if (!wb_has_dirty_io(&bdi->wb))
> 		    wakeup_bdi = true;
> 	}
> 	spin_unlock(&inode->i_lock);
> 	                                                            <<< assume last dirty_io is removed here.
> 	                                                            pages_written = wb_do_writeback(wb);
> 	                                                            ...
> 	                                                            <<< work_list empty and wb has no dirty_io,
> 	                                                            <<< delayed_work will not be queued.
> 	                                                            if (!list_empty(&bdi->work_list) ||
> 	                                                                (wb_has_dirty_io(wb) && dirty_writeback_interval))
> 	                                                                queue_delayed_work(bdi_wq, &wb->dwork,
> 	                                                                    msecs_to_jiffies(dirty_writeback_interval * 10));
> 	spin_lock(&bdi->wb.list_lock);
> 	inode->dirtied_when = jiffies;
> 	<<< new dirty_io is added.
> 	list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
> 	spin_unlock(&bdi->wb.list_lock);
> 
> 	<<< though there is dirty_io, but wakeup_bdi is false,
> 	<<< so writeback thread will not be waked up and
> 	<<< the new dirty_io will not be flushed.
> 	if (wakeup_bdi)
> 	    bdi_wakeup_thread_delayed(bdi);
> 
> Writeback will run until there is a new flush work queued.
> This may cause a lot of dirty pages stay in memory for a long time.
  Hum, I thought I was already fixing this race but apparently I wasn't...

> Cc: Fengguang Wu <fengguang.wu@intel.com>
> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
> ---
>  fs/fs-writeback.c |    7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 68851ff..72e6275 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -1140,6 +1140,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  	if (unlikely(block_dump))
>  		block_dump___mark_inode_dirty(inode);
>  
> +	bdi = inode_to_bdi(inode);
> +	spin_lock(&bdi->wb.list_lock);
>  	spin_lock(&inode->i_lock);
  Is it really necessary to move list_lock so early? Won't it be enough to
move dropping of i_lock and acquisition of list_lock just after doing
inode_to_bdi()? Then wb_has_dirty_io() and list_add() would be both under
list_lock and things should be fine and we'd have shorter critical
sections...

									Honza
>  	if ((inode->i_state & flags) != flags) {
>  		const int was_dirty = inode->i_state & I_DIRTY;
> @@ -1171,7 +1173,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  		 */
>  		if (!was_dirty) {
>  			bool wakeup_bdi = false;
> -			bdi = inode_to_bdi(inode);
>  
>  			if (bdi_cap_writeback_dirty(bdi)) {
>  				WARN(!test_bit(BDI_registered, &bdi->state),
> @@ -1187,10 +1188,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  					wakeup_bdi = true;
>  			}
>  
> -			spin_unlock(&inode->i_lock);
> -			spin_lock(&bdi->wb.list_lock);
>  			inode->dirtied_when = jiffies;
>  			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
> +			spin_unlock(&inode->i_lock);
>  			spin_unlock(&bdi->wb.list_lock);
>  
>  			if (wakeup_bdi)
> @@ -1200,6 +1200,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  	}
>  out_unlock_inode:
>  	spin_unlock(&inode->i_lock);
> +	spin_unlock(&bdi->wb.list_lock);
>  
>  }
>  EXPORT_SYMBOL(__mark_inode_dirty);
> -- 
> 1.7.9.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] writeback: fix race that cause writeback hung
@ 2013-08-17  3:28 Junxiao Bi
  0 siblings, 0 replies; 3+ messages in thread
From: Junxiao Bi @ 2013-08-17  3:28 UTC (permalink / raw)
  To: jack; +Cc: fengguang.wu, linux-fsdevel

Hi Jan,

----- Original Message -----
From: jack@suse.cz
To: junxiao.bi@oracle.com
Cc: linux-fsdevel@vger.kernel.org, fengguang.wu@intel.com
Sent: Friday, August 16, 2013 9:53:29 PM GMT +08:00 Beijing / Chongqing / Hong Kong / Urumqi
Subject: Re: [PATCH] writeback: fix race that cause writeback hung

On Fri 16-08-13 16:57:39, Junxiao Bi wrote:
> There is a race between mark inode dirty and writeback thread,
> see the following scenario. In this case, writeback thread will
> not run though there is dirty_io.
> 
> __mark_inode_dirty()                                          bdi_writeback_workfn()
> 	...                                                       	...
> 	spin_lock(&inode->i_lock);
> 	...
> 	if (bdi_cap_writeback_dirty(bdi)) {
> 	    <<< assume wb has dirty_io, so wakeup_bdi is false.
> 	    <<< the following inode_dirty also have wakeup_bdi false.
> 	    if (!wb_has_dirty_io(&bdi->wb))
> 		    wakeup_bdi = true;
> 	}
> 	spin_unlock(&inode->i_lock);
> 	                                                            <<< assume last dirty_io is removed here.
> 	                                                            pages_written = wb_do_writeback(wb);
> 	                                                            ...
> 	                                                            <<< work_list empty and wb has no dirty_io,
> 	                                                            <<< delayed_work will not be queued.
> 	                                                            if (!list_empty(&bdi->work_list) ||
> 	                                                                (wb_has_dirty_io(wb) && dirty_writeback_interval))
> 	                                                                queue_delayed_work(bdi_wq, &wb->dwork,
> 	                                                                    msecs_to_jiffies(dirty_writeback_interval * 10));
> 	spin_lock(&bdi->wb.list_lock);
> 	inode->dirtied_when = jiffies;
> 	<<< new dirty_io is added.
> 	list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
> 	spin_unlock(&bdi->wb.list_lock);
> 
> 	<<< though there is dirty_io, but wakeup_bdi is false,
> 	<<< so writeback thread will not be waked up and
> 	<<< the new dirty_io will not be flushed.
> 	if (wakeup_bdi)
> 	    bdi_wakeup_thread_delayed(bdi);
> 
> Writeback will run until there is a new flush work queued.
> This may cause a lot of dirty pages stay in memory for a long time.
  Hum, I thought I was already fixing this race but apparently I wasn't...

> Cc: Fengguang Wu <fengguang.wu@intel.com>
> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
> ---
>  fs/fs-writeback.c |    7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 68851ff..72e6275 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -1140,6 +1140,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  	if (unlikely(block_dump))
>  		block_dump___mark_inode_dirty(inode);
>  
> +	bdi = inode_to_bdi(inode);
> +	spin_lock(&bdi->wb.list_lock);
>  	spin_lock(&inode->i_lock);
  Is it really necessary to move list_lock so early? Won't it be enough to
move dropping of i_lock and acquisition of list_lock just after doing
inode_to_bdi()? Then wb_has_dirty_io() and list_add() would be both under
list_lock and things should be fine and we'd have shorter critical
sections...

Yes, agree, thanks for review it, will send v2 patch.

Thanks,
Junxiao.

									Honza
>  	if ((inode->i_state & flags) != flags) {
>  		const int was_dirty = inode->i_state & I_DIRTY;
> @@ -1171,7 +1173,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  		 */
>  		if (!was_dirty) {
>  			bool wakeup_bdi = false;
> -			bdi = inode_to_bdi(inode);
>  
>  			if (bdi_cap_writeback_dirty(bdi)) {
>  				WARN(!test_bit(BDI_registered, &bdi->state),
> @@ -1187,10 +1188,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  					wakeup_bdi = true;
>  			}
>  
> -			spin_unlock(&inode->i_lock);
> -			spin_lock(&bdi->wb.list_lock);
>  			inode->dirtied_when = jiffies;
>  			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
> +			spin_unlock(&inode->i_lock);
>  			spin_unlock(&bdi->wb.list_lock);
>  
>  			if (wakeup_bdi)
> @@ -1200,6 +1200,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  	}
>  out_unlock_inode:
>  	spin_unlock(&inode->i_lock);
> +	spin_unlock(&bdi->wb.list_lock);
>  
>  }
>  EXPORT_SYMBOL(__mark_inode_dirty);
> -- 
> 1.7.9.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2013-08-17  3:28 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-08-16  8:57 [PATCH] writeback: fix race that cause writeback hung Junxiao Bi
2013-08-16 13:53 ` Jan Kara
  -- strict thread matches above, loose matches on Subject: below --
2013-08-17  3:28 Junxiao Bi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).