linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Zheng Liu <gnehzuil.liu@gmail.com>
To: Jan Kara <jack@suse.cz>
Cc: Ted Tso <tytso@mit.edu>, linux-ext4@vger.kernel.org
Subject: Re: [PATCH 20/29] ext4: Use transaction reservation for extent conversion in ext4_end_io
Date: Wed, 8 May 2013 14:57:13 +0800	[thread overview]
Message-ID: <20130508065713.GB20599@gmail.com> (raw)
In-Reply-To: <1365456754-29373-21-git-send-email-jack@suse.cz>

On Mon, Apr 08, 2013 at 11:32:25PM +0200, Jan Kara wrote:
> Later we would like to clear PageWriteback bit only after extent conversion
> from unwritten to written extents is performed. However it is not possible
> to start a transaction after PageWriteback is set because that violates
> lock ordering (and is easy to deadlock). So we have to reserve a transaction
> before locking pages and sending them for IO and later we use the transaction
> for extent conversion from ext4_end_io().
> 
> Signed-off-by: Jan Kara <jack@suse.cz>

Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Regards,
                                                - Zheng

> ---
>  fs/ext4/ext4.h      |   12 +++++++++---
>  fs/ext4/ext4_jbd2.h |    3 ++-
>  fs/ext4/extents.c   |   39 ++++++++++++++++++++++++++++-----------
>  fs/ext4/inode.c     |   32 ++++++++++++++++++++++++++++++--
>  fs/ext4/page-io.c   |   11 ++++++++---
>  5 files changed, 77 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 3c3827a..65adf0d 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -182,10 +182,13 @@ struct ext4_map_blocks {
>  #define EXT4_IO_END_DIRECT	0x0004
>  
>  /*
> - * For converting uninitialized extents on a work queue.
> + * For converting uninitialized extents on a work queue. 'handle' is used for
> + * buffered writeback.
>   */
>  typedef struct ext4_io_end {
>  	struct list_head	list;		/* per-file finished IO list */
> +	handle_t		*handle;	/* handle reserved for extent
> +						 * conversion */
>  	struct inode		*inode;		/* file being written to */
>  	unsigned int		flag;		/* unwritten or not */
>  	loff_t			offset;		/* offset in the file */
> @@ -1314,6 +1317,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
>  					      struct ext4_io_end *io_end)
>  {
>  	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
> +		/* Writeback has to have coversion transaction reserved */
> +		WARN_ON(!io_end->handle &&
> +			!(io_end->flag & EXT4_IO_END_DIRECT));
>  		io_end->flag |= EXT4_IO_END_UNWRITTEN;
>  		atomic_inc(&EXT4_I(inode)->i_unwritten);
>  	}
> @@ -2550,8 +2556,8 @@ extern void ext4_ext_init(struct super_block *);
>  extern void ext4_ext_release(struct super_block *);
>  extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
>  			  loff_t len);
> -extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
> -			  ssize_t len);
> +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
> +					  loff_t offset, ssize_t len);
>  extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
>  			   struct ext4_map_blocks *map, int flags);
>  extern int ext4_ext_calc_metadata_amount(struct inode *inode,
> diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
> index bb17931..88e95d7 100644
> --- a/fs/ext4/ext4_jbd2.h
> +++ b/fs/ext4/ext4_jbd2.h
> @@ -132,7 +132,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
>  #define EXT4_HT_MIGRATE          8
>  #define EXT4_HT_MOVE_EXTENTS     9
>  #define EXT4_HT_XATTR           10
> -#define EXT4_HT_MAX             11
> +#define EXT4_HT_EXT_CONVERT     11
> +#define EXT4_HT_MAX             12
>  
>  /**
>   *   struct ext4_journal_cb_entry - Base structure for callback information.
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 8064b71..ae22735 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4484,10 +4484,9 @@ retry:
>   * function, to convert the fallocated extents after IO is completed.
>   * Returns 0 on success.
>   */
> -int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
> -				    ssize_t len)
> +int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
> +				   loff_t offset, ssize_t len)
>  {
> -	handle_t *handle;
>  	unsigned int max_blocks;
>  	int ret = 0;
>  	int ret2 = 0;
> @@ -4502,16 +4501,31 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
>  	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
>  		      map.m_lblk);
>  	/*
> -	 * credits to insert 1 extent into extent tree
> +	 * This is somewhat ugly but the idea is clear: When transaction is
> +	 * reserved, everything goes into it. Otherwise we rather start several
> +	 * smaller transactions for conversion of each extent separately.
>  	 */
> -	credits = ext4_chunk_trans_blocks(inode, max_blocks);
> +	if (handle) {
> +		handle = ext4_journal_start_reserved(handle);
> +		if (IS_ERR(handle))
> +			return PTR_ERR(handle);
> +		credits = 0;
> +	} else {
> +		/*
> +		 * credits to insert 1 extent into extent tree
> +		 */
> +		credits = ext4_chunk_trans_blocks(inode, max_blocks);
> +	}
>  	while (ret >= 0 && ret < max_blocks) {
>  		map.m_lblk += ret;
>  		map.m_len = (max_blocks -= ret);
> -		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
> -		if (IS_ERR(handle)) {
> -			ret = PTR_ERR(handle);
> -			break;
> +		if (credits) {
> +			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
> +						    credits);
> +			if (IS_ERR(handle)) {
> +				ret = PTR_ERR(handle);
> +				break;
> +			}
>  		}
>  		ret = ext4_map_blocks(handle, inode, &map,
>  				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
> @@ -4522,10 +4536,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
>  				     inode->i_ino, map.m_lblk,
>  				     map.m_len, ret);
>  		ext4_mark_inode_dirty(handle, inode);
> -		ret2 = ext4_journal_stop(handle);
> -		if (ret <= 0 || ret2 )
> +		if (credits)
> +			ret2 = ext4_journal_stop(handle);
> +		if (ret <= 0 || ret2)
>  			break;
>  	}
> +	if (!credits)
> +		ret2 = ext4_journal_stop(handle);
>  	return ret > 0 ? ret2 : ret;
>  }
>  
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 0602a09..f8e78ce 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1327,6 +1327,8 @@ static void ext4_da_page_release_reservation(struct page *page,
>  struct mpage_da_data {
>  	struct inode *inode;
>  	struct writeback_control *wbc;
> +	handle_t *reserved_handle;	/* Handle reserved for conversion */
> +
>  	pgoff_t first_page;	/* The first page to write */
>  	pgoff_t next_page;	/* Current page to examine */
>  	pgoff_t last_page;	/* Last page to examine */
> @@ -1973,8 +1975,13 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
>  	err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
>  	if (err < 0)
>  		return err;
> -	if (map->m_flags & EXT4_MAP_UNINIT)
> +	if (map->m_flags & EXT4_MAP_UNINIT) {
> +		if (!mpd->io_submit.io_end->handle) {
> +			mpd->io_submit.io_end->handle = mpd->reserved_handle;
> +			mpd->reserved_handle = NULL;
> +		}
>  		ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
> +	}
>  
>  	BUG_ON(map->m_len == 0);
>  	if (map->m_flags & EXT4_MAP_NEW) {
> @@ -2274,6 +2281,7 @@ static int ext4_da_writepages(struct address_space *mapping,
>  
>  	mpd.inode = inode;
>  	mpd.wbc = wbc;
> +	mpd.reserved_handle = NULL;
>  	ext4_io_submit_init(&mpd.io_submit, wbc);
>  retry:
>  	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
> @@ -2288,6 +2296,23 @@ retry:
>  			break;
>  		}
>  
> +		/* Reserve handle if it may be needed for extent conversion */
> +		if (ext4_should_dioread_nolock(inode) && !mpd.reserved_handle) {
> +			/*
> +			 * We may need to convert upto one extent per block in
> +			 * the page and we may dirty the inode.
> +			 */
> +			mpd.reserved_handle = ext4_journal_reserve(inode,
> +				EXT4_HT_EXT_CONVERT,
> +				1 + (PAGE_CACHE_SIZE >> inode->i_blkbits));
> +			if (IS_ERR(mpd.reserved_handle)) {
> +				ret = PTR_ERR(mpd.reserved_handle);
> +				mpd.reserved_handle = NULL;
> +				ext4_put_io_end(mpd.io_submit.io_end);
> +				break;
> +			}
> +		}
> +
>  		/*
>  		 * We have two constraints: We find one extent to map and we
>  		 * must always write out whole page (makes a difference when
> @@ -2364,6 +2389,9 @@ retry:
>  		 */
>  		mapping->writeback_index = mpd.first_page;
>  
> +	if (mpd.reserved_handle)
> +		ext4_journal_free_reserved(mpd.reserved_handle);
> +
>  out_writepages:
>  	trace_ext4_da_writepages_result(inode, wbc, ret,
>  					nr_to_write - wbc->nr_to_write);
> @@ -2977,7 +3005,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  		 * for non AIO case, since the IO is already
>  		 * completed, we could do the conversion right here
>  		 */
> -		err = ext4_convert_unwritten_extents(inode,
> +		err = ext4_convert_unwritten_extents(NULL, inode,
>  						     offset, ret);
>  		if (err < 0)
>  			ret = err;
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index cc59cd9..e8ee4da 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -55,6 +55,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
>  {
>  	BUG_ON(!list_empty(&io_end->list));
>  	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
> +	WARN_ON(io_end->handle);
>  
>  	if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
>  		wake_up_all(ext4_ioend_wq(io_end->inode));
> @@ -81,13 +82,15 @@ static int ext4_end_io(ext4_io_end_t *io)
>  	struct inode *inode = io->inode;
>  	loff_t offset = io->offset;
>  	ssize_t size = io->size;
> +	handle_t *handle = io->handle;
>  	int ret = 0;
>  
>  	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
>  		   "list->prev 0x%p\n",
>  		   io, inode->i_ino, io->list.next, io->list.prev);
>  
> -	ret = ext4_convert_unwritten_extents(inode, offset, size);
> +	io->handle = NULL;	/* Following call will use up the handle */
> +	ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
>  	if (ret < 0) {
>  		ext4_msg(inode->i_sb, KERN_EMERG,
>  			 "failed to convert unwritten extents to written "
> @@ -217,8 +220,10 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
>  
>  	if (atomic_dec_and_test(&io_end->count)) {
>  		if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
> -			err = ext4_convert_unwritten_extents(io_end->inode,
> -						io_end->offset, io_end->size);
> +			err = ext4_convert_unwritten_extents(io_end->handle,
> +						io_end->inode, io_end->offset,
> +						io_end->size);
> +			io_end->handle = NULL;
>  			ext4_clear_io_unwritten_flag(io_end);
>  		}
>  		ext4_release_io_end(io_end);
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

  reply	other threads:[~2013-05-08  6:39 UTC|newest]

Thread overview: 76+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-04-08 21:32 [PATCH 00/22 v1] Fixes and improvements in ext4 writeback path Jan Kara
2013-04-08 21:32 ` [PATCH 01/29] ext4: Make ext4_bio_write_page() use BH_Async_Write flags instead page pointers from ext4_io_end Jan Kara
2013-04-10 18:05   ` Dmitry Monakhov
2013-04-11 13:38   ` Zheng Liu
2013-04-12  3:50   ` Theodore Ts'o
2013-04-08 21:32 ` [PATCH 02/29] ext4: Use io_end for multiple bios Jan Kara
2013-04-11  5:10   ` Dmitry Monakhov
2013-04-11 14:04   ` Zheng Liu
2013-04-12  3:55   ` Theodore Ts'o
2013-04-08 21:32 ` [PATCH 03/29] ext4: Clear buffer_uninit flag when submitting IO Jan Kara
2013-04-11 14:08   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 04/29] jbd2: Reduce journal_head size Jan Kara
2013-04-11 14:10   ` Zheng Liu
2013-04-12  4:04   ` Theodore Ts'o
2013-04-08 21:32 ` [PATCH 05/29] jbd2: Don't create journal_head for temporary journal buffers Jan Kara
2013-04-12  8:01   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 06/29] jbd2: Remove journal_head from descriptor buffers Jan Kara
2013-04-12  8:10   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 07/29] jbd2: Refine waiting for shadow buffers Jan Kara
2013-05-03 14:16   ` Zheng Liu
2013-05-03 20:44     ` Jan Kara
2013-04-08 21:32 ` [PATCH 08/29] jbd2: Remove outdated comment Jan Kara
2013-05-03 14:20   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 09/29] jbd2: Cleanup needed free block estimates when starting a transaction Jan Kara
2013-05-05  8:17   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 10/29] jbd2: Fix race in t_outstanding_credits update in jbd2_journal_extend() Jan Kara
2013-05-05  8:37   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 11/29] jbd2: Remove unused waitqueues Jan Kara
2013-05-05  8:41   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 12/29] jbd2: Transaction reservation support Jan Kara
2013-05-05  9:39   ` Zheng Liu
2013-05-06 12:49     ` Jan Kara
2013-05-07  5:22       ` Zheng Liu
2013-04-08 21:32 ` [PATCH 13/29] ext4: Provide wrappers for transaction reservation calls Jan Kara
2013-05-05 11:51   ` Zheng Liu
2013-05-05 11:58   ` Zheng Liu
2013-05-06 12:51     ` Jan Kara
2013-04-08 21:32 ` [PATCH 14/29] ext4: Stop messing with nr_to_write in ext4_da_writepages() Jan Kara
2013-05-05 12:40   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 15/29] ext4: Deprecate max_writeback_mb_bump sysfs attribute Jan Kara
2013-05-05 12:47   ` Zheng Liu
2013-05-06 12:55     ` Jan Kara
2013-04-08 21:32 ` [PATCH 16/29] ext4: Improve writepage credit estimate for files with indirect blocks Jan Kara
2013-05-07  5:39   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 17/29] ext4: Better estimate credits needed for ext4_da_writepages() Jan Kara
2013-05-07  6:33   ` Zheng Liu
2013-05-07 14:17     ` Jan Kara
2013-04-08 21:32 ` [PATCH 18/29] ext4: Restructure writeback path Jan Kara
2013-05-08  3:48   ` Zheng Liu
2013-05-08 11:20     ` Jan Kara
2013-04-08 21:32 ` [PATCH 19/29] ext4: Remove buffer_uninit handling Jan Kara
2013-05-08  6:56   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 20/29] ext4: Use transaction reservation for extent conversion in ext4_end_io Jan Kara
2013-05-08  6:57   ` Zheng Liu [this message]
2013-04-08 21:32 ` [PATCH 21/29] ext4: Split extent conversion lists to reserved & unreserved parts Jan Kara
2013-05-08  7:03   ` Zheng Liu
2013-05-08 11:23     ` Jan Kara
2013-05-08 11:49       ` Zheng Liu
2013-04-08 21:32 ` [PATCH 22/29] ext4: Defer clearing of PageWriteback after extent conversion Jan Kara
2013-05-08  7:08   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 23/29] ext4: Protect extent conversion after DIO with i_dio_count Jan Kara
2013-05-08  7:08   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 24/29] ext4: Remove wait for unwritten extent conversion from ext4_ext_truncate() Jan Kara
2013-05-08  7:35   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 25/29] ext4: Use generic_file_fsync() in ext4_file_fsync() in nojournal mode Jan Kara
2013-05-08  7:37   ` Zheng Liu
2013-05-08 11:29     ` Jan Kara
2013-04-08 21:32 ` [PATCH 26/29] ext4: Remove i_mutex from ext4_file_sync() Jan Kara
2013-05-08  7:41   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 27/29] ext4: Remove wait for unwritten extents in ext4_ind_direct_IO() Jan Kara
2013-05-08  7:55   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 28/29] ext4: Don't wait for extent conversion in ext4_ext_punch_hole() Jan Kara
2013-05-08  7:56   ` Zheng Liu
2013-04-08 21:32 ` [PATCH 29/29] ext4: Remove ext4_ioend_wait() Jan Kara
2013-05-08  7:57   ` Zheng Liu
2013-05-08 11:32     ` Jan Kara

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130508065713.GB20599@gmail.com \
    --to=gnehzuil.liu@gmail.com \
    --cc=jack@suse.cz \
    --cc=linux-ext4@vger.kernel.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).