From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from userp1040.oracle.com ([156.151.31.81]:38811 "EHLO userp1040.oracle.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752721AbbFYDQ1 (ORCPT ); Wed, 24 Jun 2015 23:16:27 -0400 Date: Thu, 25 Jun 2015 11:16:16 +0800 From: Liu Bo To: Mark Fasheh Cc: Chris Mason , Josef Bacik , David Sterba , linux-btrfs@vger.kernel.org Subject: Re: [PATCH 2/5] btrfs: fix deadlock with extent-same and readpage Message-ID: <20150625031615.GD23794@localhost.localdomain> Reply-To: bo.li.liu@oracle.com References: <1435094920-22442-1-git-send-email-mfasheh@suse.de> <1435094920-22442-3-git-send-email-mfasheh@suse.de> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii In-Reply-To: <1435094920-22442-3-git-send-email-mfasheh@suse.de> Sender: linux-btrfs-owner@vger.kernel.org List-ID: On Tue, Jun 23, 2015 at 02:28:37PM -0700, Mark Fasheh wrote: > ->readpage() does page_lock() before extent_lock(), we do the opposite in > extent-same. We want to reverse the order in btrfs_extent_same() but it's > not quite straightforward since the page locks are taken inside btrfs_cmp_data(). > > So I split btrfs_cmp_data() into 3 parts with a small context structure that > is passed between them. The first, btrfs_cmp_data_prepare() gathers up the > pages needed (taking page lock as required) and puts them on our context > structure. At this point, we are safe to lock the extent range. Afterwards, > we use btrfs_cmp_data() to do the data compare as usual and btrfs_cmp_data_free() > to clean up our context. Reviewed-by: Liu Bo > > Signed-off-by: Mark Fasheh > Reviewed-by: David Sterba > --- > fs/btrfs/ioctl.c | 148 +++++++++++++++++++++++++++++++++++++++++++------------ > 1 file changed, 117 insertions(+), 31 deletions(-) > > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c > index 2deea1f..b899584 100644 > --- a/fs/btrfs/ioctl.c > +++ b/fs/btrfs/ioctl.c > @@ -2755,14 +2755,11 @@ out: > return ret; > } > > -static struct page *extent_same_get_page(struct inode *inode, u64 off) > +static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) > { > struct page *page; > - pgoff_t index; > struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; > > - index = off >> PAGE_CACHE_SHIFT; > - > page = grab_cache_page(inode->i_mapping, index); > if (!page) > return NULL; > @@ -2783,6 +2780,20 @@ static struct page *extent_same_get_page(struct inode *inode, u64 off) > return page; > } > > +static int gather_extent_pages(struct inode *inode, struct page **pages, > + int num_pages, u64 off) > +{ > + int i; > + pgoff_t index = off >> PAGE_CACHE_SHIFT; > + > + for (i = 0; i < num_pages; i++) { > + pages[i] = extent_same_get_page(inode, index + i); > + if (!pages[i]) > + return -ENOMEM; > + } > + return 0; > +} > + > static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) > { > /* do any pending delalloc/csum calc on src, one way or > @@ -2808,52 +2819,120 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) > } > } > > -static void btrfs_double_unlock(struct inode *inode1, u64 loff1, > - struct inode *inode2, u64 loff2, u64 len) > +static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) > { > - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); > - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); > - > mutex_unlock(&inode1->i_mutex); > mutex_unlock(&inode2->i_mutex); > } > > -static void btrfs_double_lock(struct inode *inode1, u64 loff1, > - struct inode *inode2, u64 loff2, u64 len) > +static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) > +{ > + if (inode1 < inode2) > + swap(inode1, inode2); > + > + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); > + if (inode1 != inode2) > + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); > +} > + > +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, > + struct inode *inode2, u64 loff2, u64 len) > +{ > + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); > + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); > +} > + > +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, > + struct inode *inode2, u64 loff2, u64 len) > { > if (inode1 < inode2) { > swap(inode1, inode2); > swap(loff1, loff2); > } > - > - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); > lock_extent_range(inode1, loff1, len); > - if (inode1 != inode2) { > - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); > + if (inode1 != inode2) > lock_extent_range(inode2, loff2, len); > +} > + > +struct cmp_pages { > + int num_pages; > + struct page **src_pages; > + struct page **dst_pages; > +}; > + > +static void btrfs_cmp_data_free(struct cmp_pages *cmp) > +{ > + int i; > + struct page *pg; > + > + for (i = 0; i < cmp->num_pages; i++) { > + pg = cmp->src_pages[i]; > + if (pg) > + page_cache_release(pg); > + pg = cmp->dst_pages[i]; > + if (pg) > + page_cache_release(pg); > + } > + kfree(cmp->src_pages); > + kfree(cmp->dst_pages); > +} > + > +static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, > + struct inode *dst, u64 dst_loff, > + u64 len, struct cmp_pages *cmp) > +{ > + int ret; > + int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; > + struct page **src_pgarr, **dst_pgarr; > + > + /* > + * We must gather up all the pages before we initiate our > + * extent locking. We use an array for the page pointers. Size > + * of the array is bounded by len, which is in turn bounded by > + * BTRFS_MAX_DEDUPE_LEN. > + */ > + src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); > + dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); > + if (!src_pgarr || !dst_pgarr) { > + kfree(src_pgarr); > + kfree(dst_pgarr); > + return -ENOMEM; > } > + cmp->num_pages = num_pages; > + cmp->src_pages = src_pgarr; > + cmp->dst_pages = dst_pgarr; > + > + ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff); > + if (ret) > + goto out; > + > + ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff); > + > +out: > + if (ret) > + btrfs_cmp_data_free(cmp); > + return 0; > } > > static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, > - u64 dst_loff, u64 len) > + u64 dst_loff, u64 len, struct cmp_pages *cmp) > { > int ret = 0; > + int i; > struct page *src_page, *dst_page; > unsigned int cmp_len = PAGE_CACHE_SIZE; > void *addr, *dst_addr; > > + i = 0; > while (len) { > if (len < PAGE_CACHE_SIZE) > cmp_len = len; > > - src_page = extent_same_get_page(src, loff); > - if (!src_page) > - return -EINVAL; > - dst_page = extent_same_get_page(dst, dst_loff); > - if (!dst_page) { > - page_cache_release(src_page); > - return -EINVAL; > - } > + BUG_ON(i >= cmp->num_pages); > + > + src_page = cmp->src_pages[i]; > + dst_page = cmp->dst_pages[i]; > + > addr = kmap_atomic(src_page); > dst_addr = kmap_atomic(dst_page); > > @@ -2865,15 +2944,12 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, > > kunmap_atomic(addr); > kunmap_atomic(dst_addr); > - page_cache_release(src_page); > - page_cache_release(dst_page); > > if (ret) > break; > > - loff += cmp_len; > - dst_loff += cmp_len; > len -= cmp_len; > + i++; > } > > return ret; > @@ -2904,6 +2980,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, > { > int ret; > u64 len = olen; > + struct cmp_pages cmp; > > /* > * btrfs_clone() can't handle extents in the same file > @@ -2916,7 +2993,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, > if (len == 0) > return 0; > > - btrfs_double_lock(src, loff, dst, dst_loff, len); > + btrfs_double_inode_lock(src, dst); > > ret = extent_same_check_offsets(src, loff, &len, olen); > if (ret) > @@ -2933,13 +3010,22 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, > goto out_unlock; > } > > + ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); > + if (ret) > + goto out_unlock; > + > + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); > + > /* pass original length for comparison so we stay within i_size */ > - ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen); > + ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp); > if (ret == 0) > ret = btrfs_clone(src, dst, loff, olen, len, dst_loff); > > + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); > + > + btrfs_cmp_data_free(&cmp); > out_unlock: > - btrfs_double_unlock(src, loff, dst, dst_loff, len); > + btrfs_double_inode_unlock(src, dst); > > return ret; > } > -- > 2.1.2 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html