* [PATCH] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork
@ 2022-07-13 7:30 Chandan Babu R
2022-07-13 10:45 ` [PATCH V2] " Chandan Babu R
0 siblings, 1 reply; 5+ messages in thread
From: Chandan Babu R @ 2022-07-13 7:30 UTC (permalink / raw)
To: linux-xfs; +Cc: Chandan Babu R, Wengang Wang
On a higly fragmented filesystem a Direct IO write can fail with -ENOSPC error
even though the filesystem has sufficient number of free blocks.
This occurs if the file offset range on which the write operation is being
performed has a delalloc extent in the cow fork and this delalloc extent
begins much before the Direct IO range.
In such a scenario, xfs_reflink_allocate_cow() invokes xfs_bmapi_write() to
allocate the blocks mapped by the delalloc extent. The extent thus allocated
may not cover the beginning of file offset range on which the Direct IO write
was issued. Hence xfs_reflink_allocate_cow() ends up returning -ENOSPC.
The following script reliably recreates the bug described above.
#!/usr/bin/bash
device=/dev/loop0
shortdev=$(basename $device)
mntpnt=/mnt/
file1=${mntpnt}/file1
file2=${mntpnt}/file2
fragmentedfile=${mntpnt}/fragmentedfile
punchprog=/root/repos/xfstests-dev/src/punch-alternating
errortag=/sys/fs/xfs/${shortdev}/errortag/bmap_alloc_minlen_extent
umount $device > /dev/null 2>&1
echo "Create FS"
mkfs.xfs -f -m reflink=1 $device > /dev/null 2>&1
if [[ $? != 0 ]]; then
echo "mkfs failed."
exit 1
fi
echo "Mount FS"
mount $device $mntpnt > /dev/null 2>&1
if [[ $? != 0 ]]; then
echo "mount failed."
exit 1
fi
echo "Create source file"
xfs_io -f -c "pwrite 0 32M" $file1 > /dev/null 2>&1
sync
echo "Create Reflinked file"
xfs_io -f -c "reflink $file1" $file2 &>/dev/null
echo "Set cowextsize"
xfs_io -c "cowextsize 16M" $file1 > /dev/null 2>&1
echo "Fragment FS"
xfs_io -f -c "pwrite 0 64M" $fragmentedfile > /dev/null 2>&1
sync
$punchprog $fragmentedfile
echo "Allocate block sized extent from now onwards"
echo -n 1 > $errortag
echo "Create 16MiB delalloc extent in CoW fork"
xfs_io -c "pwrite 0 4k" $file1 > /dev/null 2>&1
sync
echo "Direct I/O write at offset 12k"
xfs_io -d -c "pwrite 12k 8k" $file1
This commit fixes the bug by invoking xfs_bmapi_write() in a loop until disk
blocks are allocated for atleast the starting file offset of the Direct IO
write range.
Root-caused-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
---
fs/xfs/xfs_reflink.c | 225 +++++++++++++++++++++++++++++++++++--------
1 file changed, 187 insertions(+), 38 deletions(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e7a7c00d93be..ab7a39244920 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -340,9 +340,41 @@ xfs_find_trim_cow_extent(
return 0;
}
-/* Allocate all CoW reservations covering a range of blocks in a file. */
-int
-xfs_reflink_allocate_cow(
+static int
+xfs_reflink_convert_unwritten(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool convert_now)
+{
+ xfs_fileoff_t offset_fsb = imap->br_startoff;
+ xfs_filblks_t count_fsb = imap->br_blockcount;
+ int error;
+
+ /*
+ * cmap might larger than imap due to cowextsize hint.
+ */
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
+
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || cmap->br_state == XFS_EXT_NORM)
+ return 0;
+
+ trace_xfs_reflink_convert_cow(ip, cmap);
+
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ if (!error)
+ cmap->br_state = XFS_EXT_NORM;
+
+ return error;
+}
+
+static int
+xfs_reflink_alloc_cow_unwritten(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
struct xfs_bmbt_irec *cmap,
@@ -351,33 +383,17 @@ xfs_reflink_allocate_cow(
bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = imap->br_startoff;
- xfs_filblks_t count_fsb = imap->br_blockcount;
struct xfs_trans *tp;
- int nimaps, error = 0;
- bool found;
xfs_filblks_t resaligned;
- xfs_extlen_t resblks = 0;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (!ip->i_cowfp) {
- ASSERT(!xfs_is_reflink_inode(ip));
- xfs_ifork_init_cow(ip);
- }
-
- error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
- if (error || !*shared)
- return error;
- if (found)
- goto convert;
+ xfs_extlen_t resblks;
+ int nimaps;
+ int error;
+ bool found;
resaligned = xfs_aligned_fsb_count(imap->br_startoff,
imap->br_blockcount, xfs_get_cowextsz_hint(ip));
resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
- xfs_iunlock(ip, *lockmode);
- *lockmode = 0;
-
error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
false, &tp);
if (error)
@@ -385,17 +401,17 @@ xfs_reflink_allocate_cow(
*lockmode = XFS_ILOCK_EXCL;
- /*
- * Check for an overlapping extent again now that we dropped the ilock.
- */
error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
if (error || !*shared)
goto out_trans_cancel;
+
if (found) {
xfs_trans_cancel(tp);
goto convert;
}
+ ASSERT(cmap->br_startoff > imap->br_startoff);
+
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
@@ -415,19 +431,9 @@ xfs_reflink_allocate_cow(
*/
if (nimaps == 0)
return -ENOSPC;
+
convert:
- xfs_trim_extent(cmap, offset_fsb, count_fsb);
- /*
- * COW fork extents are supposed to remain unwritten until we're ready
- * to initiate a disk write. For direct I/O we are going to write the
- * data and need the conversion, but for buffered writes we're done.
- */
- if (!convert_now || cmap->br_state == XFS_EXT_NORM)
- return 0;
- trace_xfs_reflink_convert_cow(ip, cmap);
- error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
- if (!error)
- cmap->br_state = XFS_EXT_NORM;
+ error = xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
return error;
out_trans_cancel:
@@ -435,6 +441,149 @@ xfs_reflink_allocate_cow(
return error;
}
+static int
+xfs_replace_delalloc_cow_extent(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int nimaps;
+ int error;
+ bool found;
+
+ while (1) {
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
+ false, &tp);
+ if (error)
+ return error;
+
+ *lockmode = XFS_ILOCK_EXCL;
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
+ &found);
+ if (error || !*shared)
+ goto out_trans_cancel;
+
+ if (found) {
+ xfs_trans_cancel(tp);
+ goto convert;
+ }
+
+ ASSERT(isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK);
+
+ /*
+ * Replace delalloc reservation with an unwritten extent.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
+ cmap->br_blockcount,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
+ &nimaps);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_inode_set_cowblocks_tag(ip);
+ error = xfs_trans_commit(tp);
+ if (error)
+ return error;
+
+ /*
+ * Allocation succeeded but the requested range was not even partially
+ * satisfied? Bail out!
+ */
+ if (nimaps == 0)
+ return -ENOSPC;
+
+ if (cmap->br_startoff + cmap->br_blockcount > imap->br_startoff)
+ break;
+
+ xfs_iunlock(ip, *lockmode);
+ *lockmode = 0;
+ }
+
+convert:
+ error = xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+
+/* Allocate all CoW reservations covering a range of blocks in a file. */
+int
+xfs_reflink_allocate_cow(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ int error;
+ bool found;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
+ if (error || !*shared)
+ return error;
+
+ /*
+ * We have to deal with one of the following 2 cases,
+ * 1. No extent in CoW fork and shared extent in Data fork.
+ * 2. CoW fork has one of the following:
+ * - Unwritten/written extent in CoW fork.
+ * - Delalloc extent in CoW fork; An extent may or may not be present
+ * in the Data fork.
+ */
+
+ if (found) {
+ /*
+ * CoW fork has a real extent; Convert it to written if is an
+ * unwritten extent.
+ */
+ error = xfs_reflink_convert_unwritten(ip, imap, cmap,
+ convert_now);
+ return error;
+ }
+
+ xfs_iunlock(ip, *lockmode);
+ *lockmode = 0;
+
+ if (cmap->br_startoff > imap->br_startoff) {
+ /*
+ * CoW fork does not have an extent. Hence, allocate a real
+ * extent in the CoW fork.
+ */
+ error = xfs_reflink_alloc_cow_unwritten(ip, imap, cmap, shared,
+ lockmode, convert_now);
+ } else if (isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK) {
+ /*
+ * CoW fork has a delalloc extent. Replace it with a real
+ * extent.
+ */
+ error = xfs_replace_delalloc_cow_extent(ip, imap, cmap, shared,
+ lockmode, convert_now);
+ } else {
+ ASSERT(0);
+ }
+
+ return error;
+}
+
/*
* Cancel CoW reservations for some block range of an inode.
*
--
2.35.1
^ permalink raw reply related [flat|nested] 5+ messages in thread* [PATCH V2] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork 2022-07-13 7:30 [PATCH] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork Chandan Babu R @ 2022-07-13 10:45 ` Chandan Babu R 2022-08-04 17:35 ` Darrick J. Wong 0 siblings, 1 reply; 5+ messages in thread From: Chandan Babu R @ 2022-07-13 10:45 UTC (permalink / raw) To: linux-xfs; +Cc: Chandan Babu R, Wengang Wang On a higly fragmented filesystem a Direct IO write can fail with -ENOSPC error even though the filesystem has sufficient number of free blocks. This occurs if the file offset range on which the write operation is being performed has a delalloc extent in the cow fork and this delalloc extent begins much before the Direct IO range. In such a scenario, xfs_reflink_allocate_cow() invokes xfs_bmapi_write() to allocate the blocks mapped by the delalloc extent. The extent thus allocated may not cover the beginning of file offset range on which the Direct IO write was issued. Hence xfs_reflink_allocate_cow() ends up returning -ENOSPC. The following script reliably recreates the bug described above. #!/usr/bin/bash device=/dev/loop0 shortdev=$(basename $device) mntpnt=/mnt/ file1=${mntpnt}/file1 file2=${mntpnt}/file2 fragmentedfile=${mntpnt}/fragmentedfile punchprog=/root/repos/xfstests-dev/src/punch-alternating errortag=/sys/fs/xfs/${shortdev}/errortag/bmap_alloc_minlen_extent umount $device > /dev/null 2>&1 echo "Create FS" mkfs.xfs -f -m reflink=1 $device > /dev/null 2>&1 if [[ $? != 0 ]]; then echo "mkfs failed." exit 1 fi echo "Mount FS" mount $device $mntpnt > /dev/null 2>&1 if [[ $? != 0 ]]; then echo "mount failed." exit 1 fi echo "Create source file" xfs_io -f -c "pwrite 0 32M" $file1 > /dev/null 2>&1 sync echo "Create Reflinked file" xfs_io -f -c "reflink $file1" $file2 &>/dev/null echo "Set cowextsize" xfs_io -c "cowextsize 16M" $file1 > /dev/null 2>&1 echo "Fragment FS" xfs_io -f -c "pwrite 0 64M" $fragmentedfile > /dev/null 2>&1 sync $punchprog $fragmentedfile echo "Allocate block sized extent from now onwards" echo -n 1 > $errortag echo "Create 16MiB delalloc extent in CoW fork" xfs_io -c "pwrite 0 4k" $file1 > /dev/null 2>&1 sync echo "Direct I/O write at offset 12k" xfs_io -d -c "pwrite 12k 8k" $file1 This commit fixes the bug by invoking xfs_bmapi_write() in a loop until disk blocks are allocated for atleast the starting file offset of the Direct IO write range. Fixes: 3c68d44a2b49 ("xfs: allocate direct I/O COW blocks in iomap_begin") Reported-and-Root-caused-by: Wengang Wang <wen.gang.wang@oracle.com> Signed-off-by: Chandan Babu R <chandan.babu@oracle.com> --- Changelog: V1 -> V2: 1. Add Fixes tag. fs/xfs/xfs_reflink.c | 225 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 187 insertions(+), 38 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e7a7c00d93be..ab7a39244920 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -340,9 +340,41 @@ xfs_find_trim_cow_extent( return 0; } -/* Allocate all CoW reservations covering a range of blocks in a file. */ -int -xfs_reflink_allocate_cow( +static int +xfs_reflink_convert_unwritten( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, + bool convert_now) +{ + xfs_fileoff_t offset_fsb = imap->br_startoff; + xfs_filblks_t count_fsb = imap->br_blockcount; + int error; + + /* + * cmap might larger than imap due to cowextsize hint. + */ + xfs_trim_extent(cmap, offset_fsb, count_fsb); + + /* + * COW fork extents are supposed to remain unwritten until we're ready + * to initiate a disk write. For direct I/O we are going to write the + * data and need the conversion, but for buffered writes we're done. + */ + if (!convert_now || cmap->br_state == XFS_EXT_NORM) + return 0; + + trace_xfs_reflink_convert_cow(ip, cmap); + + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + if (!error) + cmap->br_state = XFS_EXT_NORM; + + return error; +} + +static int +xfs_reflink_alloc_cow_unwritten( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, struct xfs_bmbt_irec *cmap, @@ -351,33 +383,17 @@ xfs_reflink_allocate_cow( bool convert_now) { struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb = imap->br_startoff; - xfs_filblks_t count_fsb = imap->br_blockcount; struct xfs_trans *tp; - int nimaps, error = 0; - bool found; xfs_filblks_t resaligned; - xfs_extlen_t resblks = 0; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (!ip->i_cowfp) { - ASSERT(!xfs_is_reflink_inode(ip)); - xfs_ifork_init_cow(ip); - } - - error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); - if (error || !*shared) - return error; - if (found) - goto convert; + xfs_extlen_t resblks; + int nimaps; + int error; + bool found; resaligned = xfs_aligned_fsb_count(imap->br_startoff, imap->br_blockcount, xfs_get_cowextsz_hint(ip)); resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - xfs_iunlock(ip, *lockmode); - *lockmode = 0; - error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, false, &tp); if (error) @@ -385,17 +401,17 @@ xfs_reflink_allocate_cow( *lockmode = XFS_ILOCK_EXCL; - /* - * Check for an overlapping extent again now that we dropped the ilock. - */ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); if (error || !*shared) goto out_trans_cancel; + if (found) { xfs_trans_cancel(tp); goto convert; } + ASSERT(cmap->br_startoff > imap->br_startoff); + /* Allocate the entire reservation as unwritten blocks. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, @@ -415,19 +431,9 @@ xfs_reflink_allocate_cow( */ if (nimaps == 0) return -ENOSPC; + convert: - xfs_trim_extent(cmap, offset_fsb, count_fsb); - /* - * COW fork extents are supposed to remain unwritten until we're ready - * to initiate a disk write. For direct I/O we are going to write the - * data and need the conversion, but for buffered writes we're done. - */ - if (!convert_now || cmap->br_state == XFS_EXT_NORM) - return 0; - trace_xfs_reflink_convert_cow(ip, cmap); - error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); - if (!error) - cmap->br_state = XFS_EXT_NORM; + error = xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); return error; out_trans_cancel: @@ -435,6 +441,149 @@ xfs_reflink_allocate_cow( return error; } +static int +xfs_replace_delalloc_cow_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, + bool *shared, + uint *lockmode, + bool convert_now) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int nimaps; + int error; + bool found; + + while (1) { + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0, + false, &tp); + if (error) + return error; + + *lockmode = XFS_ILOCK_EXCL; + + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, + &found); + if (error || !*shared) + goto out_trans_cancel; + + if (found) { + xfs_trans_cancel(tp); + goto convert; + } + + ASSERT(isnullstartblock(cmap->br_startblock) || + cmap->br_startblock == DELAYSTARTBLOCK); + + /* + * Replace delalloc reservation with an unwritten extent. + */ + nimaps = 1; + error = xfs_bmapi_write(tp, ip, cmap->br_startoff, + cmap->br_blockcount, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap, + &nimaps); + if (error) + goto out_trans_cancel; + + xfs_inode_set_cowblocks_tag(ip); + error = xfs_trans_commit(tp); + if (error) + return error; + + /* + * Allocation succeeded but the requested range was not even partially + * satisfied? Bail out! + */ + if (nimaps == 0) + return -ENOSPC; + + if (cmap->br_startoff + cmap->br_blockcount > imap->br_startoff) + break; + + xfs_iunlock(ip, *lockmode); + *lockmode = 0; + } + +convert: + error = xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); + return error; + +out_trans_cancel: + xfs_trans_cancel(tp); + return error; +} + + +/* Allocate all CoW reservations covering a range of blocks in a file. */ +int +xfs_reflink_allocate_cow( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, + bool *shared, + uint *lockmode, + bool convert_now) +{ + int error; + bool found; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); + if (error || !*shared) + return error; + + /* + * We have to deal with one of the following 2 cases, + * 1. No extent in CoW fork and shared extent in Data fork. + * 2. CoW fork has one of the following: + * - Unwritten/written extent in CoW fork. + * - Delalloc extent in CoW fork; An extent may or may not be present + * in the Data fork. + */ + + if (found) { + /* + * CoW fork has a real extent; Convert it to written if is an + * unwritten extent. + */ + error = xfs_reflink_convert_unwritten(ip, imap, cmap, + convert_now); + return error; + } + + xfs_iunlock(ip, *lockmode); + *lockmode = 0; + + if (cmap->br_startoff > imap->br_startoff) { + /* + * CoW fork does not have an extent. Hence, allocate a real + * extent in the CoW fork. + */ + error = xfs_reflink_alloc_cow_unwritten(ip, imap, cmap, shared, + lockmode, convert_now); + } else if (isnullstartblock(cmap->br_startblock) || + cmap->br_startblock == DELAYSTARTBLOCK) { + /* + * CoW fork has a delalloc extent. Replace it with a real + * extent. + */ + error = xfs_replace_delalloc_cow_extent(ip, imap, cmap, shared, + lockmode, convert_now); + } else { + ASSERT(0); + } + + return error; +} + /* * Cancel CoW reservations for some block range of an inode. * -- 2.35.1 ^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH V2] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork 2022-07-13 10:45 ` [PATCH V2] " Chandan Babu R @ 2022-08-04 17:35 ` Darrick J. Wong 2022-08-05 8:39 ` Chandan Babu R 0 siblings, 1 reply; 5+ messages in thread From: Darrick J. Wong @ 2022-08-04 17:35 UTC (permalink / raw) To: Chandan Babu R; +Cc: linux-xfs, Wengang Wang On Wed, Jul 13, 2022 at 04:15:51PM +0530, Chandan Babu R wrote: > On a higly fragmented filesystem a Direct IO write can fail with -ENOSPC error > even though the filesystem has sufficient number of free blocks. Oops, I totally didn't notice this patch. Sorry about that. :( > This occurs if the file offset range on which the write operation is being > performed has a delalloc extent in the cow fork and this delalloc extent > begins much before the Direct IO range. > > In such a scenario, xfs_reflink_allocate_cow() invokes xfs_bmapi_write() to > allocate the blocks mapped by the delalloc extent. The extent thus allocated > may not cover the beginning of file offset range on which the Direct IO write > was issued. Hence xfs_reflink_allocate_cow() ends up returning -ENOSPC. > > The following script reliably recreates the bug described above. > > #!/usr/bin/bash > > device=/dev/loop0 > shortdev=$(basename $device) > > mntpnt=/mnt/ > file1=${mntpnt}/file1 > file2=${mntpnt}/file2 > fragmentedfile=${mntpnt}/fragmentedfile > punchprog=/root/repos/xfstests-dev/src/punch-alternating > > errortag=/sys/fs/xfs/${shortdev}/errortag/bmap_alloc_minlen_extent > > umount $device > /dev/null 2>&1 > > echo "Create FS" > mkfs.xfs -f -m reflink=1 $device > /dev/null 2>&1 > if [[ $? != 0 ]]; then > echo "mkfs failed." > exit 1 > fi > > echo "Mount FS" > mount $device $mntpnt > /dev/null 2>&1 > if [[ $? != 0 ]]; then > echo "mount failed." > exit 1 > fi > > echo "Create source file" > xfs_io -f -c "pwrite 0 32M" $file1 > /dev/null 2>&1 > > sync > > echo "Create Reflinked file" > xfs_io -f -c "reflink $file1" $file2 &>/dev/null > > echo "Set cowextsize" > xfs_io -c "cowextsize 16M" $file1 > /dev/null 2>&1 > > echo "Fragment FS" > xfs_io -f -c "pwrite 0 64M" $fragmentedfile > /dev/null 2>&1 > sync > $punchprog $fragmentedfile > > echo "Allocate block sized extent from now onwards" > echo -n 1 > $errortag > > echo "Create 16MiB delalloc extent in CoW fork" > xfs_io -c "pwrite 0 4k" $file1 > /dev/null 2>&1 > > sync > > echo "Direct I/O write at offset 12k" > xfs_io -d -c "pwrite 12k 8k" $file1 > > This commit fixes the bug by invoking xfs_bmapi_write() in a loop until disk > blocks are allocated for atleast the starting file offset of the Direct IO > write range. > > Fixes: 3c68d44a2b49 ("xfs: allocate direct I/O COW blocks in iomap_begin") > Reported-and-Root-caused-by: Wengang Wang <wen.gang.wang@oracle.com> > Signed-off-by: Chandan Babu R <chandan.babu@oracle.com> > --- > Changelog: > V1 -> V2: > 1. Add Fixes tag. > > fs/xfs/xfs_reflink.c | 225 +++++++++++++++++++++++++++++++++++-------- > 1 file changed, 187 insertions(+), 38 deletions(-) > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c > index e7a7c00d93be..ab7a39244920 100644 > --- a/fs/xfs/xfs_reflink.c > +++ b/fs/xfs/xfs_reflink.c > @@ -340,9 +340,41 @@ xfs_find_trim_cow_extent( > return 0; > } > > -/* Allocate all CoW reservations covering a range of blocks in a file. */ > -int > -xfs_reflink_allocate_cow( > +static int > +xfs_reflink_convert_unwritten( > + struct xfs_inode *ip, > + struct xfs_bmbt_irec *imap, > + struct xfs_bmbt_irec *cmap, > + bool convert_now) > +{ > + xfs_fileoff_t offset_fsb = imap->br_startoff; > + xfs_filblks_t count_fsb = imap->br_blockcount; > + int error; > + > + /* > + * cmap might larger than imap due to cowextsize hint. > + */ > + xfs_trim_extent(cmap, offset_fsb, count_fsb); > + > + /* > + * COW fork extents are supposed to remain unwritten until we're ready > + * to initiate a disk write. For direct I/O we are going to write the > + * data and need the conversion, but for buffered writes we're done. > + */ > + if (!convert_now || cmap->br_state == XFS_EXT_NORM) > + return 0; > + > + trace_xfs_reflink_convert_cow(ip, cmap); > + > + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); > + if (!error) > + cmap->br_state = XFS_EXT_NORM; > + > + return error; > +} > + > +static int > +xfs_reflink_alloc_cow_unwritten( Hmm. If @convert_now is set, then upon successful return, cmap is a written extent, right? I think a better name for this function is xfs_reflink_fill_cow_hole, since that's what it's doing. > struct xfs_inode *ip, > struct xfs_bmbt_irec *imap, > struct xfs_bmbt_irec *cmap, > @@ -351,33 +383,17 @@ xfs_reflink_allocate_cow( > bool convert_now) > { > struct xfs_mount *mp = ip->i_mount; > - xfs_fileoff_t offset_fsb = imap->br_startoff; > - xfs_filblks_t count_fsb = imap->br_blockcount; > struct xfs_trans *tp; > - int nimaps, error = 0; > - bool found; > xfs_filblks_t resaligned; > - xfs_extlen_t resblks = 0; > - > - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); > - if (!ip->i_cowfp) { > - ASSERT(!xfs_is_reflink_inode(ip)); > - xfs_ifork_init_cow(ip); > - } > - > - error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); > - if (error || !*shared) > - return error; > - if (found) > - goto convert; > + xfs_extlen_t resblks; > + int nimaps; > + int error; > + bool found; > > resaligned = xfs_aligned_fsb_count(imap->br_startoff, > imap->br_blockcount, xfs_get_cowextsz_hint(ip)); > resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); > > - xfs_iunlock(ip, *lockmode); > - *lockmode = 0; Hmmm. This piece effectively moves to this function's caller, which means that the parent unlocks the inode and the child maybe relocks it. The ILOCK handling in this whole call chain is already confusing enough as we pass *lockmode around to avoid relocking ILOCK for an xfs_trans_alloc* error return; could we try to contain lock state cycling to single functions, please? IOWs, leave this hunk here and don't add it to xfs_reflink_allocate_cow. I'll have more to say about this in the delalloc conversion function below. > - > error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, > false, &tp); > if (error) > @@ -385,17 +401,17 @@ xfs_reflink_allocate_cow( > > *lockmode = XFS_ILOCK_EXCL; > > - /* > - * Check for an overlapping extent again now that we dropped the ilock. > - */ > error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); > if (error || !*shared) > goto out_trans_cancel; > + > if (found) { > xfs_trans_cancel(tp); > goto convert; > } > > + ASSERT(cmap->br_startoff > imap->br_startoff); > + > /* Allocate the entire reservation as unwritten blocks. */ > nimaps = 1; > error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, > @@ -415,19 +431,9 @@ xfs_reflink_allocate_cow( > */ > if (nimaps == 0) > return -ENOSPC; > + > convert: > - xfs_trim_extent(cmap, offset_fsb, count_fsb); > - /* > - * COW fork extents are supposed to remain unwritten until we're ready > - * to initiate a disk write. For direct I/O we are going to write the > - * data and need the conversion, but for buffered writes we're done. > - */ > - if (!convert_now || cmap->br_state == XFS_EXT_NORM) > - return 0; > - trace_xfs_reflink_convert_cow(ip, cmap); > - error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); > - if (!error) > - cmap->br_state = XFS_EXT_NORM; > + error = xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); > return error; Nit: "return xfs_reflink_convert_unwritten(...);" > > out_trans_cancel: > @@ -435,6 +441,149 @@ xfs_reflink_allocate_cow( > return error; > } > > +static int > +xfs_replace_delalloc_cow_extent( This is a rather long name, which would get even longer if it were namespaced "xfs_reflink_*" like the rest of the functions in this file. How about xfs_reflink_fill_delalloc? > + struct xfs_inode *ip, > + struct xfs_bmbt_irec *imap, > + struct xfs_bmbt_irec *cmap, > + bool *shared, > + uint *lockmode, > + bool convert_now) > +{ > + struct xfs_mount *mp = ip->i_mount; > + struct xfs_trans *tp; > + int nimaps; > + int error; > + bool found; > + > + while (1) { > + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0, > + false, &tp); > + if (error) > + return error; > + > + *lockmode = XFS_ILOCK_EXCL; > + > + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, > + &found); > + if (error || !*shared) > + goto out_trans_cancel; > + > + if (found) { > + xfs_trans_cancel(tp); > + goto convert; Question: If @found is true here, do we need to check that cmap covers at least one block of imap? I /think/ the answer is "no", because @cmap will be set to whatever's in the cow fork at imap->br_startoff, and if it's a real extent (written or not) then there's no delalloc reservation to fill and we can move on to the next step. Also: "break" instead of a goto? > + } > + > + ASSERT(isnullstartblock(cmap->br_startblock) || > + cmap->br_startblock == DELAYSTARTBLOCK); > + > + /* > + * Replace delalloc reservation with an unwritten extent. > + */ > + nimaps = 1; > + error = xfs_bmapi_write(tp, ip, cmap->br_startoff, > + cmap->br_blockcount, > + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap, > + &nimaps); Indenting here ^^^^ (two spaces for a continuation, please.) > + if (error) > + goto out_trans_cancel; > + > + xfs_inode_set_cowblocks_tag(ip); > + error = xfs_trans_commit(tp); > + if (error) > + return error; > + > + /* > + * Allocation succeeded but the requested range was not even partially > + * satisfied? Bail out! > + */ > + if (nimaps == 0) > + return -ENOSPC; > + > + if (cmap->br_startoff + cmap->br_blockcount > imap->br_startoff) > + break; > + > + xfs_iunlock(ip, *lockmode); > + *lockmode = 0; > + } If the iunlock in xfs_reflink_allocate_cow were eliminated, then this whole loop could turn into: do { xfs_iunlock(...); xfs_trans_alloc_inode(...); /* stuff */ } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff); > + > +convert: > + error = xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); > + return error; return xfs_reflink_convert_unwritten(...); > + > +out_trans_cancel: > + xfs_trans_cancel(tp); > + return error; > +} > + > + > +/* Allocate all CoW reservations covering a range of blocks in a file. */ > +int > +xfs_reflink_allocate_cow( > + struct xfs_inode *ip, > + struct xfs_bmbt_irec *imap, > + struct xfs_bmbt_irec *cmap, > + bool *shared, > + uint *lockmode, > + bool convert_now) > +{ > + int error; > + bool found; > + > + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); > + if (!ip->i_cowfp) { > + ASSERT(!xfs_is_reflink_inode(ip)); > + xfs_ifork_init_cow(ip); > + } > + > + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); > + if (error || !*shared) > + return error; > + > + /* > + * We have to deal with one of the following 2 cases, > + * 1. No extent in CoW fork and shared extent in Data fork. > + * 2. CoW fork has one of the following: > + * - Unwritten/written extent in CoW fork. > + * - Delalloc extent in CoW fork; An extent may or may not be present > + * in the Data fork. > + */ I think this comment is a bit redundant with the hunks below it. > + > + if (found) { > + /* > + * CoW fork has a real extent; Convert it to written if is an > + * unwritten extent. > + */ > + error = xfs_reflink_convert_unwritten(ip, imap, cmap, > + convert_now); > + return error; > + } > + > + xfs_iunlock(ip, *lockmode); > + *lockmode = 0; Move this to xfs_reflink_alloc_cow_unwritten and xfs_replace_delalloc_cow_extent so that we're not unlocking in one function and relocking in another. > + if (cmap->br_startoff > imap->br_startoff) { > + /* > + * CoW fork does not have an extent. Hence, allocate a real > + * extent in the CoW fork. > + */ > + error = xfs_reflink_alloc_cow_unwritten(ip, imap, cmap, shared, > + lockmode, convert_now); > + } else if (isnullstartblock(cmap->br_startblock) || > + cmap->br_startblock == DELAYSTARTBLOCK) { > + /* > + * CoW fork has a delalloc extent. Replace it with a real > + * extent. > + */ > + error = xfs_replace_delalloc_cow_extent(ip, imap, cmap, shared, > + lockmode, convert_now); Also, this would be a bit easier to read if each if case was its own: if (foo) { return XXX; } if (bar) { return YYY; } > + } else { > + ASSERT(0); > + } > + > + return error; > +} > + > /* > * Cancel CoW reservations for some block range of an inode. > * > -- > 2.35.1 > How about this for a v2 patch? --D From: Chandan Babu R <chandan.babu@oracle.com> Subject: [PATCH] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork On a higly fragmented filesystem a Direct IO write can fail with -ENOSPC error even though the filesystem has sufficient number of free blocks. This occurs if the file offset range on which the write operation is being performed has a delalloc extent in the cow fork and this delalloc extent begins much before the Direct IO range. In such a scenario, xfs_reflink_allocate_cow() invokes xfs_bmapi_write() to allocate the blocks mapped by the delalloc extent. The extent thus allocated may not cover the beginning of file offset range on which the Direct IO write was issued. Hence xfs_reflink_allocate_cow() ends up returning -ENOSPC. The following script reliably recreates the bug described above. #!/usr/bin/bash device=/dev/loop0 shortdev=$(basename $device) mntpnt=/mnt/ file1=${mntpnt}/file1 file2=${mntpnt}/file2 fragmentedfile=${mntpnt}/fragmentedfile punchprog=/root/repos/xfstests-dev/src/punch-alternating errortag=/sys/fs/xfs/${shortdev}/errortag/bmap_alloc_minlen_extent umount $device > /dev/null 2>&1 echo "Create FS" mkfs.xfs -f -m reflink=1 $device > /dev/null 2>&1 if [[ $? != 0 ]]; then echo "mkfs failed." exit 1 fi echo "Mount FS" mount $device $mntpnt > /dev/null 2>&1 if [[ $? != 0 ]]; then echo "mount failed." exit 1 fi echo "Create source file" xfs_io -f -c "pwrite 0 32M" $file1 > /dev/null 2>&1 sync echo "Create Reflinked file" xfs_io -f -c "reflink $file1" $file2 &>/dev/null echo "Set cowextsize" xfs_io -c "cowextsize 16M" $file1 > /dev/null 2>&1 echo "Fragment FS" xfs_io -f -c "pwrite 0 64M" $fragmentedfile > /dev/null 2>&1 sync $punchprog $fragmentedfile echo "Allocate block sized extent from now onwards" echo -n 1 > $errortag echo "Create 16MiB delalloc extent in CoW fork" xfs_io -c "pwrite 0 4k" $file1 > /dev/null 2>&1 sync echo "Direct I/O write at offset 12k" xfs_io -d -c "pwrite 12k 8k" $file1 This commit fixes the bug by invoking xfs_bmapi_write() in a loop until disk blocks are allocated for atleast the starting file offset of the Direct IO write range. Fixes: 3c68d44a2b49 ("xfs: allocate direct I/O COW blocks in iomap_begin") Reported-and-Root-caused-by: Wengang Wang <wen.gang.wang@oracle.com> Signed-off-by: Chandan Babu R <chandan.babu@oracle.com> --- fs/xfs/xfs_reflink.c | 201 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 166 insertions(+), 35 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 724806c7ce3e..b310cbaebe76 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -341,9 +341,41 @@ xfs_find_trim_cow_extent( return 0; } -/* Allocate all CoW reservations covering a range of blocks in a file. */ -int -xfs_reflink_allocate_cow( +static int +xfs_reflink_convert_unwritten( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, + bool convert_now) +{ + xfs_fileoff_t offset_fsb = imap->br_startoff; + xfs_filblks_t count_fsb = imap->br_blockcount; + int error; + + /* + * cmap might larger than imap due to cowextsize hint. + */ + xfs_trim_extent(cmap, offset_fsb, count_fsb); + + /* + * COW fork extents are supposed to remain unwritten until we're ready + * to initiate a disk write. For direct I/O we are going to write the + * data and need the conversion, but for buffered writes we're done. + */ + if (!convert_now || cmap->br_state == XFS_EXT_NORM) + return 0; + + trace_xfs_reflink_convert_cow(ip, cmap); + + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + if (!error) + cmap->br_state = XFS_EXT_NORM; + + return error; +} + +static int +xfs_reflink_fill_cow_hole( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, struct xfs_bmbt_irec *cmap, @@ -352,25 +384,12 @@ xfs_reflink_allocate_cow( bool convert_now) { struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb = imap->br_startoff; - xfs_filblks_t count_fsb = imap->br_blockcount; struct xfs_trans *tp; - int nimaps, error = 0; - bool found; xfs_filblks_t resaligned; - xfs_extlen_t resblks = 0; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (!ip->i_cowfp) { - ASSERT(!xfs_is_reflink_inode(ip)); - xfs_ifork_init_cow(ip); - } - - error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); - if (error || !*shared) - return error; - if (found) - goto convert; + xfs_extlen_t resblks; + int nimaps; + int error; + bool found; resaligned = xfs_aligned_fsb_count(imap->br_startoff, imap->br_blockcount, xfs_get_cowextsz_hint(ip)); @@ -386,17 +405,17 @@ xfs_reflink_allocate_cow( *lockmode = XFS_ILOCK_EXCL; - /* - * Check for an overlapping extent again now that we dropped the ilock. - */ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); if (error || !*shared) goto out_trans_cancel; + if (found) { xfs_trans_cancel(tp); goto convert; } + ASSERT(cmap->br_startoff > imap->br_startoff); + /* Allocate the entire reservation as unwritten blocks. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, @@ -416,26 +435,138 @@ xfs_reflink_allocate_cow( */ if (nimaps == 0) return -ENOSPC; + convert: - xfs_trim_extent(cmap, offset_fsb, count_fsb); - /* - * COW fork extents are supposed to remain unwritten until we're ready - * to initiate a disk write. For direct I/O we are going to write the - * data and need the conversion, but for buffered writes we're done. - */ - if (!convert_now || cmap->br_state == XFS_EXT_NORM) - return 0; - trace_xfs_reflink_convert_cow(ip, cmap); - error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); - if (!error) - cmap->br_state = XFS_EXT_NORM; + return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); + +out_trans_cancel: + xfs_trans_cancel(tp); return error; +} + +static int +xfs_reflink_fill_delalloc( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, + bool *shared, + uint *lockmode, + bool convert_now) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int nimaps; + int error; + bool found; + + do { + xfs_iunlock(ip, *lockmode); + *lockmode = 0; + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0, + false, &tp); + if (error) + return error; + + *lockmode = XFS_ILOCK_EXCL; + + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, + &found); + if (error || !*shared) + goto out_trans_cancel; + + if (found) { + xfs_trans_cancel(tp); + break; + } + + ASSERT(isnullstartblock(cmap->br_startblock) || + cmap->br_startblock == DELAYSTARTBLOCK); + + /* + * Replace delalloc reservation with an unwritten extent. + */ + nimaps = 1; + error = xfs_bmapi_write(tp, ip, cmap->br_startoff, + cmap->br_blockcount, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, + cmap, &nimaps); + if (error) + goto out_trans_cancel; + + xfs_inode_set_cowblocks_tag(ip); + error = xfs_trans_commit(tp); + if (error) + return error; + + /* + * Allocation succeeded but the requested range was not even + * partially satisfied? Bail out! + */ + if (nimaps == 0) + return -ENOSPC; + } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff); + + return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); out_trans_cancel: xfs_trans_cancel(tp); return error; } +/* Allocate all CoW reservations covering a range of blocks in a file. */ +int +xfs_reflink_allocate_cow( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, + bool *shared, + uint *lockmode, + bool convert_now) +{ + int error; + bool found; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); + if (error || !*shared) + return error; + + if (found) { + /* CoW fork has a real extent */ + return xfs_reflink_convert_unwritten(ip, imap, cmap, + convert_now); + } + + if (cmap->br_startoff > imap->br_startoff) { + /* + * CoW fork does not have an extent and data extent is shared. + * Hence, allocate a real extent in the CoW fork. + */ + return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared, + lockmode, convert_now); + } + + if (isnullstartblock(cmap->br_startblock) || + cmap->br_startblock == DELAYSTARTBLOCK) { + /* + * CoW fork has a delalloc reservation. Replace it with a real + * extent. There may or may not be a data fork mapping. + */ + return xfs_reflink_fill_delalloc(ip, imap, cmap, shared, + lockmode, convert_now); + } + + /* Shouldn't get here. */ + ASSERT(0); + return -EFSCORRUPTED; +} + /* * Cancel CoW reservations for some block range of an inode. * ^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH V2] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork 2022-08-04 17:35 ` Darrick J. Wong @ 2022-08-05 8:39 ` Chandan Babu R 0 siblings, 0 replies; 5+ messages in thread From: Chandan Babu R @ 2022-08-05 8:39 UTC (permalink / raw) To: Darrick J. Wong; +Cc: linux-xfs, Wengang Wang On Thu, Aug 04, 2022 at 10:35:09 AM -0700, Darrick J. Wong wrote: > On Wed, Jul 13, 2022 at 04:15:51PM +0530, Chandan Babu R wrote: >> On a higly fragmented filesystem a Direct IO write can fail with -ENOSPC error >> even though the filesystem has sufficient number of free blocks. > > Oops, I totally didn't notice this patch. Sorry about that. :( > >> This occurs if the file offset range on which the write operation is being >> performed has a delalloc extent in the cow fork and this delalloc extent >> begins much before the Direct IO range. >> >> In such a scenario, xfs_reflink_allocate_cow() invokes xfs_bmapi_write() to >> allocate the blocks mapped by the delalloc extent. The extent thus allocated >> may not cover the beginning of file offset range on which the Direct IO write >> was issued. Hence xfs_reflink_allocate_cow() ends up returning -ENOSPC. >> >> The following script reliably recreates the bug described above. >> >> #!/usr/bin/bash >> >> device=/dev/loop0 >> shortdev=$(basename $device) >> >> mntpnt=/mnt/ >> file1=${mntpnt}/file1 >> file2=${mntpnt}/file2 >> fragmentedfile=${mntpnt}/fragmentedfile >> punchprog=/root/repos/xfstests-dev/src/punch-alternating >> >> errortag=/sys/fs/xfs/${shortdev}/errortag/bmap_alloc_minlen_extent >> >> umount $device > /dev/null 2>&1 >> >> echo "Create FS" >> mkfs.xfs -f -m reflink=1 $device > /dev/null 2>&1 >> if [[ $? != 0 ]]; then >> echo "mkfs failed." >> exit 1 >> fi >> >> echo "Mount FS" >> mount $device $mntpnt > /dev/null 2>&1 >> if [[ $? != 0 ]]; then >> echo "mount failed." >> exit 1 >> fi >> >> echo "Create source file" >> xfs_io -f -c "pwrite 0 32M" $file1 > /dev/null 2>&1 >> >> sync >> >> echo "Create Reflinked file" >> xfs_io -f -c "reflink $file1" $file2 &>/dev/null >> >> echo "Set cowextsize" >> xfs_io -c "cowextsize 16M" $file1 > /dev/null 2>&1 >> >> echo "Fragment FS" >> xfs_io -f -c "pwrite 0 64M" $fragmentedfile > /dev/null 2>&1 >> sync >> $punchprog $fragmentedfile >> >> echo "Allocate block sized extent from now onwards" >> echo -n 1 > $errortag >> >> echo "Create 16MiB delalloc extent in CoW fork" >> xfs_io -c "pwrite 0 4k" $file1 > /dev/null 2>&1 >> >> sync >> >> echo "Direct I/O write at offset 12k" >> xfs_io -d -c "pwrite 12k 8k" $file1 >> >> This commit fixes the bug by invoking xfs_bmapi_write() in a loop until disk >> blocks are allocated for atleast the starting file offset of the Direct IO >> write range. >> >> Fixes: 3c68d44a2b49 ("xfs: allocate direct I/O COW blocks in iomap_begin") >> Reported-and-Root-caused-by: Wengang Wang <wen.gang.wang@oracle.com> >> Signed-off-by: Chandan Babu R <chandan.babu@oracle.com> >> --- >> Changelog: >> V1 -> V2: >> 1. Add Fixes tag. >> >> fs/xfs/xfs_reflink.c | 225 +++++++++++++++++++++++++++++++++++-------- >> 1 file changed, 187 insertions(+), 38 deletions(-) >> >> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c >> index e7a7c00d93be..ab7a39244920 100644 >> --- a/fs/xfs/xfs_reflink.c >> +++ b/fs/xfs/xfs_reflink.c >> @@ -340,9 +340,41 @@ xfs_find_trim_cow_extent( >> return 0; >> } >> >> -/* Allocate all CoW reservations covering a range of blocks in a file. */ >> -int >> -xfs_reflink_allocate_cow( >> +static int >> +xfs_reflink_convert_unwritten( >> + struct xfs_inode *ip, >> + struct xfs_bmbt_irec *imap, >> + struct xfs_bmbt_irec *cmap, >> + bool convert_now) >> +{ >> + xfs_fileoff_t offset_fsb = imap->br_startoff; >> + xfs_filblks_t count_fsb = imap->br_blockcount; >> + int error; >> + >> + /* >> + * cmap might larger than imap due to cowextsize hint. >> + */ >> + xfs_trim_extent(cmap, offset_fsb, count_fsb); >> + >> + /* >> + * COW fork extents are supposed to remain unwritten until we're ready >> + * to initiate a disk write. For direct I/O we are going to write the >> + * data and need the conversion, but for buffered writes we're done. >> + */ >> + if (!convert_now || cmap->br_state == XFS_EXT_NORM) >> + return 0; >> + >> + trace_xfs_reflink_convert_cow(ip, cmap); >> + >> + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); >> + if (!error) >> + cmap->br_state = XFS_EXT_NORM; >> + >> + return error; >> +} >> + >> +static int >> +xfs_reflink_alloc_cow_unwritten( > > Hmm. If @convert_now is set, then upon successful return, cmap is a > written extent, right? I think a better name for this function is > xfs_reflink_fill_cow_hole, since that's what it's doing. > I agree. I will rename the function. >> struct xfs_inode *ip, >> struct xfs_bmbt_irec *imap, >> struct xfs_bmbt_irec *cmap, >> @@ -351,33 +383,17 @@ xfs_reflink_allocate_cow( >> bool convert_now) >> { >> struct xfs_mount *mp = ip->i_mount; >> - xfs_fileoff_t offset_fsb = imap->br_startoff; >> - xfs_filblks_t count_fsb = imap->br_blockcount; >> struct xfs_trans *tp; >> - int nimaps, error = 0; >> - bool found; >> xfs_filblks_t resaligned; >> - xfs_extlen_t resblks = 0; >> - >> - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); >> - if (!ip->i_cowfp) { >> - ASSERT(!xfs_is_reflink_inode(ip)); >> - xfs_ifork_init_cow(ip); >> - } >> - >> - error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); >> - if (error || !*shared) >> - return error; >> - if (found) >> - goto convert; >> + xfs_extlen_t resblks; >> + int nimaps; >> + int error; >> + bool found; >> >> resaligned = xfs_aligned_fsb_count(imap->br_startoff, >> imap->br_blockcount, xfs_get_cowextsz_hint(ip)); >> resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); >> >> - xfs_iunlock(ip, *lockmode); >> - *lockmode = 0; > > Hmmm. This piece effectively moves to this function's caller, which > means that the parent unlocks the inode and the child maybe relocks it. > The ILOCK handling in this whole call chain is already confusing enough > as we pass *lockmode around to avoid relocking ILOCK for an > xfs_trans_alloc* error return; could we try to contain lock state > cycling to single functions, please? > > IOWs, leave this hunk here and don't add it to xfs_reflink_allocate_cow. > I'll have more to say about this in the delalloc conversion function > below. > Yes, this makes the code much easier to read. >> - >> error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, >> false, &tp); >> if (error) >> @@ -385,17 +401,17 @@ xfs_reflink_allocate_cow( >> >> *lockmode = XFS_ILOCK_EXCL; >> >> - /* >> - * Check for an overlapping extent again now that we dropped the ilock. >> - */ >> error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); >> if (error || !*shared) >> goto out_trans_cancel; >> + >> if (found) { >> xfs_trans_cancel(tp); >> goto convert; >> } >> >> + ASSERT(cmap->br_startoff > imap->br_startoff); >> + >> /* Allocate the entire reservation as unwritten blocks. */ >> nimaps = 1; >> error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, >> @@ -415,19 +431,9 @@ xfs_reflink_allocate_cow( >> */ >> if (nimaps == 0) >> return -ENOSPC; >> + >> convert: >> - xfs_trim_extent(cmap, offset_fsb, count_fsb); >> - /* >> - * COW fork extents are supposed to remain unwritten until we're ready >> - * to initiate a disk write. For direct I/O we are going to write the >> - * data and need the conversion, but for buffered writes we're done. >> - */ >> - if (!convert_now || cmap->br_state == XFS_EXT_NORM) >> - return 0; >> - trace_xfs_reflink_convert_cow(ip, cmap); >> - error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); >> - if (!error) >> - cmap->br_state = XFS_EXT_NORM; >> + error = xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); >> return error; > > Nit: "return xfs_reflink_convert_unwritten(...);" > Ok. >> >> out_trans_cancel: >> @@ -435,6 +441,149 @@ xfs_reflink_allocate_cow( >> return error; >> } >> >> +static int >> +xfs_replace_delalloc_cow_extent( > > This is a rather long name, which would get even longer if it were > namespaced "xfs_reflink_*" like the rest of the functions in this file. > > How about xfs_reflink_fill_delalloc? > Yes, this is indeed better given that xfs_reflink_ has to prefixed to the function name. >> + struct xfs_inode *ip, >> + struct xfs_bmbt_irec *imap, >> + struct xfs_bmbt_irec *cmap, >> + bool *shared, >> + uint *lockmode, >> + bool convert_now) >> +{ >> + struct xfs_mount *mp = ip->i_mount; >> + struct xfs_trans *tp; >> + int nimaps; >> + int error; >> + bool found; >> + >> + while (1) { >> + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0, >> + false, &tp); >> + if (error) >> + return error; >> + >> + *lockmode = XFS_ILOCK_EXCL; >> + >> + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, >> + &found); >> + if (error || !*shared) >> + goto out_trans_cancel; >> + >> + if (found) { >> + xfs_trans_cancel(tp); >> + goto convert; > > Question: If @found is true here, do we need to check that cmap covers > at least one block of imap? I /think/ the answer is "no", because @cmap > will be set to whatever's in the cow fork at imap->br_startoff, and if > it's a real extent (written or not) then there's no delalloc reservation > to fill and we can move on to the next step. Yes. You are right. > > Also: "break" instead of a goto? > Ok. I will replace goto statement with break. >> + } >> + >> + ASSERT(isnullstartblock(cmap->br_startblock) || >> + cmap->br_startblock == DELAYSTARTBLOCK); >> + >> + /* >> + * Replace delalloc reservation with an unwritten extent. >> + */ >> + nimaps = 1; >> + error = xfs_bmapi_write(tp, ip, cmap->br_startoff, >> + cmap->br_blockcount, >> + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap, >> + &nimaps); > > Indenting here ^^^^ (two spaces for a continuation, please.) > Sorry about that. >> + if (error) >> + goto out_trans_cancel; >> + >> + xfs_inode_set_cowblocks_tag(ip); >> + error = xfs_trans_commit(tp); >> + if (error) >> + return error; >> + >> + /* >> + * Allocation succeeded but the requested range was not even partially >> + * satisfied? Bail out! >> + */ >> + if (nimaps == 0) >> + return -ENOSPC; >> + >> + if (cmap->br_startoff + cmap->br_blockcount > imap->br_startoff) >> + break; >> + >> + xfs_iunlock(ip, *lockmode); >> + *lockmode = 0; >> + } > > If the iunlock in xfs_reflink_allocate_cow were eliminated, then this > whole loop could turn into: > > do { > xfs_iunlock(...); > xfs_trans_alloc_inode(...); > /* stuff */ > } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff); > >> + >> +convert: >> + error = xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); >> + return error; > > return xfs_reflink_convert_unwritten(...); > >> + >> +out_trans_cancel: >> + xfs_trans_cancel(tp); >> + return error; >> +} >> + >> + >> +/* Allocate all CoW reservations covering a range of blocks in a file. */ >> +int >> +xfs_reflink_allocate_cow( >> + struct xfs_inode *ip, >> + struct xfs_bmbt_irec *imap, >> + struct xfs_bmbt_irec *cmap, >> + bool *shared, >> + uint *lockmode, >> + bool convert_now) >> +{ >> + int error; >> + bool found; >> + >> + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); >> + if (!ip->i_cowfp) { >> + ASSERT(!xfs_is_reflink_inode(ip)); >> + xfs_ifork_init_cow(ip); >> + } >> + >> + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); >> + if (error || !*shared) >> + return error; >> + >> + /* >> + * We have to deal with one of the following 2 cases, >> + * 1. No extent in CoW fork and shared extent in Data fork. >> + * 2. CoW fork has one of the following: >> + * - Unwritten/written extent in CoW fork. >> + * - Delalloc extent in CoW fork; An extent may or may not be present >> + * in the Data fork. >> + */ > > I think this comment is a bit redundant with the hunks below it. > I wanted the reader to get an overview about what is to expected from here onwards. But I guess, you are right since each of the individual cases have a comment explaining what is being performed. >> + >> + if (found) { >> + /* >> + * CoW fork has a real extent; Convert it to written if is an >> + * unwritten extent. >> + */ >> + error = xfs_reflink_convert_unwritten(ip, imap, cmap, >> + convert_now); >> + return error; >> + } >> + >> + xfs_iunlock(ip, *lockmode); >> + *lockmode = 0; > > Move this to xfs_reflink_alloc_cow_unwritten and > xfs_replace_delalloc_cow_extent so that we're not unlocking in one > function and relocking in another. > Ok. >> + if (cmap->br_startoff > imap->br_startoff) { >> + /* >> + * CoW fork does not have an extent. Hence, allocate a real >> + * extent in the CoW fork. >> + */ >> + error = xfs_reflink_alloc_cow_unwritten(ip, imap, cmap, shared, >> + lockmode, convert_now); >> + } else if (isnullstartblock(cmap->br_startblock) || >> + cmap->br_startblock == DELAYSTARTBLOCK) { >> + /* >> + * CoW fork has a delalloc extent. Replace it with a real >> + * extent. >> + */ >> + error = xfs_replace_delalloc_cow_extent(ip, imap, cmap, shared, >> + lockmode, convert_now); > > Also, this would be a bit easier to read if each if case was its own: > > if (foo) { > return XXX; > } > if (bar) { > return YYY; > } > I agree. >> + } else { >> + ASSERT(0); >> + } >> + >> + return error; >> +} >> + >> /* >> * Cancel CoW reservations for some block range of an inode. >> * >> -- >> 2.35.1 >> > > How about this for a v2 patch? > > --D > > From: Chandan Babu R <chandan.babu@oracle.com> > Subject: [PATCH] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork > > On a higly fragmented filesystem a Direct IO write can fail with -ENOSPC error > even though the filesystem has sufficient number of free blocks. > > This occurs if the file offset range on which the write operation is being > performed has a delalloc extent in the cow fork and this delalloc extent > begins much before the Direct IO range. > > In such a scenario, xfs_reflink_allocate_cow() invokes xfs_bmapi_write() to > allocate the blocks mapped by the delalloc extent. The extent thus allocated > may not cover the beginning of file offset range on which the Direct IO write > was issued. Hence xfs_reflink_allocate_cow() ends up returning -ENOSPC. > > The following script reliably recreates the bug described above. > > #!/usr/bin/bash > > device=/dev/loop0 > shortdev=$(basename $device) > > mntpnt=/mnt/ > file1=${mntpnt}/file1 > file2=${mntpnt}/file2 > fragmentedfile=${mntpnt}/fragmentedfile > punchprog=/root/repos/xfstests-dev/src/punch-alternating > > errortag=/sys/fs/xfs/${shortdev}/errortag/bmap_alloc_minlen_extent > > umount $device > /dev/null 2>&1 > > echo "Create FS" > mkfs.xfs -f -m reflink=1 $device > /dev/null 2>&1 > if [[ $? != 0 ]]; then > echo "mkfs failed." > exit 1 > fi > > echo "Mount FS" > mount $device $mntpnt > /dev/null 2>&1 > if [[ $? != 0 ]]; then > echo "mount failed." > exit 1 > fi > > echo "Create source file" > xfs_io -f -c "pwrite 0 32M" $file1 > /dev/null 2>&1 > > sync > > echo "Create Reflinked file" > xfs_io -f -c "reflink $file1" $file2 &>/dev/null > > echo "Set cowextsize" > xfs_io -c "cowextsize 16M" $file1 > /dev/null 2>&1 > > echo "Fragment FS" > xfs_io -f -c "pwrite 0 64M" $fragmentedfile > /dev/null 2>&1 > sync > $punchprog $fragmentedfile > > echo "Allocate block sized extent from now onwards" > echo -n 1 > $errortag > > echo "Create 16MiB delalloc extent in CoW fork" > xfs_io -c "pwrite 0 4k" $file1 > /dev/null 2>&1 > > sync > > echo "Direct I/O write at offset 12k" > xfs_io -d -c "pwrite 12k 8k" $file1 > > This commit fixes the bug by invoking xfs_bmapi_write() in a loop until disk > blocks are allocated for atleast the starting file offset of the Direct IO > write range. > > Fixes: 3c68d44a2b49 ("xfs: allocate direct I/O COW blocks in iomap_begin") > Reported-and-Root-caused-by: Wengang Wang <wen.gang.wang@oracle.com> > Signed-off-by: Chandan Babu R <chandan.babu@oracle.com> > --- > fs/xfs/xfs_reflink.c | 201 +++++++++++++++++++++++++++++++++++++++++--------- > 1 file changed, 166 insertions(+), 35 deletions(-) > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c > index 724806c7ce3e..b310cbaebe76 100644 > --- a/fs/xfs/xfs_reflink.c > +++ b/fs/xfs/xfs_reflink.c > @@ -341,9 +341,41 @@ xfs_find_trim_cow_extent( > return 0; > } > > -/* Allocate all CoW reservations covering a range of blocks in a file. */ > -int > -xfs_reflink_allocate_cow( > +static int > +xfs_reflink_convert_unwritten( > + struct xfs_inode *ip, > + struct xfs_bmbt_irec *imap, > + struct xfs_bmbt_irec *cmap, > + bool convert_now) > +{ > + xfs_fileoff_t offset_fsb = imap->br_startoff; > + xfs_filblks_t count_fsb = imap->br_blockcount; > + int error; > + > + /* > + * cmap might larger than imap due to cowextsize hint. > + */ > + xfs_trim_extent(cmap, offset_fsb, count_fsb); > + > + /* > + * COW fork extents are supposed to remain unwritten until we're ready > + * to initiate a disk write. For direct I/O we are going to write the > + * data and need the conversion, but for buffered writes we're done. > + */ > + if (!convert_now || cmap->br_state == XFS_EXT_NORM) > + return 0; > + > + trace_xfs_reflink_convert_cow(ip, cmap); > + > + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); > + if (!error) > + cmap->br_state = XFS_EXT_NORM; > + > + return error; > +} > + > +static int > +xfs_reflink_fill_cow_hole( > struct xfs_inode *ip, > struct xfs_bmbt_irec *imap, > struct xfs_bmbt_irec *cmap, > @@ -352,25 +384,12 @@ xfs_reflink_allocate_cow( > bool convert_now) > { > struct xfs_mount *mp = ip->i_mount; > - xfs_fileoff_t offset_fsb = imap->br_startoff; > - xfs_filblks_t count_fsb = imap->br_blockcount; > struct xfs_trans *tp; > - int nimaps, error = 0; > - bool found; > xfs_filblks_t resaligned; > - xfs_extlen_t resblks = 0; > - > - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); > - if (!ip->i_cowfp) { > - ASSERT(!xfs_is_reflink_inode(ip)); > - xfs_ifork_init_cow(ip); > - } > - > - error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); > - if (error || !*shared) > - return error; > - if (found) > - goto convert; > + xfs_extlen_t resblks; > + int nimaps; > + int error; > + bool found; > > resaligned = xfs_aligned_fsb_count(imap->br_startoff, > imap->br_blockcount, xfs_get_cowextsz_hint(ip)); > @@ -386,17 +405,17 @@ xfs_reflink_allocate_cow( > > *lockmode = XFS_ILOCK_EXCL; > > - /* > - * Check for an overlapping extent again now that we dropped the ilock. > - */ > error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); > if (error || !*shared) > goto out_trans_cancel; > + > if (found) { > xfs_trans_cancel(tp); > goto convert; > } > > + ASSERT(cmap->br_startoff > imap->br_startoff); > + > /* Allocate the entire reservation as unwritten blocks. */ > nimaps = 1; > error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, > @@ -416,26 +435,138 @@ xfs_reflink_allocate_cow( > */ > if (nimaps == 0) > return -ENOSPC; > + > convert: > - xfs_trim_extent(cmap, offset_fsb, count_fsb); > - /* > - * COW fork extents are supposed to remain unwritten until we're ready > - * to initiate a disk write. For direct I/O we are going to write the > - * data and need the conversion, but for buffered writes we're done. > - */ > - if (!convert_now || cmap->br_state == XFS_EXT_NORM) > - return 0; > - trace_xfs_reflink_convert_cow(ip, cmap); > - error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); > - if (!error) > - cmap->br_state = XFS_EXT_NORM; > + return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); > + > +out_trans_cancel: > + xfs_trans_cancel(tp); > return error; > +} > + > +static int > +xfs_reflink_fill_delalloc( > + struct xfs_inode *ip, > + struct xfs_bmbt_irec *imap, > + struct xfs_bmbt_irec *cmap, > + bool *shared, > + uint *lockmode, > + bool convert_now) > +{ > + struct xfs_mount *mp = ip->i_mount; > + struct xfs_trans *tp; > + int nimaps; > + int error; > + bool found; > + > + do { > + xfs_iunlock(ip, *lockmode); > + *lockmode = 0; > + > + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0, > + false, &tp); > + if (error) > + return error; > + > + *lockmode = XFS_ILOCK_EXCL; > + > + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, > + &found); > + if (error || !*shared) > + goto out_trans_cancel; > + > + if (found) { > + xfs_trans_cancel(tp); > + break; > + } > + > + ASSERT(isnullstartblock(cmap->br_startblock) || > + cmap->br_startblock == DELAYSTARTBLOCK); > + > + /* > + * Replace delalloc reservation with an unwritten extent. > + */ > + nimaps = 1; > + error = xfs_bmapi_write(tp, ip, cmap->br_startoff, > + cmap->br_blockcount, > + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, > + cmap, &nimaps); > + if (error) > + goto out_trans_cancel; > + > + xfs_inode_set_cowblocks_tag(ip); > + error = xfs_trans_commit(tp); > + if (error) > + return error; > + > + /* > + * Allocation succeeded but the requested range was not even > + * partially satisfied? Bail out! > + */ > + if (nimaps == 0) > + return -ENOSPC; > + } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff); > + > + return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); > > out_trans_cancel: > xfs_trans_cancel(tp); > return error; > } > > +/* Allocate all CoW reservations covering a range of blocks in a file. */ > +int > +xfs_reflink_allocate_cow( > + struct xfs_inode *ip, > + struct xfs_bmbt_irec *imap, > + struct xfs_bmbt_irec *cmap, > + bool *shared, > + uint *lockmode, > + bool convert_now) > +{ > + int error; > + bool found; > + > + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); > + if (!ip->i_cowfp) { > + ASSERT(!xfs_is_reflink_inode(ip)); > + xfs_ifork_init_cow(ip); > + } > + > + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); > + if (error || !*shared) > + return error; > + > + if (found) { > + /* CoW fork has a real extent */ > + return xfs_reflink_convert_unwritten(ip, imap, cmap, > + convert_now); > + } > + > + if (cmap->br_startoff > imap->br_startoff) { > + /* > + * CoW fork does not have an extent and data extent is shared. > + * Hence, allocate a real extent in the CoW fork. > + */ > + return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared, > + lockmode, convert_now); > + } > + > + if (isnullstartblock(cmap->br_startblock) || > + cmap->br_startblock == DELAYSTARTBLOCK) { > + /* > + * CoW fork has a delalloc reservation. Replace it with a real > + * extent. There may or may not be a data fork mapping. > + */ > + return xfs_reflink_fill_delalloc(ip, imap, cmap, shared, > + lockmode, convert_now); > + } > + > + /* Shouldn't get here. */ > + ASSERT(0); > + return -EFSCORRUPTED; > +} > + > /* > * Cancel CoW reservations for some block range of an inode. > * Looks perfect! Thanks for making the patch better. -- chandan ^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH v2] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork
@ 2022-08-06 0:34 Darrick J. Wong
0 siblings, 0 replies; 5+ messages in thread
From: Darrick J. Wong @ 2022-08-06 0:34 UTC (permalink / raw)
To: xfs; +Cc: Chandan Babu R, Chandan Babu R
From: Chandan Babu R <chandan.babu@oracle.com>
On a higly fragmented filesystem a Direct IO write can fail with -ENOSPC error
even though the filesystem has sufficient number of free blocks.
This occurs if the file offset range on which the write operation is being
performed has a delalloc extent in the cow fork and this delalloc extent
begins much before the Direct IO range.
In such a scenario, xfs_reflink_allocate_cow() invokes xfs_bmapi_write() to
allocate the blocks mapped by the delalloc extent. The extent thus allocated
may not cover the beginning of file offset range on which the Direct IO write
was issued. Hence xfs_reflink_allocate_cow() ends up returning -ENOSPC.
The following script reliably recreates the bug described above.
#!/usr/bin/bash
device=/dev/loop0
shortdev=$(basename $device)
mntpnt=/mnt/
file1=${mntpnt}/file1
file2=${mntpnt}/file2
fragmentedfile=${mntpnt}/fragmentedfile
punchprog=/root/repos/xfstests-dev/src/punch-alternating
errortag=/sys/fs/xfs/${shortdev}/errortag/bmap_alloc_minlen_extent
umount $device > /dev/null 2>&1
echo "Create FS"
mkfs.xfs -f -m reflink=1 $device > /dev/null 2>&1
if [[ $? != 0 ]]; then
echo "mkfs failed."
exit 1
fi
echo "Mount FS"
mount $device $mntpnt > /dev/null 2>&1
if [[ $? != 0 ]]; then
echo "mount failed."
exit 1
fi
echo "Create source file"
xfs_io -f -c "pwrite 0 32M" $file1 > /dev/null 2>&1
sync
echo "Create Reflinked file"
xfs_io -f -c "reflink $file1" $file2 &>/dev/null
echo "Set cowextsize"
xfs_io -c "cowextsize 16M" $file1 > /dev/null 2>&1
echo "Fragment FS"
xfs_io -f -c "pwrite 0 64M" $fragmentedfile > /dev/null 2>&1
sync
$punchprog $fragmentedfile
echo "Allocate block sized extent from now onwards"
echo -n 1 > $errortag
echo "Create 16MiB delalloc extent in CoW fork"
xfs_io -c "pwrite 0 4k" $file1 > /dev/null 2>&1
sync
echo "Direct I/O write at offset 12k"
xfs_io -d -c "pwrite 12k 8k" $file1
This commit fixes the bug by invoking xfs_bmapi_write() in a loop until disk
blocks are allocated for atleast the starting file offset of the Direct IO
write range.
Fixes: 3c68d44a2b49 ("xfs: allocate direct I/O COW blocks in iomap_begin")
Reported-and-Root-caused-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
[djwong: slight editing to make the locking less grody, and fix some style things]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
v2: slight style editing by darrick; if nobody complains, I'll push this
next week...
---
fs/xfs/xfs_reflink.c | 198 +++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 163 insertions(+), 35 deletions(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 724806c7ce3e..0a32b54456eb 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -341,9 +341,41 @@ xfs_find_trim_cow_extent(
return 0;
}
-/* Allocate all CoW reservations covering a range of blocks in a file. */
-int
-xfs_reflink_allocate_cow(
+static int
+xfs_reflink_convert_unwritten(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool convert_now)
+{
+ xfs_fileoff_t offset_fsb = imap->br_startoff;
+ xfs_filblks_t count_fsb = imap->br_blockcount;
+ int error;
+
+ /*
+ * cmap might larger than imap due to cowextsize hint.
+ */
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
+
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || cmap->br_state == XFS_EXT_NORM)
+ return 0;
+
+ trace_xfs_reflink_convert_cow(ip, cmap);
+
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ if (!error)
+ cmap->br_state = XFS_EXT_NORM;
+
+ return error;
+}
+
+static int
+xfs_reflink_fill_cow_hole(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
struct xfs_bmbt_irec *cmap,
@@ -352,25 +384,12 @@ xfs_reflink_allocate_cow(
bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = imap->br_startoff;
- xfs_filblks_t count_fsb = imap->br_blockcount;
struct xfs_trans *tp;
- int nimaps, error = 0;
- bool found;
xfs_filblks_t resaligned;
- xfs_extlen_t resblks = 0;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (!ip->i_cowfp) {
- ASSERT(!xfs_is_reflink_inode(ip));
- xfs_ifork_init_cow(ip);
- }
-
- error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
- if (error || !*shared)
- return error;
- if (found)
- goto convert;
+ xfs_extlen_t resblks;
+ int nimaps;
+ int error;
+ bool found;
resaligned = xfs_aligned_fsb_count(imap->br_startoff,
imap->br_blockcount, xfs_get_cowextsz_hint(ip));
@@ -386,17 +405,17 @@ xfs_reflink_allocate_cow(
*lockmode = XFS_ILOCK_EXCL;
- /*
- * Check for an overlapping extent again now that we dropped the ilock.
- */
error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
if (error || !*shared)
goto out_trans_cancel;
+
if (found) {
xfs_trans_cancel(tp);
goto convert;
}
+ ASSERT(cmap->br_startoff > imap->br_startoff);
+
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
@@ -416,26 +435,135 @@ xfs_reflink_allocate_cow(
*/
if (nimaps == 0)
return -ENOSPC;
+
convert:
- xfs_trim_extent(cmap, offset_fsb, count_fsb);
- /*
- * COW fork extents are supposed to remain unwritten until we're ready
- * to initiate a disk write. For direct I/O we are going to write the
- * data and need the conversion, but for buffered writes we're done.
- */
- if (!convert_now || cmap->br_state == XFS_EXT_NORM)
- return 0;
- trace_xfs_reflink_convert_cow(ip, cmap);
- error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
- if (!error)
- cmap->br_state = XFS_EXT_NORM;
+ return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
return error;
+}
+
+static int
+xfs_reflink_fill_delalloc(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int nimaps;
+ int error;
+ bool found;
+
+ do {
+ xfs_iunlock(ip, *lockmode);
+ *lockmode = 0;
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
+ false, &tp);
+ if (error)
+ return error;
+
+ *lockmode = XFS_ILOCK_EXCL;
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
+ &found);
+ if (error || !*shared)
+ goto out_trans_cancel;
+
+ if (found) {
+ xfs_trans_cancel(tp);
+ break;
+ }
+
+ ASSERT(isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK);
+
+ /*
+ * Replace delalloc reservation with an unwritten extent.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
+ cmap->br_blockcount,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
+ cmap, &nimaps);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_inode_set_cowblocks_tag(ip);
+ error = xfs_trans_commit(tp);
+ if (error)
+ return error;
+
+ /*
+ * Allocation succeeded but the requested range was not even
+ * partially satisfied? Bail out!
+ */
+ if (nimaps == 0)
+ return -ENOSPC;
+ } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
+
+ return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
out_trans_cancel:
xfs_trans_cancel(tp);
return error;
}
+/* Allocate all CoW reservations covering a range of blocks in a file. */
+int
+xfs_reflink_allocate_cow(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ int error;
+ bool found;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
+ if (error || !*shared)
+ return error;
+
+ /* CoW fork has a real extent */
+ if (found)
+ return xfs_reflink_convert_unwritten(ip, imap, cmap,
+ convert_now);
+
+ /*
+ * CoW fork does not have an extent and data extent is shared.
+ * Allocate a real extent in the CoW fork.
+ */
+ if (cmap->br_startoff > imap->br_startoff)
+ return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
+ lockmode, convert_now);
+
+ /*
+ * CoW fork has a delalloc reservation. Replace it with a real extent.
+ * There may or may not be a data fork mapping.
+ */
+ if (isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK)
+ return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
+ lockmode, convert_now);
+
+ /* Shouldn't get here. */
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
/*
* Cancel CoW reservations for some block range of an inode.
*
^ permalink raw reply related [flat|nested] 5+ messages in threadend of thread, other threads:[~2022-08-06 0:34 UTC | newest] Thread overview: 5+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2022-07-13 7:30 [PATCH] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork Chandan Babu R 2022-07-13 10:45 ` [PATCH V2] " Chandan Babu R 2022-08-04 17:35 ` Darrick J. Wong 2022-08-05 8:39 ` Chandan Babu R -- strict thread matches above, loose matches on Subject: below -- 2022-08-06 0:34 [PATCH v2] " Darrick J. Wong
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox