* [PATCH 1/3] fs: get_blocks needs an unaligned mapping flag
2010-07-23 10:41 [RFC, PATCH 0/3] serialise concurrent direct IO sub-block zeroing Dave Chinner
@ 2010-07-23 10:41 ` Dave Chinner
2010-07-23 15:29 ` Alex Elder
2010-07-23 10:41 ` [PATCH 2/3] xfs: serialise unaligned direct IO into unwritten extents Dave Chinner
` (2 subsequent siblings)
3 siblings, 1 reply; 11+ messages in thread
From: Dave Chinner @ 2010-07-23 10:41 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
When issuing concurrent unaligned direct IO to the same filesystem block, the
direct IO sub-block zeroing code will extend the length of the write being done
when writing into a hole or unwritten extents. If we are writing into unwritten
extents, then the two IOs will both see the extent as unwritten at IO issue
time and attempt to zero the part of the block that they are not writing to.
The result of this is that whichever IO completes last will win and part of the
block will be zero instead of containing the correct data. Eric Sandeen has
demonstrated the problem with xfstest #240. In the case of XFS, we allow
concurrent direct IO writes to occur, but we cannot allow block zeroing to
occur concurrently with other IO.
To allow serialisation of block zeroing across multiple independent IOs, we
need to know if the region being mapped by the IO is fsb-aligned or not. If it
is not aligned, then we need to prevent further direct IO writes from being
executed until the IO that is doing the zeroing completes (i.e. converts the
extent back to written). Passing the fact that the mapping is for an unaligned
IO into the get_blocks callback is sufficient to allow us to implement the
necessary serialisation.
Change the "create" parameter of the get_blocks callback to a flags field,
and define the flags to be backwards compatible as such:
#define GET_BLOCKS_READ 0x00 /* map, no allocation */
#define GET_BLOCKS_CREATE 0x01 /* map, allocate if hole */
#define GET_BLOCKS_UNALIGNED 0x02 /* mapping for unaligned IO */
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/buffer.c | 17 +++++++++--------
fs/direct-io.c | 25 ++++++++++++++++++++++---
fs/ioctl.c | 2 +-
fs/mpage.c | 6 ++++--
include/linux/fs.h | 10 +++++++++-
5 files changed, 45 insertions(+), 15 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index d54812b..eda0395 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1680,7 +1680,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
buffer_dirty(bh)) {
WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, block, bh, 1);
+ err = get_block(inode, block, bh, GET_BLOCKS_CREATE);
if (err)
goto recover;
clear_buffer_delay(bh);
@@ -1869,7 +1869,7 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, block, bh, 1);
+ err = get_block(inode, block, bh, GET_BLOCKS_CREATE);
if (err)
break;
if (buffer_new(bh)) {
@@ -2192,7 +2192,8 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
fully_mapped = 0;
if (iblock < lblock) {
WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, iblock, bh, 0);
+ err = get_block(inode, iblock, bh,
+ GET_BLOCKS_READ);
if (err)
SetPageError(page);
}
@@ -2583,9 +2584,9 @@ int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
block_end = block_start + blocksize;
bh->b_state = 0;
- create = 1;
+ create = GET_BLOCKS_CREATE;
if (block_start >= to)
- create = 0;
+ create = GET_BLOCKS_READ;
ret = get_block(inode, block_in_file + block_in_page,
bh, create);
if (ret)
@@ -2816,7 +2817,7 @@ has_buffers:
map_bh.b_size = blocksize;
map_bh.b_state = 0;
- err = get_block(inode, iblock, &map_bh, 0);
+ err = get_block(inode, iblock, &map_bh, GET_BLOCKS_READ);
if (err)
goto unlock;
/* unmapped? It's a hole - nothing to do */
@@ -2893,7 +2894,7 @@ int block_truncate_page(struct address_space *mapping,
err = 0;
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, iblock, bh, 0);
+ err = get_block(inode, iblock, bh, GET_BLOCKS_READ);
if (err)
goto unlock;
/* unmapped? It's a hole - nothing to do */
@@ -2987,7 +2988,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
tmp.b_state = 0;
tmp.b_blocknr = 0;
tmp.b_size = 1 << inode->i_blkbits;
- get_block(inode, block, &tmp, 0);
+ get_block(inode, block, &tmp, GET_BLOCKS_READ);
return tmp.b_blocknr;
}
EXPORT_SYMBOL(generic_block_bmap);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7600aac..a120a3d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -538,13 +538,33 @@ static int get_more_blocks(struct dio *dio)
dio_count = dio->final_block_in_request - dio->block_in_file;
fs_count = dio_count >> dio->blkfactor;
blkmask = (1 << dio->blkfactor) - 1;
- if (dio_count & blkmask)
+ if (dio_count & blkmask) {
fs_count++;
+ }
map_bh->b_state = 0;
map_bh->b_size = fs_count << dio->inode->i_blkbits;
/*
+ * Because we're mapping full filesystem blocks here to do our
+ * own sub-block zeroing for writes, we need to tell the fs
+ * whether we might be doing zeroing so it can synchronise the
+ * zeroing against other concurrent IO. If we don't do this,
+ * the two concurrent sub-block IO's to the same fsb can race
+ * and zero different portions of the sub-block resulting in
+ * zeros where there should be data. Telling the fs we're doing
+ * unaligned mappings allows it to serialise such IOs and avoid
+ * the race condition.
+ */
+ create = GET_BLOCKS_READ;
+ if (dio->rw & WRITE) {
+ create = GET_BLOCKS_CREATE;
+ if ((dio->block_in_file & blkmask) ||
+ (dio->final_block_in_request & blkmask))
+ create |= GET_BLOCKS_UNALIGNED;
+ }
+
+ /*
* For writes inside i_size on a DIO_SKIP_HOLES filesystem we
* forbid block creations: only overwrites are permitted.
* We will return early to the caller once we see an
@@ -555,11 +575,10 @@ static int get_more_blocks(struct dio *dio)
* which may decide to handle it or also return an unmapped
* buffer head.
*/
- create = dio->rw & WRITE;
if (dio->flags & DIO_SKIP_HOLES) {
if (dio->block_in_file < (i_size_read(dio->inode) >>
dio->blkbits))
- create = 0;
+ create = GET_BLOCKS_READ;
}
ret = (*dio->get_block)(dio->inode, fs_startblk,
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 2d140a7..3b6c095 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -295,7 +295,7 @@ int __generic_block_fiemap(struct inode *inode,
memset(&map_bh, 0, sizeof(struct buffer_head));
map_bh.b_size = len;
- ret = get_block(inode, start_blk, &map_bh, 0);
+ ret = get_block(inode, start_blk, &map_bh, GET_BLOCKS_READ);
if (ret)
break;
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2..66d2acd 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -230,7 +230,8 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
if (block_in_file < last_block) {
map_bh->b_size = (last_block-block_in_file) << blkbits;
- if (get_block(inode, block_in_file, map_bh, 0))
+ if (get_block(inode, block_in_file, map_bh,
+ GET_BLOCKS_READ))
goto confused;
*first_logical_block = block_in_file;
}
@@ -533,7 +534,8 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
map_bh.b_state = 0;
map_bh.b_size = 1 << blkbits;
- if (mpd->get_block(inode, block_in_file, &map_bh, 1))
+ if (mpd->get_block(inode, block_in_file, &map_bh,
+ GET_BLOCKS_CREATE))
goto confused;
if (buffer_new(&map_bh))
unmap_underlying_metadata(map_bh.b_bdev,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 68ca1b0..a140472 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -412,8 +412,16 @@ extern int dir_notify_enable;
#endif
struct buffer_head;
+
+/* get_blocks callback definition and flags */
+#define GET_BLOCKS_READ 0x00 /* map, no allocation */
+#define GET_BLOCKS_CREATE 0x01 /* map, allocate if hole */
+#define GET_BLOCKS_UNALIGNED 0x02 /* mapping for unaligned IO,
+ only use with GET_BLOCKS_CREATE */
+
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
+ struct buffer_head *bh_result, int flags);
+
typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
ssize_t bytes, void *private);
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH 2/3] xfs: serialise unaligned direct IO into unwritten extents
2010-07-23 10:41 [RFC, PATCH 0/3] serialise concurrent direct IO sub-block zeroing Dave Chinner
2010-07-23 10:41 ` [PATCH 1/3] fs: get_blocks needs an unaligned mapping flag Dave Chinner
@ 2010-07-23 10:41 ` Dave Chinner
2010-07-23 15:30 ` Alex Elder
2010-07-23 10:41 ` [PATCH 3/3] xfs: wait on IO completion inside an IO context Dave Chinner
2010-07-23 19:20 ` [RFC, PATCH 0/3] serialise concurrent direct IO sub-block zeroing Eric Sandeen
3 siblings, 1 reply; 11+ messages in thread
From: Dave Chinner @ 2010-07-23 10:41 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
If we do unaligned direct IO to a unwritten extent, the direct IO
code does sub-block zeroing for us. However, we need to serialise
the sub-block zeroing as we cannot have another outstanding IO to the
same block that is being zeroed at the same time. If we do so, then
we'll have two direct IOs to the same block that zero different
portions of the block and we'll end up with zeros where we should
have data.
Serialise such IOs in the get_blocks callback when we know that we
are mapping an unaligned direct IO. This initial implementation is
the sledge-hammer approach - we can probably be finer grained than
this to avoid excessive serialisation when all IO is unaligned to a
file.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/xfs/linux-2.6/xfs_aops.c | 47 +++++++++++++++++++++++++++++++++---------
1 files changed, 37 insertions(+), 10 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 8abbf05..5682490 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1286,15 +1286,17 @@ __xfs_get_blocks(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
- int create,
+ int flags,
int direct)
{
- int flags = create ? BMAPI_WRITE : BMAPI_READ;
+ int create = flags & GET_BLOCKS_CREATE;
+ int bflags = create ? BMAPI_WRITE : BMAPI_READ;
struct xfs_bmbt_irec imap;
xfs_off_t offset;
ssize_t size;
- int nimap = 1;
- int new = 0;
+ int nimap;
+ int new;
+ int iolock_changed = 0;
int error;
offset = (xfs_off_t)iblock << inode->i_blkbits;
@@ -1305,15 +1307,40 @@ __xfs_get_blocks(
return 0;
if (direct && create)
- flags |= BMAPI_DIRECT;
+ bflags |= BMAPI_DIRECT;
- error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
+remap:
+ memset(&imap, 0, sizeof(imap));
+ nimap = 1;
+ new = 0;
+ error = xfs_iomap(XFS_I(inode), offset, size, bflags, &imap, &nimap,
&new);
if (error)
return -error;
if (nimap == 0)
return 0;
+ /*
+ * If this is an unwritten extent and we are doing unaligned direct IO
+ * to it, we need to serialise the sub-block zeroing across multiple
+ * concurrent IOs. As a brute-force way of doing that, promote the
+ * IOLOCK we hold to exclusive to prevent new IO from being issued,
+ * wait for all the current IO to drain (and potentially convert the
+ * unwritten extents) and remap the extent once all the IO has drained.
+ * Then we can demote the IOLOCK back to shared and proceed with the
+ * IO.
+ */
+ if (direct && create && ISUNWRITTEN(&imap) &&
+ (flags & GET_BLOCKS_UNALIGNED) && !iolock_changed) {
+ xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
+ xfs_ilock(XFS_I(inode), XFS_IOLOCK_EXCL);
+ xfs_ioend_wait(XFS_I(inode));
+ iolock_changed = 1;
+ goto remap;
+ }
+ if (iolock_changed)
+ xfs_ilock_demote(XFS_I(inode), XFS_IOLOCK_EXCL);
+
if (imap.br_startblock != HOLESTARTBLOCK &&
imap.br_startblock != DELAYSTARTBLOCK) {
/*
@@ -1386,9 +1413,9 @@ xfs_get_blocks(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
- int create)
+ int flags)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+ return __xfs_get_blocks(inode, iblock, bh_result, flags, 0);
}
STATIC int
@@ -1396,9 +1423,9 @@ xfs_get_blocks_direct(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
- int create)
+ int flags)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+ return __xfs_get_blocks(inode, iblock, bh_result, flags, 1);
}
STATIC void
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH 3/3] xfs: wait on IO completion inside an IO context
2010-07-23 10:41 [RFC, PATCH 0/3] serialise concurrent direct IO sub-block zeroing Dave Chinner
2010-07-23 10:41 ` [PATCH 1/3] fs: get_blocks needs an unaligned mapping flag Dave Chinner
2010-07-23 10:41 ` [PATCH 2/3] xfs: serialise unaligned direct IO into unwritten extents Dave Chinner
@ 2010-07-23 10:41 ` Dave Chinner
2010-07-23 15:30 ` Alex Elder
2010-07-23 19:20 ` [RFC, PATCH 0/3] serialise concurrent direct IO sub-block zeroing Eric Sandeen
3 siblings, 1 reply; 11+ messages in thread
From: Dave Chinner @ 2010-07-23 10:41 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
To wait while IOs drain from the inode while inside an ioend context, we have
to wait until the inode io count drops to 1 - the reference we hold - rather
than zero. Add functionality to the ioend wait subsystem to do this.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/xfs/linux-2.6/xfs_aops.c | 50 +++++++++++++++++++++++++++++++++++++++++-
fs/xfs/xfs_inode.h | 13 ++++++-----
2 files changed, 55 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 5682490..ec499f2 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -64,6 +64,9 @@ xfs_ioend_init(void)
init_waitqueue_head(&xfs_ioend_wq[i]);
}
+/*
+ * wait for all IO to drain from the inode
+ */
void
xfs_ioend_wait(
xfs_inode_t *ip)
@@ -73,12 +76,55 @@ xfs_ioend_wait(
wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
}
+/*
+ * If we have an active ioend in the caller context (e.g.
+ * xfs_get_blocks_direct) we need to wait for the count to drop to one before
+ * we are woken.
+ *
+ * For this to work in the context of concurrent callers (concurrent direct IO),
+ * this function must be called with the iolock held exclusively to prevent
+ * other IOs blocking here and preventing the count from ever dropping to 1.
+ */
+STATIC void
+xfs_ioend_wait_excl(
+ xfs_inode_t *ip)
+{
+ wait_queue_head_t *wq = to_ioend_wq(ip);
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ xfs_iflags_set(ip, XFS_IIOEND_WAIT_EXCL);
+ wait_event(*wq, (atomic_read(&ip->i_iocount) == 1));
+ xfs_iflags_clear(ip, XFS_IIOEND_WAIT_EXCL);
+}
+
STATIC void
xfs_ioend_wake(
xfs_inode_t *ip)
{
- if (atomic_dec_and_test(&ip->i_iocount))
+ if (atomic_dec_and_test(&ip->i_iocount)) {
+ wake_up(to_ioend_wq(ip));
+ return;
+ }
+
+ /*
+ * do an unlocked check for an exclusive wait before trying to get
+ * spinlocks to avoid hurting the normal path too much. We can do this
+ * check unlocked because if the flag is not set here and this is the
+ * last IO remaining (i.e. iocount == 1 after the above decrement),
+ * then any code that enters xfs_ioend_wait_excl() will now see that
+ * i_iocount == 1 and return immediately. Hence we don't need to issue
+ * a wakeup in this case, and it keeps the common case overhead as low
+ * as possible.
+ */
+ smp_rmb();
+ if (!__xfs_iflags_test(ip, XFS_IIOEND_WAIT_EXCL))
+ return;
+
+ spin_lock(&ip->i_flags_lock);
+ if (atomic_read(&ip->i_iocount) == 1 &&
+ __xfs_iflags_test(ip, XFS_IIOEND_WAIT_EXCL))
wake_up(to_ioend_wq(ip));
+ spin_unlock(&ip->i_flags_lock);
}
void
@@ -1334,7 +1380,7 @@ remap:
(flags & GET_BLOCKS_UNALIGNED) && !iolock_changed) {
xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
xfs_ilock(XFS_I(inode), XFS_IOLOCK_EXCL);
- xfs_ioend_wait(XFS_I(inode));
+ xfs_ioend_wait_excl(XFS_I(inode));
iolock_changed = 1;
goto remap;
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0898c54..40b5203 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -357,12 +357,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
/*
* In-core inode flags.
*/
-#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */
-#define XFS_ISTALE 0x0002 /* inode has been staled */
-#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
-#define XFS_INEW 0x0008 /* inode has just been allocated */
-#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
-#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
+#define XFS_IRECLAIM 0x0001 /* have started reclaiming this inode */
+#define XFS_ISTALE 0x0002 /* inode has been staled */
+#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
+#define XFS_INEW 0x0008 /* inode has just been allocated */
+#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
+#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
+#define XFS_IIOEND_WAIT_EXCL 0x0040 /* io completion waiter in io context */
/*
* Flags for inode locking.
--
1.7.1
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 11+ messages in thread