From mboxrd@z Thu Jan 1 00:00:00 1970 From: Dave Chinner Subject: [PATCH 1/2] dio: track and serialise unaligned direct IO Date: Mon, 2 Aug 2010 17:25:44 +1000 Message-ID: <1280733945-16231-2-git-send-email-david@fromorbit.com> References: <1280733945-16231-1-git-send-email-david@fromorbit.com> Mime-Version: 1.0 Content-Type: TEXT/PLAIN; charset=ISO-8859-1 Content-Transfer-Encoding: QUOTED-PRINTABLE Cc: xfs@oss.sgi.com, sandeen@sandeen.net To: linux-fsdevel@vger.kernel.org Return-path: Received: from bld-mail17.adl2.internode.on.net ([150.101.137.102]:43893 "EHLO mail.internode.on.net" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1753062Ab0HBH0X (ORCPT ); Mon, 2 Aug 2010 03:26:23 -0400 In-Reply-To: <1280733945-16231-1-git-send-email-david@fromorbit.com> Sender: linux-fsdevel-owner@vger.kernel.org List-ID: =46rom: Dave Chinner If we get two unaligned direct IO's to the same filesystem block that is marked as a new allocation (i.e. buffer_new), then both IOs will zero the portion of the block they are not writing data to. As a result, when the IOs complete there will be a portion of the block that contains zeros from the last IO to complete rather than the data that should be there. This is easily manifested by qemu using aio+dio with an unaligned guest filesystem - every IO is unaligned and fileystem corruption is encountered in the guest filesystem. xfstest 240 (from Eric Sandeen) is also a simple reproducer. To avoid this problem, track unaligned IO that triggers sub-block zeroing and check new incoming unaligned IO that require sub-block zeroing against that list. If we get an overlap where the start and end of unaligned IOs hit the same filesystem block, then we need to block the incoming IOs until the IO that is zeroing the block completes. The blocked IO can then continue without needing to do any zeroing and hence won't overwrite valid data with zeros. Signed-off-by: Dave Chinner --- fs/direct-io.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++= +++++-- 1 files changed, 146 insertions(+), 6 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index a10cb91..611524e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -71,6 +71,9 @@ struct dio { unsigned start_zero_done; /* flag: sub-blocksize zeroing has been performed at the start of a write */ +#define LAST_SECTOR ((sector_t)-1LL) + sector_t zero_block_front; /* fs block we are zeroing at front */ + sector_t zero_block_rear; /* fs block we are zeroing at rear */ int pages_in_io; /* approximate total IO pages */ size_t size; /* total request size (doesn't change)*/ sector_t block_in_file; /* Current offset into the underlying @@ -135,6 +138,101 @@ struct dio { struct page *pages[DIO_PAGES]; /* page buffer */ }; =20 + +/* + * record fs blocks we are doing zeroing on in a zero block list. + * unaligned IO is not very performant and so is relatively uncommon, + * so a global list should be sufficent to track them. + */ +struct dio_zero_block { + struct list_head dio_list; /* list of io in progress */ + sector_t zero_block; /* block being zeroed */ + struct dio *dio; /* owner dio */ + wait_queue_head_t wq; /* New IO block here */ + atomic_t ref; /* reference count */ +}; + +DEFINE_SPINLOCK(dio_zero_block_lock); +LIST_HEAD(dio_zero_block_list); + +/* + * Add a filesystem block to the list of blocks we are tracking. + */ +static void +dio_start_zero_block(struct dio *dio, sector_t zero_block) +{ + struct dio_zero_block *zb; + + zb =3D kmalloc(sizeof(*zb), GFP_NOIO); + if (!zb) + return; + INIT_LIST_HEAD(&zb->dio_list); + init_waitqueue_head(&zb->wq); + zb->zero_block =3D zero_block; + zb->dio =3D dio; + atomic_set(&zb->ref, 1); + + spin_lock(&dio_zero_block_lock); + list_add(&zb->dio_list, &dio_zero_block_list); + spin_unlock(&dio_zero_block_lock); +} + +static void +dio_drop_zero_block(struct dio_zero_block *zb) +{ + if (atomic_dec_and_test(&zb->ref)) + kfree(zb); +} + +/* + * Check whether a filesystem block is currently being zeroed, and if = it is + * wait for it to complete before returning. If we waited for a block = being + * zeroed, return 1 to indicate that the block is already initialised, + * otherwise return 0 to indicate that it needs zeroing. + */ +static int +dio_wait_zero_block(struct dio *dio, sector_t zero_block) +{ + struct dio_zero_block *zb; + + spin_lock(&dio_zero_block_lock); + list_for_each_entry(zb, &dio_zero_block_list, dio_list) { + if (zb->dio->inode !=3D dio->inode) + continue; + if (zb->zero_block !=3D zero_block) + continue; + atomic_inc(&zb->ref); + spin_unlock(&dio_zero_block_lock); + wait_event(zb->wq, (list_empty(&zb->dio_list))); + dio_drop_zero_block(zb); + return 1; + } + spin_unlock(&dio_zero_block_lock); + return 0; +} + +/* + * Complete a block zeroing and wake up anyone waiting for it. + */ +static void dio_end_zero_block(struct dio *dio, sector_t zero_block) +{ + struct dio_zero_block *zb; + + spin_lock(&dio_zero_block_lock); + list_for_each_entry(zb, &dio_zero_block_list, dio_list) { + if (zb->dio->inode !=3D dio->inode) + continue; + if (zb->zero_block !=3D zero_block) + continue; + list_del_init(&zb->dio_list); + spin_unlock(&dio_zero_block_lock); + wake_up(&zb->wq); + dio_drop_zero_block(zb); + return; + } + spin_unlock(&dio_zero_block_lock); +} + /* * How many pages are in the queue? */ @@ -253,6 +351,11 @@ static int dio_complete(struct dio *dio, loff_t of= fset, int ret, bool is_async) aio_complete(dio->iocb, ret, 0); } =20 + if (dio->zero_block_front !=3D LAST_SECTOR) + dio_end_zero_block(dio, dio->zero_block_front); + if (dio->zero_block_rear !=3D LAST_SECTOR) + dio_end_zero_block(dio, dio->zero_block_rear); + if (dio->flags & DIO_LOCKING) /* lockdep: non-owner release */ up_read_non_owner(&dio->inode->i_alloc_sem); @@ -777,6 +880,12 @@ static void clean_blockdev_aliases(struct dio *dio= ) * block with zeros. This happens only if user-buffer, fileoffset or * io length is not filesystem block-size multiple. * + * We need to track the blocks we are zeroing. If we have concurrent I= Os that hit + * the same start or end block, we do not want all the IOs to zero the= portion + * they are not writing data to as that will overwrite data from the o= ther IOs. + * Hence we need to block until the first unaligned IO completes befor= e we can + * continue (without executing any zeroing). + * * `end' is zero if we're doing the start of the IO, 1 at the end of t= he * IO. */ @@ -784,8 +893,8 @@ static void dio_zero_block(struct dio *dio, int end= ) { unsigned dio_blocks_per_fs_block; unsigned this_chunk_blocks; /* In dio_blocks */ - unsigned this_chunk_bytes; struct page *page; + sector_t fsblock; =20 dio->start_zero_done =3D 1; if (!dio->blkfactor || !buffer_new(&dio->map_bh)) @@ -797,17 +906,41 @@ static void dio_zero_block(struct dio *dio, int e= nd) if (!this_chunk_blocks) return; =20 + if (end) + this_chunk_blocks =3D dio_blocks_per_fs_block - this_chunk_blocks; + /* * We need to zero out part of an fs block. It is either at the - * beginning or the end of the fs block. + * beginning or the end of the fs block, but first we need to check i= f + * there is already a zeroing being run on this block. + * + * If we are doing a sub-block IO (i.e. zeroing both front and rear o= f + * the same block) we don't need to wait or set a gaurd for the rear = of + * the block as we already have one set. */ - if (end)=20 - this_chunk_blocks =3D dio_blocks_per_fs_block - this_chunk_blocks; + fsblock =3D dio->block_in_file >> dio->blkfactor; + if (!end || dio->zero_block_front !=3D fsblock) { =20 - this_chunk_bytes =3D this_chunk_blocks << dio->blkbits; + /* wait for any zeroing already in progress */ + if (dio_wait_zero_block(dio, fsblock)) { + /* skip the range we would have zeroed. */ + dio->next_block_for_io +=3D this_chunk_blocks; + return; + } + + /* + * we are going to zero stuff now, so set a guard to catch + * others that might want to zero the same block. + */ + dio_start_zero_block(dio, fsblock); + if (end) + dio->zero_block_rear =3D fsblock; + else + dio->zero_block_front =3D fsblock; + } =20 page =3D ZERO_PAGE(0); - if (submit_page_section(dio, page, 0, this_chunk_bytes,=20 + if (submit_page_section(dio, page, 0, this_chunk_blocks << dio->blkbi= ts, dio->next_block_for_io)) return; =20 @@ -1191,6 +1324,13 @@ __blockdev_direct_IO_newtrunc(int rw, struct kio= cb *iocb, struct inode *inode, */ memset(dio, 0, offsetof(struct dio, pages)); =20 + /* + * zero_blocks need to initialised to large=D1=95t value to avoid + * matching the zero block accidentally. + */ + dio->zero_block_front =3D LAST_SECTOR; + dio->zero_block_rear =3D LAST_SECTOR; + dio->flags =3D flags; if (dio->flags & DIO_LOCKING) { /* watch out for a 0 len io from a tricksy fs */ --=20 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel= " in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html