From mboxrd@z Thu Jan 1 00:00:00 1970 From: Dave Chinner Subject: [PATCH] dio: track and serialise unaligned direct IO Date: Fri, 30 Jul 2010 08:45:16 +1000 Message-ID: <1280443516-14448-1-git-send-email-david@fromorbit.com> Mime-Version: 1.0 Content-Type: TEXT/PLAIN; charset=ISO-8859-1 Content-Transfer-Encoding: QUOTED-PRINTABLE Cc: xfs@oss.sgi.com, sandeen@sandeen.net To: linux-fsdevel@vger.kernel.org Return-path: Received: from bld-mail18.adl2.internode.on.net ([150.101.137.103]:42192 "EHLO mail.internode.on.net" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1757389Ab0G2Wpg (ORCPT ); Thu, 29 Jul 2010 18:45:36 -0400 Sender: linux-fsdevel-owner@vger.kernel.org List-ID: =46rom: Dave Chinner If we get two unaligned direct IO's to the same filesystem block that is marked as a new allocation (i.e. buffer_new), then both IOs wil= l zero the portion of the block they are not writing data to. As a result, when the IOs complete there will be a portion of the block that contains zeros from the last IO to complete rather than the data that should be there. This is easily manifested by qemu using aio+dio with an unaligned guest filesystem - every IO is unaligned and fileystem corruption is encountered in the guest filesystem. xfstest 240 (from Eric Sandeen) is also a simple reproducer. To avoid this problem, track unaligned IO that triggers sub-block zeroi= ng and check new incoming unaligned IO that require sub-block zeroing against = that list. If we get an overlap where the start and end of unaligned IOs hit= the same filesystem block, then we need to block the incoming IOs until the= IO that is zeroing the block completes. The blocked IO can then continue withou= t needing to do any zeroing and hence won't overwrite valid data with zer= os. Signed-off-by: Dave Chinner --- fs/direct-io.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++= +++++-- 1 files changed, 146 insertions(+), 6 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index a10cb91..611524e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -71,6 +71,9 @@ struct dio { unsigned start_zero_done; /* flag: sub-blocksize zeroing has been performed at the start of a write */ +#define LAST_SECTOR ((sector_t)-1LL) + sector_t zero_block_front; /* fs block we are zeroing at front */ + sector_t zero_block_rear; /* fs block we are zeroing at rear */ int pages_in_io; /* approximate total IO pages */ size_t size; /* total request size (doesn't change)*/ sector_t block_in_file; /* Current offset into the underlying @@ -135,6 +138,101 @@ struct dio { struct page *pages[DIO_PAGES]; /* page buffer */ }; =20 + +/* + * record fs blocks we are doing zeroing on in a zero block list. + * unaligned IO is not very performant and so is relatively uncommon, + * so a global list should be sufficent to track them. + */ +struct dio_zero_block { + struct list_head dio_list; /* list of io in progress */ + sector_t zero_block; /* block being zeroed */ + struct dio *dio; /* owner dio */ + wait_queue_head_t wq; /* New IO block here */ + atomic_t ref; /* reference count */ +}; + +DEFINE_SPINLOCK(dio_zero_block_lock); +LIST_HEAD(dio_zero_block_list); + +/* + * Add a filesystem block to the list of blocks we are tracking. + */ +static void +dio_start_zero_block(struct dio *dio, sector_t zero_block) +{ + struct dio_zero_block *zb; + + zb =3D kmalloc(sizeof(*zb), GFP_NOIO); + if (!zb) + return; + INIT_LIST_HEAD(&zb->dio_list); + init_waitqueue_head(&zb->wq); + zb->zero_block =3D zero_block; + zb->dio =3D dio; + atomic_set(&zb->ref, 1); + + spin_lock(&dio_zero_block_lock); + list_add(&zb->dio_list, &dio_zero_block_list); + spin_unlock(&dio_zero_block_lock); +} + +static void +dio_drop_zero_block(struct dio_zero_block *zb) +{ + if (atomic_dec_and_test(&zb->ref)) + kfree(zb); +} + +/* + * Check whether a filesystem block is currently being zeroed, and if = it is + * wait for it to complete before returning. If we waited for a block = being + * zeroed, return 1 to indicate that the block is already initialised, + * otherwise return 0 to indicate that it needs zeroing. + */ +static int +dio_wait_zero_block(struct dio *dio, sector_t zero_block) +{ + struct dio_zero_block *zb; + + spin_lock(&dio_zero_block_lock); + list_for_each_entry(zb, &dio_zero_block_list, dio_list) { + if (zb->dio->inode !=3D dio->inode) + continue; + if (zb->zero_block !=3D zero_block) + continue; + atomic_inc(&zb->ref); + spin_unlock(&dio_zero_block_lock); + wait_event(zb->wq, (list_empty(&zb->dio_list))); + dio_drop_zero_block(zb); + return 1; + } + spin_unlock(&dio_zero_block_lock); + return 0; +} + +/* + * Complete a block zeroing and wake up anyone waiting for it. + */ +static void dio_end_zero_block(struct dio *dio, sector_t zero_block) +{ + struct dio_zero_block *zb; + + spin_lock(&dio_zero_block_lock); + list_for_each_entry(zb, &dio_zero_block_list, dio_list) { + if (zb->dio->inode !=3D dio->inode) + continue; + if (zb->zero_block !=3D zero_block) + continue; + list_del_init(&zb->dio_list); + spin_unlock(&dio_zero_block_lock); + wake_up(&zb->wq); + dio_drop_zero_block(zb); + return; + } + spin_unlock(&dio_zero_block_lock); +} + /* * How many pages are in the queue? */ @@ -253,6 +351,11 @@ static int dio_complete(struct dio *dio, loff_t of= fset, int ret, bool is_async) aio_complete(dio->iocb, ret, 0); } =20 + if (dio->zero_block_front !=3D LAST_SECTOR) + dio_end_zero_block(dio, dio->zero_block_front); + if (dio->zero_block_rear !=3D LAST_SECTOR) + dio_end_zero_block(dio, dio->zero_block_rear); + if (dio->flags & DIO_LOCKING) /* lockdep: non-owner release */ up_read_non_owner(&dio->inode->i_alloc_sem); @@ -777,6 +880,12 @@ static void clean_blockdev_aliases(struct dio *dio= ) * block with zeros. This happens only if user-buffer, fileoffset or * io length is not filesystem block-size multiple. * + * We need to track the blocks we are zeroing. If we have concurrent I= Os that hit + * the same start or end block, we do not want all the IOs to zero the= portion + * they are not writing data to as that will overwrite data from the o= ther IOs. + * Hence we need to block until the first unaligned IO completes befor= e we can + * continue (without executing any zeroing). + * * `end' is zero if we're doing the start of the IO, 1 at the end of t= he * IO. */ @@ -784,8 +893,8 @@ static void dio_zero_block(struct dio *dio, int end= ) { unsigned dio_blocks_per_fs_block; unsigned this_chunk_blocks; /* In dio_blocks */ - unsigned this_chunk_bytes; struct page *page; + sector_t fsblock; =20 dio->start_zero_done =3D 1; if (!dio->blkfactor || !buffer_new(&dio->map_bh)) @@ -797,17 +906,41 @@ static void dio_zero_block(struct dio *dio, int e= nd) if (!this_chunk_blocks) return; =20 + if (end) + this_chunk_blocks =3D dio_blocks_per_fs_block - this_chunk_blocks; + /* * We need to zero out part of an fs block. It is either at the - * beginning or the end of the fs block. + * beginning or the end of the fs block, but first we need to check i= f + * there is already a zeroing being run on this block. + * + * If we are doing a sub-block IO (i.e. zeroing both front and rear o= f + * the same block) we don't need to wait or set a gaurd for the rear = of + * the block as we already have one set. */ - if (end)=20 - this_chunk_blocks =3D dio_blocks_per_fs_block - this_chunk_blocks; + fsblock =3D dio->block_in_file >> dio->blkfactor; + if (!end || dio->zero_block_front !=3D fsblock) { =20 - this_chunk_bytes =3D this_chunk_blocks << dio->blkbits; + /* wait for any zeroing already in progress */ + if (dio_wait_zero_block(dio, fsblock)) { + /* skip the range we would have zeroed. */ + dio->next_block_for_io +=3D this_chunk_blocks; + return; + } + + /* + * we are going to zero stuff now, so set a guard to catch + * others that might want to zero the same block. + */ + dio_start_zero_block(dio, fsblock); + if (end) + dio->zero_block_rear =3D fsblock; + else + dio->zero_block_front =3D fsblock; + } =20 page =3D ZERO_PAGE(0); - if (submit_page_section(dio, page, 0, this_chunk_bytes,=20 + if (submit_page_section(dio, page, 0, this_chunk_blocks << dio->blkbi= ts, dio->next_block_for_io)) return; =20 @@ -1191,6 +1324,13 @@ __blockdev_direct_IO_newtrunc(int rw, struct kio= cb *iocb, struct inode *inode, */ memset(dio, 0, offsetof(struct dio, pages)); =20 + /* + * zero_blocks need to initialised to large=D1=95t value to avoid + * matching the zero block accidentally. + */ + dio->zero_block_front =3D LAST_SECTOR; + dio->zero_block_rear =3D LAST_SECTOR; + dio->flags =3D flags; if (dio->flags & DIO_LOCKING) { /* watch out for a 0 len io from a tricksy fs */ --=20 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel= " in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html