From mboxrd@z Thu Jan 1 00:00:00 1970 From: Chris Mason Subject: [PATCH RFC] O_DIRECT reads and writes without i_sem Date: Mon, 01 Nov 2004 10:32:07 -0500 Message-ID: <1099323127.23475.80.camel@watt.suse.com> Mime-Version: 1.0 Content-Type: text/plain Content-Transfer-Encoding: 7bit Return-path: Received: from cantor.suse.de ([195.135.220.2]:22756 "EHLO Cantor.suse.de") by vger.kernel.org with ESMTP id S268430AbUKAPe3 (ORCPT ); Mon, 1 Nov 2004 10:34:29 -0500 To: linux-fsdevel@vger.kernel.org, akpm@osdl.org Sender: linux-fsdevel-owner@vger.kernel.org List-Id: linux-fsdevel.vger.kernel.org Hello everyone, Right now, O_DIRECT reads and writes on regular files have to take i_sem while reading file metadata in order to make sure we don't race with hole filling. This patch tries to get around that by avoiding i_sem when we are doing an O_DIRECT read or write inside of i_size. Yet another rw semaphore is added to struct inode to protect against holes being filled during the O_DIRECT. direct-io.c gets another special case to be aware of the locking. This has only been lightly tested, I'm posting here for general comments before I go much further. I'm rounding up some hardware with enough disks to benchmark it properly. Only ext2 and reiserfs are modified to drop i_sem during O_DIRECT. ext3 needs some care around the orphan lists, and I didn't want to get into that until the rest of the patch was working. -chris [patch against 2.6.10-rc1-mm2] Index: linux.mm/fs/direct-io.c =================================================================== --- linux.mm.orig/fs/direct-io.c 2004-11-01 09:34:24.000000000 -0500 +++ linux.mm/fs/direct-io.c 2004-11-01 09:34:40.000000000 -0500 @@ -915,7 +915,7 @@ out: } /* - * Releases both i_sem and i_alloc_sem + * Releases i_sem, i_alloc_sem and i_hole_sem */ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, @@ -1065,6 +1065,8 @@ direct_io_worker(int rw, struct kiocb *i */ if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) up(&dio->inode->i_sem); + if ((dio->lock_type == DIO_HOLE_LOCKING)) + up_read(&dio->inode->i_hole_sem); /* * OK, all BIOs are submitted, so we can decrement bio_count to truly @@ -1199,6 +1201,9 @@ __blockdev_direct_IO(int rw, struct kioc * For regular files using DIO_OWN_LOCKING, * both readers and writers need to grab i_alloc_sem * neither readers nor writers hold i_sem on entry (nor exit) + * For regular files using DIO_OWN_LOCKING, + * same as DIO_OWN_LOCKING, but both readers/writers read lock + * i_hole_sem until after get_blocks has finished. */ dio->lock_type = dio_lock_type; if (dio_lock_type != DIO_NO_LOCKING) { @@ -1214,11 +1219,14 @@ __blockdev_direct_IO(int rw, struct kioc goto out; } down_read(&inode->i_alloc_sem); - if (dio_lock_type == DIO_OWN_LOCKING) + if (dio_lock_type == DIO_OWN_LOCKING || + dio_lock_type == DIO_HOLE_LOCKING) up(&inode->i_sem); } else { down_read(&inode->i_alloc_sem); } + if (dio_lock_type == DIO_HOLE_LOCKING) + down_read(&inode->i_hole_sem); } /* * For file extending writes updating i_size before data Index: linux.mm/fs/ext2/inode.c =================================================================== --- linux.mm.orig/fs/ext2/inode.c 2004-11-01 09:34:24.000000000 -0500 +++ linux.mm/fs/ext2/inode.c 2004-11-01 09:34:40.000000000 -0500 @@ -31,6 +31,7 @@ #include #include #include +#include #include "ext2.h" #include "acl.h" @@ -649,9 +650,26 @@ ext2_direct_IO(int rw, struct kiocb *ioc { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - - return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, ext2_get_blocks, NULL); + ssize_t ret; + loff_t count = iov_length(iov, nr_segs); + int locking = DIO_HOLE_LOCKING; + + /* + * reads always use DIO_HOLE_LOCKING. writes use DIO_HOLE_LOCKING + * unless they are extending the file + */ + if (rw == WRITE) { + if (offset + count > i_size_read(inode)) + locking = DIO_LOCKING; + else + up(&inode->i_sem); + } + ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, ext2_get_blocks, NULL, + locking); + if (rw == WRITE && locking == DIO_HOLE_LOCKING) + down(&inode->i_sem); + return ret; } static int Index: linux.mm/fs/inode.c =================================================================== --- linux.mm.orig/fs/inode.c 2004-11-01 09:34:24.000000000 -0500 +++ linux.mm/fs/inode.c 2004-11-01 09:34:40.000000000 -0500 @@ -197,6 +197,7 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); init_rwsem(&inode->i_alloc_sem); + init_rwsem(&inode->i_hole_sem); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); rwlock_init(&inode->i_data.tree_lock); spin_lock_init(&inode->i_data.i_mmap_lock); Index: linux.mm/fs/reiserfs/file.c =================================================================== --- linux.mm.orig/fs/reiserfs/file.c 2004-11-01 09:34:05.000000000 -0500 +++ linux.mm/fs/reiserfs/file.c 2004-11-01 09:34:40.000000000 -0500 @@ -1249,6 +1249,7 @@ ssize_t reiserfs_file_write( struct file if (res) goto out; + down_write(&inode->i_hole_sem); while ( count > 0) { /* This is the main loop in which we running until some error occures or until we write all of the data. */ @@ -1351,6 +1352,7 @@ ssize_t reiserfs_file_write( struct file count -= write_bytes; balance_dirty_pages_ratelimited(inode->i_mapping); } + up_write(&inode->i_hole_sem); /* this is only true on error */ if (th.t_trans_id) { Index: linux.mm/fs/reiserfs/inode.c =================================================================== --- linux.mm.orig/fs/reiserfs/inode.c 2004-11-01 09:34:05.000000000 -0500 +++ linux.mm/fs/reiserfs/inode.c 2004-11-01 09:34:40.000000000 -0500 @@ -17,6 +17,7 @@ #include #include #include +#include extern int reiserfs_default_io_size; /* default io size devuned in super.c */ @@ -2741,9 +2742,26 @@ static ssize_t reiserfs_direct_IO(int rw { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - - return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, reiserfs_get_blocks_direct_io, NULL); + ssize_t ret; + loff_t count = iov_length(iov, nr_segs); + int locking = DIO_HOLE_LOCKING; + + /* + * reads always use DIO_HOLE_LOCKING. writes use DIO_HOLE_LOCKING + * unless they are extending the file + */ + if (rw == WRITE) { + if (offset + count > i_size_read(inode)) + locking = DIO_LOCKING; + else + up(&inode->i_sem); + } + ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, + reiserfs_get_blocks_direct_io, NULL, locking); + if (rw == WRITE && locking == DIO_HOLE_LOCKING) + down(&inode->i_sem); + return ret; } int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { Index: linux.mm/include/linux/fs.h =================================================================== --- linux.mm.orig/include/linux/fs.h 2004-11-01 09:34:26.000000000 -0500 +++ linux.mm/include/linux/fs.h 2004-11-01 09:34:40.000000000 -0500 @@ -455,6 +455,7 @@ struct inode { spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ struct semaphore i_sem; struct rw_semaphore i_alloc_sem; + struct rw_semaphore i_hole_sem; struct inode_operations *i_op; struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct super_block *i_sb; @@ -1575,6 +1576,7 @@ enum { DIO_LOCKING = 1, /* need locking between buffered and direct access */ DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ + DIO_HOLE_LOCKING, /* use i_hole_sem to lock against hole fillers */ }; static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, Index: linux.mm/mm/filemap.c =================================================================== --- linux.mm.orig/mm/filemap.c 2004-11-01 09:34:26.000000000 -0500 +++ linux.mm/mm/filemap.c 2004-11-01 09:38:29.000000000 -0500 @@ -78,6 +78,7 @@ * * ->i_sem * ->i_alloc_sem (various) + * ->i_hole_sem (various) * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) @@ -1927,6 +1928,7 @@ generic_file_buffered_write(struct kiocb pagevec_init(&lru_pvec, 0); + down_write(&inode->i_hole_sem); buf = iov->iov_base + written; /* handle partial DIO write */ do { unsigned long index; @@ -2000,6 +2002,7 @@ generic_file_buffered_write(struct kiocb cond_resched(); } while (count); *ppos = pos; + up_write(&inode->i_hole_sem); if (cached_page) page_cache_release(cached_page);