* [PATCH RFC] O_DIRECT reads and writes without i_sem
@ 2004-11-01 15:32 Chris Mason
2004-11-01 16:08 ` Christoph Hellwig
0 siblings, 1 reply; 4+ messages in thread
From: Chris Mason @ 2004-11-01 15:32 UTC (permalink / raw)
To: linux-fsdevel, akpm
Hello everyone,
Right now, O_DIRECT reads and writes on regular files have to take i_sem
while reading file metadata in order to make sure we don't race with
hole filling.
This patch tries to get around that by avoiding i_sem when we are doing
an O_DIRECT read or write inside of i_size. Yet another rw semaphore is
added to struct inode to protect against holes being filled during the
O_DIRECT. direct-io.c gets another special case to be aware of the
locking.
This has only been lightly tested, I'm posting here for general comments
before I go much further. I'm rounding up some hardware with enough
disks to benchmark it properly.
Only ext2 and reiserfs are modified to drop i_sem during O_DIRECT. ext3
needs some care around the orphan lists, and I didn't want to get into
that until the rest of the patch was working.
-chris
[patch against 2.6.10-rc1-mm2]
Index: linux.mm/fs/direct-io.c
===================================================================
--- linux.mm.orig/fs/direct-io.c 2004-11-01 09:34:24.000000000 -0500
+++ linux.mm/fs/direct-io.c 2004-11-01 09:34:40.000000000 -0500
@@ -915,7 +915,7 @@ out:
}
/*
- * Releases both i_sem and i_alloc_sem
+ * Releases i_sem, i_alloc_sem and i_hole_sem
*/
static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1065,6 +1065,8 @@ direct_io_worker(int rw, struct kiocb *i
*/
if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
up(&dio->inode->i_sem);
+ if ((dio->lock_type == DIO_HOLE_LOCKING))
+ up_read(&dio->inode->i_hole_sem);
/*
* OK, all BIOs are submitted, so we can decrement bio_count to truly
@@ -1199,6 +1201,9 @@ __blockdev_direct_IO(int rw, struct kioc
* For regular files using DIO_OWN_LOCKING,
* both readers and writers need to grab i_alloc_sem
* neither readers nor writers hold i_sem on entry (nor exit)
+ * For regular files using DIO_OWN_LOCKING,
+ * same as DIO_OWN_LOCKING, but both readers/writers read lock
+ * i_hole_sem until after get_blocks has finished.
*/
dio->lock_type = dio_lock_type;
if (dio_lock_type != DIO_NO_LOCKING) {
@@ -1214,11 +1219,14 @@ __blockdev_direct_IO(int rw, struct kioc
goto out;
}
down_read(&inode->i_alloc_sem);
- if (dio_lock_type == DIO_OWN_LOCKING)
+ if (dio_lock_type == DIO_OWN_LOCKING ||
+ dio_lock_type == DIO_HOLE_LOCKING)
up(&inode->i_sem);
} else {
down_read(&inode->i_alloc_sem);
}
+ if (dio_lock_type == DIO_HOLE_LOCKING)
+ down_read(&inode->i_hole_sem);
}
/*
* For file extending writes updating i_size before data
Index: linux.mm/fs/ext2/inode.c
===================================================================
--- linux.mm.orig/fs/ext2/inode.c 2004-11-01 09:34:24.000000000 -0500
+++ linux.mm/fs/ext2/inode.c 2004-11-01 09:34:40.000000000 -0500
@@ -31,6 +31,7 @@
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
+#include <linux/uio.h>
#include "ext2.h"
#include "acl.h"
@@ -649,9 +650,26 @@ ext2_direct_IO(int rw, struct kiocb *ioc
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
-
- return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs, ext2_get_blocks, NULL);
+ ssize_t ret;
+ loff_t count = iov_length(iov, nr_segs);
+ int locking = DIO_HOLE_LOCKING;
+
+ /*
+ * reads always use DIO_HOLE_LOCKING. writes use DIO_HOLE_LOCKING
+ * unless they are extending the file
+ */
+ if (rw == WRITE) {
+ if (offset + count > i_size_read(inode))
+ locking = DIO_LOCKING;
+ else
+ up(&inode->i_sem);
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+ offset, nr_segs, ext2_get_blocks, NULL,
+ locking);
+ if (rw == WRITE && locking == DIO_HOLE_LOCKING)
+ down(&inode->i_sem);
+ return ret;
}
static int
Index: linux.mm/fs/inode.c
===================================================================
--- linux.mm.orig/fs/inode.c 2004-11-01 09:34:24.000000000 -0500
+++ linux.mm/fs/inode.c 2004-11-01 09:34:40.000000000 -0500
@@ -197,6 +197,7 @@ void inode_init_once(struct inode *inode
INIT_LIST_HEAD(&inode->i_devices);
sema_init(&inode->i_sem, 1);
init_rwsem(&inode->i_alloc_sem);
+ init_rwsem(&inode->i_hole_sem);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
rwlock_init(&inode->i_data.tree_lock);
spin_lock_init(&inode->i_data.i_mmap_lock);
Index: linux.mm/fs/reiserfs/file.c
===================================================================
--- linux.mm.orig/fs/reiserfs/file.c 2004-11-01 09:34:05.000000000 -0500
+++ linux.mm/fs/reiserfs/file.c 2004-11-01 09:34:40.000000000 -0500
@@ -1249,6 +1249,7 @@ ssize_t reiserfs_file_write( struct file
if (res)
goto out;
+ down_write(&inode->i_hole_sem);
while ( count > 0) {
/* This is the main loop in which we running until some error occures
or until we write all of the data. */
@@ -1351,6 +1352,7 @@ ssize_t reiserfs_file_write( struct file
count -= write_bytes;
balance_dirty_pages_ratelimited(inode->i_mapping);
}
+ up_write(&inode->i_hole_sem);
/* this is only true on error */
if (th.t_trans_id) {
Index: linux.mm/fs/reiserfs/inode.c
===================================================================
--- linux.mm.orig/fs/reiserfs/inode.c 2004-11-01 09:34:05.000000000 -0500
+++ linux.mm/fs/reiserfs/inode.c 2004-11-01 09:34:40.000000000 -0500
@@ -17,6 +17,7 @@
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/quotaops.h>
+#include <linux/uio.h>
extern int reiserfs_default_io_size; /* default io size devuned in super.c */
@@ -2741,9 +2742,26 @@ static ssize_t reiserfs_direct_IO(int rw
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
-
- return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs, reiserfs_get_blocks_direct_io, NULL);
+ ssize_t ret;
+ loff_t count = iov_length(iov, nr_segs);
+ int locking = DIO_HOLE_LOCKING;
+
+ /*
+ * reads always use DIO_HOLE_LOCKING. writes use DIO_HOLE_LOCKING
+ * unless they are extending the file
+ */
+ if (rw == WRITE) {
+ if (offset + count > i_size_read(inode))
+ locking = DIO_LOCKING;
+ else
+ up(&inode->i_sem);
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+ iov, offset, nr_segs,
+ reiserfs_get_blocks_direct_io, NULL, locking);
+ if (rw == WRITE && locking == DIO_HOLE_LOCKING)
+ down(&inode->i_sem);
+ return ret;
}
int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
Index: linux.mm/include/linux/fs.h
===================================================================
--- linux.mm.orig/include/linux/fs.h 2004-11-01 09:34:26.000000000 -0500
+++ linux.mm/include/linux/fs.h 2004-11-01 09:34:40.000000000 -0500
@@ -455,6 +455,7 @@ struct inode {
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
struct semaphore i_sem;
struct rw_semaphore i_alloc_sem;
+ struct rw_semaphore i_hole_sem;
struct inode_operations *i_op;
struct file_operations *i_fop; /* former ->i_op->default_file_ops */
struct super_block *i_sb;
@@ -1575,6 +1576,7 @@ enum {
DIO_LOCKING = 1, /* need locking between buffered and direct access */
DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
+ DIO_HOLE_LOCKING, /* use i_hole_sem to lock against hole fillers */
};
static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
Index: linux.mm/mm/filemap.c
===================================================================
--- linux.mm.orig/mm/filemap.c 2004-11-01 09:34:26.000000000 -0500
+++ linux.mm/mm/filemap.c 2004-11-01 09:38:29.000000000 -0500
@@ -78,6 +78,7 @@
*
* ->i_sem
* ->i_alloc_sem (various)
+ * ->i_hole_sem (various)
*
* ->inode_lock
* ->sb_lock (fs/fs-writeback.c)
@@ -1927,6 +1928,7 @@ generic_file_buffered_write(struct kiocb
pagevec_init(&lru_pvec, 0);
+ down_write(&inode->i_hole_sem);
buf = iov->iov_base + written; /* handle partial DIO write */
do {
unsigned long index;
@@ -2000,6 +2002,7 @@ generic_file_buffered_write(struct kiocb
cond_resched();
} while (count);
*ppos = pos;
+ up_write(&inode->i_hole_sem);
if (cached_page)
page_cache_release(cached_page);
^ permalink raw reply [flat|nested] 4+ messages in thread* Re: [PATCH RFC] O_DIRECT reads and writes without i_sem
2004-11-01 15:32 [PATCH RFC] O_DIRECT reads and writes without i_sem Chris Mason
@ 2004-11-01 16:08 ` Christoph Hellwig
2004-11-01 16:56 ` Chris Mason
2004-11-05 18:06 ` Chris Mason
0 siblings, 2 replies; 4+ messages in thread
From: Christoph Hellwig @ 2004-11-01 16:08 UTC (permalink / raw)
To: Chris Mason; +Cc: linux-fsdevel, akpm
On Mon, Nov 01, 2004 at 10:32:07AM -0500, Chris Mason wrote:
> Hello everyone,
>
> Right now, O_DIRECT reads and writes on regular files have to take i_sem
> while reading file metadata in order to make sure we don't race with
> hole filling.
>
> This patch tries to get around that by avoiding i_sem when we are doing
> an O_DIRECT read or write inside of i_size. Yet another rw semaphore is
> added to struct inode to protect against holes being filled during the
> O_DIRECT. direct-io.c gets another special case to be aware of the
> locking.
>
> This has only been lightly tested, I'm posting here for general comments
> before I go much further. I'm rounding up some hardware with enough
> disks to benchmark it properly.
>
> Only ext2 and reiserfs are modified to drop i_sem during O_DIRECT. ext3
> needs some care around the orphan lists, and I didn't want to get into
> that until the rest of the patch was working.
This gets too complicated for it's own sake. What about going down the
XFS route and making i_sem a r/w semaphore that's taken only shared
during read and write I/O, but exclusive while setting up write I/O
outside of i_size? Alternatively just move the I/O locking into the
filesytem.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH RFC] O_DIRECT reads and writes without i_sem
2004-11-01 16:08 ` Christoph Hellwig
@ 2004-11-01 16:56 ` Chris Mason
2004-11-05 18:06 ` Chris Mason
1 sibling, 0 replies; 4+ messages in thread
From: Chris Mason @ 2004-11-01 16:56 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, akpm
On Mon, 2004-11-01 at 16:08 +0000, Christoph Hellwig wrote:
> On Mon, Nov 01, 2004 at 10:32:07AM -0500, Chris Mason wrote:
> > Right now, O_DIRECT reads and writes on regular files have to take i_sem
> > while reading file metadata in order to make sure we don't race with
> > hole filling.
[ ... ]
> This gets too complicated for it's own sake. What about going down the
> XFS route and making i_sem a r/w semaphore that's taken only shared
> during read and write I/O, but exclusive while setting up write I/O
> outside of i_size? Alternatively just move the I/O locking into the
> filesytem.
>
Nod, it is too complex, this is why I posted early ;) The only place in
the FS specific code for the locking I added to fs/direct-io.c would be
in each filesystem get_blocks call. I went for direct-io.c because
that's where all the other locking already was. shrug.
If we do down_read(i_rw_sem) for all cases except growing the file, then
we still have no locking for O_DIRECT while we are filling holes in the
file. The filesystem write function could do this:
down_read(i_rw_sem)
read fs metadata
if (filling hole) {
up_read(i_rw_sem)
down_write(i_rw_sem)
goto retry
}
But it also gets nasty in a hurry. The second alternative is just to
make O_DIRECT lock pages (or stub pages if they aren't in cache) in the
filesystem address space, but this might add significant overhead in
radix tree operations during O_DIRECT.
-chris
^ permalink raw reply [flat|nested] 4+ messages in thread* Re: [PATCH RFC] O_DIRECT reads and writes without i_sem
2004-11-01 16:08 ` Christoph Hellwig
2004-11-01 16:56 ` Chris Mason
@ 2004-11-05 18:06 ` Chris Mason
1 sibling, 0 replies; 4+ messages in thread
From: Chris Mason @ 2004-11-05 18:06 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, akpm
On Mon, 2004-11-01 at 16:08 +0000, Christoph Hellwig wrote:
> On Mon, Nov 01, 2004 at 10:32:07AM -0500, Chris Mason wrote:
> > Hello everyone,
> >
> > Right now, O_DIRECT reads and writes on regular files have to take i_sem
> > while reading file metadata in order to make sure we don't race with
> > hole filling.
> >
> > This patch tries to get around that by avoiding i_sem when we are doing
> > an O_DIRECT read or write inside of i_size. Yet another rw semaphore is
> > added to struct inode to protect against holes being filled during the
> > O_DIRECT. direct-io.c gets another special case to be aware of the
> > locking.
> >
> > This has only been lightly tested, I'm posting here for general comments
> > before I go much further. I'm rounding up some hardware with enough
> > disks to benchmark it properly.
Looks like there's a long line to get to the big test machines here, so
I did some more local performance tests. I've got two scsi drives
striped into a lvm2 target (64k stripe size). FS is reiser. I did
testing on writes because more of the io is done under i_sem for writes,
so it should be easiest to show the benefit there.
The test is a 16k random write test on a 15GB file. 50MB of io is done
by each random writer, and everyone is writing to the same preallocated
file.
1 random writer = .64MB/s
2 random writers (unpatched) = .71MB/s
2 random writers (patched) = 1.02MB/s
In the patched kernel, the two writer procs finished at roughly the same
time (< 1sec difference). In the unpatched kernel, writer #1 finished
in about 1/2 the time as writer #2.
-chris
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2004-11-05 18:07 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-11-01 15:32 [PATCH RFC] O_DIRECT reads and writes without i_sem Chris Mason
2004-11-01 16:08 ` Christoph Hellwig
2004-11-01 16:56 ` Chris Mason
2004-11-05 18:06 ` Chris Mason
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).