From: Chris Mason <mason@suse.com>
To: linux-fsdevel@vger.kernel.org, akpm@osdl.org
Subject: [PATCH RFC] O_DIRECT reads and writes without i_sem
Date: Mon, 01 Nov 2004 10:32:07 -0500 [thread overview]
Message-ID: <1099323127.23475.80.camel@watt.suse.com> (raw)
Hello everyone,
Right now, O_DIRECT reads and writes on regular files have to take i_sem
while reading file metadata in order to make sure we don't race with
hole filling.
This patch tries to get around that by avoiding i_sem when we are doing
an O_DIRECT read or write inside of i_size. Yet another rw semaphore is
added to struct inode to protect against holes being filled during the
O_DIRECT. direct-io.c gets another special case to be aware of the
locking.
This has only been lightly tested, I'm posting here for general comments
before I go much further. I'm rounding up some hardware with enough
disks to benchmark it properly.
Only ext2 and reiserfs are modified to drop i_sem during O_DIRECT. ext3
needs some care around the orphan lists, and I didn't want to get into
that until the rest of the patch was working.
-chris
[patch against 2.6.10-rc1-mm2]
Index: linux.mm/fs/direct-io.c
===================================================================
--- linux.mm.orig/fs/direct-io.c 2004-11-01 09:34:24.000000000 -0500
+++ linux.mm/fs/direct-io.c 2004-11-01 09:34:40.000000000 -0500
@@ -915,7 +915,7 @@ out:
}
/*
- * Releases both i_sem and i_alloc_sem
+ * Releases i_sem, i_alloc_sem and i_hole_sem
*/
static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1065,6 +1065,8 @@ direct_io_worker(int rw, struct kiocb *i
*/
if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
up(&dio->inode->i_sem);
+ if ((dio->lock_type == DIO_HOLE_LOCKING))
+ up_read(&dio->inode->i_hole_sem);
/*
* OK, all BIOs are submitted, so we can decrement bio_count to truly
@@ -1199,6 +1201,9 @@ __blockdev_direct_IO(int rw, struct kioc
* For regular files using DIO_OWN_LOCKING,
* both readers and writers need to grab i_alloc_sem
* neither readers nor writers hold i_sem on entry (nor exit)
+ * For regular files using DIO_OWN_LOCKING,
+ * same as DIO_OWN_LOCKING, but both readers/writers read lock
+ * i_hole_sem until after get_blocks has finished.
*/
dio->lock_type = dio_lock_type;
if (dio_lock_type != DIO_NO_LOCKING) {
@@ -1214,11 +1219,14 @@ __blockdev_direct_IO(int rw, struct kioc
goto out;
}
down_read(&inode->i_alloc_sem);
- if (dio_lock_type == DIO_OWN_LOCKING)
+ if (dio_lock_type == DIO_OWN_LOCKING ||
+ dio_lock_type == DIO_HOLE_LOCKING)
up(&inode->i_sem);
} else {
down_read(&inode->i_alloc_sem);
}
+ if (dio_lock_type == DIO_HOLE_LOCKING)
+ down_read(&inode->i_hole_sem);
}
/*
* For file extending writes updating i_size before data
Index: linux.mm/fs/ext2/inode.c
===================================================================
--- linux.mm.orig/fs/ext2/inode.c 2004-11-01 09:34:24.000000000 -0500
+++ linux.mm/fs/ext2/inode.c 2004-11-01 09:34:40.000000000 -0500
@@ -31,6 +31,7 @@
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
+#include <linux/uio.h>
#include "ext2.h"
#include "acl.h"
@@ -649,9 +650,26 @@ ext2_direct_IO(int rw, struct kiocb *ioc
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
-
- return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs, ext2_get_blocks, NULL);
+ ssize_t ret;
+ loff_t count = iov_length(iov, nr_segs);
+ int locking = DIO_HOLE_LOCKING;
+
+ /*
+ * reads always use DIO_HOLE_LOCKING. writes use DIO_HOLE_LOCKING
+ * unless they are extending the file
+ */
+ if (rw == WRITE) {
+ if (offset + count > i_size_read(inode))
+ locking = DIO_LOCKING;
+ else
+ up(&inode->i_sem);
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+ offset, nr_segs, ext2_get_blocks, NULL,
+ locking);
+ if (rw == WRITE && locking == DIO_HOLE_LOCKING)
+ down(&inode->i_sem);
+ return ret;
}
static int
Index: linux.mm/fs/inode.c
===================================================================
--- linux.mm.orig/fs/inode.c 2004-11-01 09:34:24.000000000 -0500
+++ linux.mm/fs/inode.c 2004-11-01 09:34:40.000000000 -0500
@@ -197,6 +197,7 @@ void inode_init_once(struct inode *inode
INIT_LIST_HEAD(&inode->i_devices);
sema_init(&inode->i_sem, 1);
init_rwsem(&inode->i_alloc_sem);
+ init_rwsem(&inode->i_hole_sem);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
rwlock_init(&inode->i_data.tree_lock);
spin_lock_init(&inode->i_data.i_mmap_lock);
Index: linux.mm/fs/reiserfs/file.c
===================================================================
--- linux.mm.orig/fs/reiserfs/file.c 2004-11-01 09:34:05.000000000 -0500
+++ linux.mm/fs/reiserfs/file.c 2004-11-01 09:34:40.000000000 -0500
@@ -1249,6 +1249,7 @@ ssize_t reiserfs_file_write( struct file
if (res)
goto out;
+ down_write(&inode->i_hole_sem);
while ( count > 0) {
/* This is the main loop in which we running until some error occures
or until we write all of the data. */
@@ -1351,6 +1352,7 @@ ssize_t reiserfs_file_write( struct file
count -= write_bytes;
balance_dirty_pages_ratelimited(inode->i_mapping);
}
+ up_write(&inode->i_hole_sem);
/* this is only true on error */
if (th.t_trans_id) {
Index: linux.mm/fs/reiserfs/inode.c
===================================================================
--- linux.mm.orig/fs/reiserfs/inode.c 2004-11-01 09:34:05.000000000 -0500
+++ linux.mm/fs/reiserfs/inode.c 2004-11-01 09:34:40.000000000 -0500
@@ -17,6 +17,7 @@
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/quotaops.h>
+#include <linux/uio.h>
extern int reiserfs_default_io_size; /* default io size devuned in super.c */
@@ -2741,9 +2742,26 @@ static ssize_t reiserfs_direct_IO(int rw
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
-
- return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs, reiserfs_get_blocks_direct_io, NULL);
+ ssize_t ret;
+ loff_t count = iov_length(iov, nr_segs);
+ int locking = DIO_HOLE_LOCKING;
+
+ /*
+ * reads always use DIO_HOLE_LOCKING. writes use DIO_HOLE_LOCKING
+ * unless they are extending the file
+ */
+ if (rw == WRITE) {
+ if (offset + count > i_size_read(inode))
+ locking = DIO_LOCKING;
+ else
+ up(&inode->i_sem);
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+ iov, offset, nr_segs,
+ reiserfs_get_blocks_direct_io, NULL, locking);
+ if (rw == WRITE && locking == DIO_HOLE_LOCKING)
+ down(&inode->i_sem);
+ return ret;
}
int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
Index: linux.mm/include/linux/fs.h
===================================================================
--- linux.mm.orig/include/linux/fs.h 2004-11-01 09:34:26.000000000 -0500
+++ linux.mm/include/linux/fs.h 2004-11-01 09:34:40.000000000 -0500
@@ -455,6 +455,7 @@ struct inode {
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
struct semaphore i_sem;
struct rw_semaphore i_alloc_sem;
+ struct rw_semaphore i_hole_sem;
struct inode_operations *i_op;
struct file_operations *i_fop; /* former ->i_op->default_file_ops */
struct super_block *i_sb;
@@ -1575,6 +1576,7 @@ enum {
DIO_LOCKING = 1, /* need locking between buffered and direct access */
DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
+ DIO_HOLE_LOCKING, /* use i_hole_sem to lock against hole fillers */
};
static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
Index: linux.mm/mm/filemap.c
===================================================================
--- linux.mm.orig/mm/filemap.c 2004-11-01 09:34:26.000000000 -0500
+++ linux.mm/mm/filemap.c 2004-11-01 09:38:29.000000000 -0500
@@ -78,6 +78,7 @@
*
* ->i_sem
* ->i_alloc_sem (various)
+ * ->i_hole_sem (various)
*
* ->inode_lock
* ->sb_lock (fs/fs-writeback.c)
@@ -1927,6 +1928,7 @@ generic_file_buffered_write(struct kiocb
pagevec_init(&lru_pvec, 0);
+ down_write(&inode->i_hole_sem);
buf = iov->iov_base + written; /* handle partial DIO write */
do {
unsigned long index;
@@ -2000,6 +2002,7 @@ generic_file_buffered_write(struct kiocb
cond_resched();
} while (count);
*ppos = pos;
+ up_write(&inode->i_hole_sem);
if (cached_page)
page_cache_release(cached_page);
next reply other threads:[~2004-11-01 15:34 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2004-11-01 15:32 Chris Mason [this message]
2004-11-01 16:08 ` [PATCH RFC] O_DIRECT reads and writes without i_sem Christoph Hellwig
2004-11-01 16:56 ` Chris Mason
2004-11-05 18:06 ` Chris Mason
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1099323127.23475.80.camel@watt.suse.com \
--to=mason@suse.com \
--cc=akpm@osdl.org \
--cc=linux-fsdevel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.