linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH RFC] O_DIRECT reads and writes without i_sem
@ 2004-11-01 15:32 Chris Mason
  2004-11-01 16:08 ` Christoph Hellwig
  0 siblings, 1 reply; 4+ messages in thread
From: Chris Mason @ 2004-11-01 15:32 UTC (permalink / raw)
  To: linux-fsdevel, akpm

Hello everyone,

Right now, O_DIRECT reads and writes on regular files have to take i_sem
while reading file metadata in order to make sure we don't race with
hole filling.

This patch tries to get around that by avoiding i_sem when we are doing
an O_DIRECT read or write inside of i_size.  Yet another rw semaphore is
added to struct inode to protect against holes being filled during the
O_DIRECT.  direct-io.c gets another special case to be aware of the
locking.

This has only been lightly tested, I'm posting here for general comments
before I go much further.  I'm rounding up some hardware with enough
disks to benchmark it properly.  

Only ext2 and reiserfs are modified to drop i_sem during O_DIRECT. ext3
needs some care around the orphan lists, and I didn't want to get into
that until the rest of the patch was working.

-chris

[patch against 2.6.10-rc1-mm2]

Index: linux.mm/fs/direct-io.c
===================================================================
--- linux.mm.orig/fs/direct-io.c	2004-11-01 09:34:24.000000000 -0500
+++ linux.mm/fs/direct-io.c	2004-11-01 09:34:40.000000000 -0500
@@ -915,7 +915,7 @@ out:
 }
 
 /*
- * Releases both i_sem and i_alloc_sem
+ * Releases i_sem, i_alloc_sem and i_hole_sem
  */
 static ssize_t
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
@@ -1065,6 +1065,8 @@ direct_io_worker(int rw, struct kiocb *i
 	 */
 	if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
 		up(&dio->inode->i_sem);
+	if ((dio->lock_type == DIO_HOLE_LOCKING))
+		up_read(&dio->inode->i_hole_sem);
 
 	/*
 	 * OK, all BIOs are submitted, so we can decrement bio_count to truly
@@ -1199,6 +1201,9 @@ __blockdev_direct_IO(int rw, struct kioc
 	 * For regular files using DIO_OWN_LOCKING,
 	 *	both readers and writers need to grab i_alloc_sem
 	 *	neither readers nor writers hold i_sem on entry (nor exit)
+	 * For regular files using DIO_OWN_LOCKING,
+	 *	same as DIO_OWN_LOCKING, but both readers/writers read lock
+	 *	i_hole_sem until after get_blocks has finished.
 	 */
 	dio->lock_type = dio_lock_type;
 	if (dio_lock_type != DIO_NO_LOCKING) {
@@ -1214,11 +1219,14 @@ __blockdev_direct_IO(int rw, struct kioc
 				goto out;
 			}
 			down_read(&inode->i_alloc_sem);
-			if (dio_lock_type == DIO_OWN_LOCKING)
+			if (dio_lock_type == DIO_OWN_LOCKING ||
+			    dio_lock_type == DIO_HOLE_LOCKING)
 				up(&inode->i_sem);
 		} else {
 			down_read(&inode->i_alloc_sem);
 		}
+		if (dio_lock_type == DIO_HOLE_LOCKING)
+			down_read(&inode->i_hole_sem);
 	}
 	/*
 	 * For file extending writes updating i_size before data
Index: linux.mm/fs/ext2/inode.c
===================================================================
--- linux.mm.orig/fs/ext2/inode.c	2004-11-01 09:34:24.000000000 -0500
+++ linux.mm/fs/ext2/inode.c	2004-11-01 09:34:40.000000000 -0500
@@ -31,6 +31,7 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
+#include <linux/uio.h>
 #include "ext2.h"
 #include "acl.h"
 
@@ -649,9 +650,26 @@ ext2_direct_IO(int rw, struct kiocb *ioc
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				offset, nr_segs, ext2_get_blocks, NULL);
+	ssize_t ret;
+	loff_t count = iov_length(iov, nr_segs);
+	int locking = DIO_HOLE_LOCKING;
+
+	/* 
+	 * reads always use DIO_HOLE_LOCKING.  writes use DIO_HOLE_LOCKING
+	 * unless they are extending the file
+	 */
+	if (rw == WRITE) {
+		if (offset + count > i_size_read(inode))
+			locking = DIO_LOCKING;
+		else
+			up(&inode->i_sem);
+	}
+	ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+				   offset, nr_segs, ext2_get_blocks, NULL, 
+				   locking);
+	if (rw == WRITE && locking == DIO_HOLE_LOCKING)
+		down(&inode->i_sem);
+	return ret;
 }
 
 static int
Index: linux.mm/fs/inode.c
===================================================================
--- linux.mm.orig/fs/inode.c	2004-11-01 09:34:24.000000000 -0500
+++ linux.mm/fs/inode.c	2004-11-01 09:34:40.000000000 -0500
@@ -197,6 +197,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
 	init_rwsem(&inode->i_alloc_sem);
+	init_rwsem(&inode->i_hole_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	rwlock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
Index: linux.mm/fs/reiserfs/file.c
===================================================================
--- linux.mm.orig/fs/reiserfs/file.c	2004-11-01 09:34:05.000000000 -0500
+++ linux.mm/fs/reiserfs/file.c	2004-11-01 09:34:40.000000000 -0500
@@ -1249,6 +1249,7 @@ ssize_t reiserfs_file_write( struct file
     if (res)
 	goto out;
 
+    down_write(&inode->i_hole_sem);
     while ( count > 0) {
 	/* This is the main loop in which we running until some error occures
 	   or until we write all of the data. */
@@ -1351,6 +1352,7 @@ ssize_t reiserfs_file_write( struct file
 	count -= write_bytes;
 	balance_dirty_pages_ratelimited(inode->i_mapping);
     }
+    up_write(&inode->i_hole_sem);
 
     /* this is only true on error */
     if (th.t_trans_id) {
Index: linux.mm/fs/reiserfs/inode.c
===================================================================
--- linux.mm.orig/fs/reiserfs/inode.c	2004-11-01 09:34:05.000000000 -0500
+++ linux.mm/fs/reiserfs/inode.c	2004-11-01 09:34:40.000000000 -0500
@@ -17,6 +17,7 @@
 #include <linux/mpage.h>
 #include <linux/writeback.h>
 #include <linux/quotaops.h>
+#include <linux/uio.h>
 
 extern int reiserfs_default_io_size; /* default io size devuned in super.c */
 
@@ -2741,9 +2742,26 @@ static ssize_t reiserfs_direct_IO(int rw
 {
     struct file *file = iocb->ki_filp;
     struct inode *inode = file->f_mapping->host;
-
-    return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-			offset, nr_segs, reiserfs_get_blocks_direct_io, NULL);
+    ssize_t ret;
+    loff_t count = iov_length(iov, nr_segs);
+    int locking = DIO_HOLE_LOCKING;
+
+    /* 
+     * reads always use DIO_HOLE_LOCKING.  writes use DIO_HOLE_LOCKING
+     * unless they are extending the file
+     */
+    if (rw == WRITE) {
+    	if (offset + count > i_size_read(inode))
+	    locking = DIO_LOCKING;
+        else
+	    up(&inode->i_sem);
+    }
+    ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, 
+    			       iov, offset, nr_segs, 
+			       reiserfs_get_blocks_direct_io, NULL, locking);
+    if (rw == WRITE && locking == DIO_HOLE_LOCKING)
+        down(&inode->i_sem);
+    return ret;
 }
 
 int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
Index: linux.mm/include/linux/fs.h
===================================================================
--- linux.mm.orig/include/linux/fs.h	2004-11-01 09:34:26.000000000 -0500
+++ linux.mm/include/linux/fs.h	2004-11-01 09:34:40.000000000 -0500
@@ -455,6 +455,7 @@ struct inode {
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	struct semaphore	i_sem;
 	struct rw_semaphore	i_alloc_sem;
+	struct rw_semaphore	i_hole_sem;
 	struct inode_operations	*i_op;
 	struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
 	struct super_block	*i_sb;
@@ -1575,6 +1576,7 @@ enum {
 	DIO_LOCKING = 1, /* need locking between buffered and direct access */
 	DIO_NO_LOCKING,  /* bdev; no locking at all between buffered/direct */
 	DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
+	DIO_HOLE_LOCKING, /* use i_hole_sem to lock against hole fillers */
 };
 
 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
Index: linux.mm/mm/filemap.c
===================================================================
--- linux.mm.orig/mm/filemap.c	2004-11-01 09:34:26.000000000 -0500
+++ linux.mm/mm/filemap.c	2004-11-01 09:38:29.000000000 -0500
@@ -78,6 +78,7 @@
  *
  *  ->i_sem
  *    ->i_alloc_sem             (various)
+ *	->i_hole_sem		(various)
  *
  *  ->inode_lock
  *    ->sb_lock			(fs/fs-writeback.c)
@@ -1927,6 +1928,7 @@ generic_file_buffered_write(struct kiocb
 
 	pagevec_init(&lru_pvec, 0);
 
+	down_write(&inode->i_hole_sem);
 	buf = iov->iov_base + written;	/* handle partial DIO write */
 	do {
 		unsigned long index;
@@ -2000,6 +2002,7 @@ generic_file_buffered_write(struct kiocb
 		cond_resched();
 	} while (count);
 	*ppos = pos;
+	up_write(&inode->i_hole_sem);
 
 	if (cached_page)
 		page_cache_release(cached_page);



^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2004-11-05 18:07 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-11-01 15:32 [PATCH RFC] O_DIRECT reads and writes without i_sem Chris Mason
2004-11-01 16:08 ` Christoph Hellwig
2004-11-01 16:56   ` Chris Mason
2004-11-05 18:06   ` Chris Mason

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).