All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs
@ 2002-07-28  7:33 Andrew Morton
  2002-07-28 11:06 ` Christoph Hellwig
  2002-07-29  0:04 ` Linus Torvalds
  0 siblings, 2 replies; 8+ messages in thread
From: Andrew Morton @ 2002-07-28  7:33 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: lkml



We're moving in the direction of deprecating the raw driver and
recommending that applications use O_DIRECT reads and writes against
blockdevs.

I don't know how acceptable this will be - there may be
operational/admin reasons for preferring the raw driver, and there are
certainly application porting issues.  So we should continue to support
the raw driver well in 2.6.

One weakness that writes to blockdevs have, compared with the raw driver,
is that they are serialised under i_sem.  There is no need for this.

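This patch changes O_DIRECT writes to blockdevs so that they no longer
run under i_sem: the semaphore is released just before the direct I/O is
issued, and the write then returns without touching it again.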



 filemap.c |   71 ++++++++++++++++++++++++++++++++++++++------------------------
 1 files changed, 44 insertions(+), 27 deletions(-)

--- 2.5.29/mm/filemap.c~o_direct-i_sem	Sat Jul 27 23:39:13 2002
+++ 2.5.29-akpm/mm/filemap.c	Sat Jul 27 23:48:58 2002
@@ -1934,6 +1934,7 @@ generic_file_write(struct file *file, co
 	struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode 	*inode = mapping->host;
 	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+	int		isblk = S_ISBLK(inode->i_mode);
 	long		status = 0;
 	loff_t		pos;
 	struct page	*page;
@@ -1953,19 +1954,19 @@ generic_file_write(struct file *file, co
 	pos = *ppos;
 	if (unlikely(pos < 0)) {
 		err = -EINVAL;
-		goto out;
+		goto out_sem;
 	}
 
 	if (unlikely(file->f_error)) {
 		err = file->f_error;
 		file->f_error = 0;
-		goto out;
+		goto out_sem;
 	}
 
 	written = 0;
 
 	/* FIXME: this is for backwards compatibility with 2.4 */
-	if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
+	if (!isblk && file->f_flags & O_APPEND)
 		pos = inode->i_size;
 
 	/*
@@ -1975,7 +1976,7 @@ generic_file_write(struct file *file, co
 		if (pos >= limit) {
 			send_sig(SIGXFSZ, current, 0);
 			err = -EFBIG;
-			goto out;
+			goto out_sem;
 		}
 		if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) {
 			/* send_sig(SIGXFSZ, current, 0); */
@@ -1991,7 +1992,7 @@ generic_file_write(struct file *file, co
 		if (pos >= MAX_NON_LFS) {
 			send_sig(SIGXFSZ, current, 0);
 			err = -EFBIG;
-			goto out;
+			goto out_sem;
 		}
 		if (count > MAX_NON_LFS - (u32)pos) {
 			/* send_sig(SIGXFSZ, current, 0); */
@@ -2006,12 +2007,12 @@ generic_file_write(struct file *file, co
 	 * exceeded without writing data we send a signal and return EFBIG.
 	 * Linus frestrict idea will clean these up nicely..
 	 */
-	if (likely(!S_ISBLK(inode->i_mode))) {
+	if (likely(!isblk)) {
 		if (unlikely(pos >= inode->i_sb->s_maxbytes)) {
 			if (count || pos > inode->i_sb->s_maxbytes) {
 				send_sig(SIGXFSZ, current, 0);
 				err = -EFBIG;
-				goto out;
+				goto out_sem;
 			}
 			/* zero-length writes at ->s_maxbytes are OK */
 		}
@@ -2021,12 +2022,12 @@ generic_file_write(struct file *file, co
 	} else {
 		if (bdev_read_only(inode->i_bdev)) {
 			err = -EPERM;
-			goto out;
+			goto out_sem;
 		}
 		if (pos >= inode->i_size) {
 			if (count || pos > inode->i_size) {
 				err = -ENOSPC;
-				goto out;
+				goto out_sem;
 			}
 		}
 
@@ -2036,7 +2037,7 @@ generic_file_write(struct file *file, co
 
 	err = 0;
 	if (count == 0)
-		goto out;
+		goto out_sem;
 
 	remove_suid(file->f_dentry);
 	time_now = CURRENT_TIME;
@@ -2047,25 +2048,40 @@ generic_file_write(struct file *file, co
 	}
 
 	if (unlikely(file->f_flags & O_DIRECT)) {
-		written = generic_file_direct_IO(WRITE, inode,
+		if (isblk) {
+			up(&inode->i_sem);
+			written = generic_file_direct_IO(WRITE, inode,
 						(char *)buf, pos, count);
-		if (written > 0) {
-			loff_t end = pos + written;
-			if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
-				inode->i_size = end;
-				mark_inode_dirty(inode);
+			if (written > 0) {
+				if (mapping->nrpages)
+					invalidate_inode_pages2(mapping);
+				*ppos = pos + written;
 			}
-			*ppos = end;
-			if (mapping->nrpages)
-				invalidate_inode_pages2(mapping);
+			err = written;
+			goto out;
+		} else {
+			written = generic_file_direct_IO(WRITE, inode,
+						(char *)buf, pos, count);
+			if (written > 0) {
+				loff_t end = pos + written;
+				if (end > inode->i_size) {
+					inode->i_size = end;
+					mark_inode_dirty(inode);
+				}
+				*ppos = end;
+				/*
+				 * Sync the fs metadata but not the minor inode
+				 * changes and of course not the data as we did
+				 * direct DMA for the IO.
+				 */
+				if (file->f_flags & O_SYNC)
+					status = generic_osync_inode(inode,
+								OSYNC_METADATA);
+				if (mapping->nrpages)
+					invalidate_inode_pages2(mapping);
+			}
+			goto out_status;
 		}
-		/*
-		 * Sync the fs metadata but not the minor inode changes and
-		 * of course not the data as we did direct DMA for the IO.
-		 */
-		if (written >= 0 && file->f_flags & O_SYNC)
-			status = generic_osync_inode(inode, OSYNC_METADATA);
-		goto out_status;
 	}
 
 	do {
@@ -2152,7 +2168,8 @@ generic_file_write(struct file *file, co
 	
 out_status:	
 	err = written ? written : status;
-out:
+out_sem:
 	up(&inode->i_sem);
+out:
 	return err;
 }

.

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2002-07-29  0:43 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz; follow: Atom feed)
-- links below jump to the message on this page --
2002-07-28  7:33 [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs Andrew Morton
2002-07-28 11:06 ` Christoph Hellwig
2002-07-28 17:55   ` Andrew Morton
2002-07-28 18:05     ` Christoph Hellwig
2002-07-28 18:41       ` Christoph Hellwig
2002-07-29  0:04 ` Linus Torvalds
2002-07-29  0:39   ` Andrew Morton
2002-07-29  0:47     ` Linus Torvalds

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.