All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs
@ 2002-07-28  7:33 Andrew Morton
  2002-07-28 11:06 ` Christoph Hellwig
  2002-07-29  0:04 ` Linus Torvalds
  0 siblings, 2 replies; 8+ messages in thread
From: Andrew Morton @ 2002-07-28  7:33 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: lkml



We're moving in the direction of deprecating the raw driver and
recommending that applications use O_DIRECT reads and writes against
blockdevs.

I don't know how acceptable this will be - there may be
operational/admin reasons for preferring the raw driver, and there are
certainly application porting issues.  So we should continue to support
the raw driver well in 2.6.

One weakness which writes to blockdevs have wrt the raw driver is that
they are serialised under i_sem.  There is no need for this.

This patch changes O_DIRECT writes to blockdevs so that they no longer
run under i_sem.



 filemap.c |   71 ++++++++++++++++++++++++++++++++++++++------------------------
 1 files changed, 44 insertions(+), 27 deletions(-)

--- 2.5.29/mm/filemap.c~o_direct-i_sem	Sat Jul 27 23:39:13 2002
+++ 2.5.29-akpm/mm/filemap.c	Sat Jul 27 23:48:58 2002
@@ -1934,6 +1934,7 @@ generic_file_write(struct file *file, co
 	struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode 	*inode = mapping->host;
 	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+	int		isblk = S_ISBLK(inode->i_mode);
 	long		status = 0;
 	loff_t		pos;
 	struct page	*page;
@@ -1953,19 +1954,19 @@ generic_file_write(struct file *file, co
 	pos = *ppos;
 	if (unlikely(pos < 0)) {
 		err = -EINVAL;
-		goto out;
+		goto out_sem;
 	}
 
 	if (unlikely(file->f_error)) {
 		err = file->f_error;
 		file->f_error = 0;
-		goto out;
+		goto out_sem;
 	}
 
 	written = 0;
 
 	/* FIXME: this is for backwards compatibility with 2.4 */
-	if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
+	if (!isblk && file->f_flags & O_APPEND)
 		pos = inode->i_size;
 
 	/*
@@ -1975,7 +1976,7 @@ generic_file_write(struct file *file, co
 		if (pos >= limit) {
 			send_sig(SIGXFSZ, current, 0);
 			err = -EFBIG;
-			goto out;
+			goto out_sem;
 		}
 		if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) {
 			/* send_sig(SIGXFSZ, current, 0); */
@@ -1991,7 +1992,7 @@ generic_file_write(struct file *file, co
 		if (pos >= MAX_NON_LFS) {
 			send_sig(SIGXFSZ, current, 0);
 			err = -EFBIG;
-			goto out;
+			goto out_sem;
 		}
 		if (count > MAX_NON_LFS - (u32)pos) {
 			/* send_sig(SIGXFSZ, current, 0); */
@@ -2006,12 +2007,12 @@ generic_file_write(struct file *file, co
 	 * exceeded without writing data we send a signal and return EFBIG.
 	 * Linus frestrict idea will clean these up nicely..
 	 */
-	if (likely(!S_ISBLK(inode->i_mode))) {
+	if (likely(!isblk)) {
 		if (unlikely(pos >= inode->i_sb->s_maxbytes)) {
 			if (count || pos > inode->i_sb->s_maxbytes) {
 				send_sig(SIGXFSZ, current, 0);
 				err = -EFBIG;
-				goto out;
+				goto out_sem;
 			}
 			/* zero-length writes at ->s_maxbytes are OK */
 		}
@@ -2021,12 +2022,12 @@ generic_file_write(struct file *file, co
 	} else {
 		if (bdev_read_only(inode->i_bdev)) {
 			err = -EPERM;
-			goto out;
+			goto out_sem;
 		}
 		if (pos >= inode->i_size) {
 			if (count || pos > inode->i_size) {
 				err = -ENOSPC;
-				goto out;
+				goto out_sem;
 			}
 		}
 
@@ -2036,7 +2037,7 @@ generic_file_write(struct file *file, co
 
 	err = 0;
 	if (count == 0)
-		goto out;
+		goto out_sem;
 
 	remove_suid(file->f_dentry);
 	time_now = CURRENT_TIME;
@@ -2047,25 +2048,40 @@ generic_file_write(struct file *file, co
 	}
 
 	if (unlikely(file->f_flags & O_DIRECT)) {
-		written = generic_file_direct_IO(WRITE, inode,
+		if (isblk) {
+			up(&inode->i_sem);
+			written = generic_file_direct_IO(WRITE, inode,
 						(char *)buf, pos, count);
-		if (written > 0) {
-			loff_t end = pos + written;
-			if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
-				inode->i_size = end;
-				mark_inode_dirty(inode);
+			if (written > 0) {
+				if (mapping->nrpages)
+					invalidate_inode_pages2(mapping);
+				*ppos = pos + written;
 			}
-			*ppos = end;
-			if (mapping->nrpages)
-				invalidate_inode_pages2(mapping);
+			err = written;
+			goto out;
+		} else {
+			written = generic_file_direct_IO(WRITE, inode,
+						(char *)buf, pos, count);
+			if (written > 0) {
+				loff_t end = pos + written;
+				if (end > inode->i_size) {
+					inode->i_size = end;
+					mark_inode_dirty(inode);
+				}
+				*ppos = end;
+				/*
+				 * Sync the fs metadata but not the minor inode
+				 * changes and of course not the data as we did
+				 * direct DMA for the IO.
+				 */
+				if (file->f_flags & O_SYNC)
+					status = generic_osync_inode(inode,
+								OSYNC_METADATA);
+				if (mapping->nrpages)
+					invalidate_inode_pages2(mapping);
+			}
+			goto out_status;
 		}
-		/*
-		 * Sync the fs metadata but not the minor inode changes and
-		 * of course not the data as we did direct DMA for the IO.
-		 */
-		if (written >= 0 && file->f_flags & O_SYNC)
-			status = generic_osync_inode(inode, OSYNC_METADATA);
-		goto out_status;
 	}
 
 	do {
@@ -2152,7 +2168,8 @@ generic_file_write(struct file *file, co
 	
 out_status:	
 	err = written ? written : status;
-out:
+out_sem:
 	up(&inode->i_sem);
+out:
 	return err;
 }

.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs
  2002-07-28  7:33 [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs Andrew Morton
@ 2002-07-28 11:06 ` Christoph Hellwig
  2002-07-28 17:55   ` Andrew Morton
  2002-07-29  0:04 ` Linus Torvalds
  1 sibling, 1 reply; 8+ messages in thread
From: Christoph Hellwig @ 2002-07-28 11:06 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linus Torvalds, lkml

On Sun, Jul 28, 2002 at 12:33:23AM -0700, Andrew Morton wrote:
> This patch changes O_DIRECT writes to blockdevs so that they no longer
> run under i_sem.

Please don't make this dependent on S_ISBLK().  There are filesystems (like
XFS) that are designed to safely allow concurrent O_DIRECT writes to
regular files.

To implement this properly we should drop i_sem in the ->direct_IO method
of the filesystem/blockdevice.  The only question remaining is whether the
method has to reacquire it before returning (and it'll be immediately
released again) or whether we should change the semantics of ->direct_IO to
always drop the lock.

The third option would be to never call ->direct_IO with the i_sem held
and let filesystems that need it (only ext2 in 2.5 mainline) do
synchronization themselves.

I think I prefer option 3, it's the cleanest way of doing it.

A little unrelated, but as you touch the code:  what about removing the two
existing special cases for S_ISBLK() in generic_file_write()?  They're
present only to provide the old (pre-LFS) blockdevice semantics on 2.4,
we shouldn't keep them around forever..


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs
  2002-07-28 11:06 ` Christoph Hellwig
@ 2002-07-28 17:55   ` Andrew Morton
  2002-07-28 18:05     ` Christoph Hellwig
  0 siblings, 1 reply; 8+ messages in thread
From: Andrew Morton @ 2002-07-28 17:55 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: lkml

Christoph Hellwig wrote:
> 
> On Sun, Jul 28, 2002 at 12:33:23AM -0700, Andrew Morton wrote:
> > This patch changes O_DIRECT writes to blockdevs so that they no longer
> > run under i_sem.
> 
> Please don't make this dependent on S_ISBLK().  There are filesystems (like
> XFS) that are designed to safely allow concurrent O_DIRECT writes to
> regular files.
> 
> To implement this properly we should drop i_sem in the ->direct_IO method
> of the filesystem/blockdevice.  The only question remaining is whether the
> method has to reacquire it before returning (and it'll be immediately
> released again) or whether we should change the semantics of ->direct_IO to
> always drop the lock.
> 
> The third option would be to never call ->direct_IO with the i_sem held
> and let filesystems that need it (only ext2 in 2.5 mainline) do
> synchronization themselves.
> 
> I think I prefer option 3, it's the cleanest way of doing it.

It could be time to separate out a __generic_file_write() which
doesn't take i_sem at all.  The ext3 tree was doing that for a
while, to permit multipage transactions in journalled data mode.

> A little unrelated, but as you touch the code:  what about removing the two
> existing special cases for S_ISBLK() in generic_file_write()?  They're
> present only to provide the old (pre-LFS) blockdevice semantics on 2.4,
> we shouldn't keep them around forever..

hm.  Are you sure about that?  They look fairly useful to me?

-

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs
  2002-07-28 17:55   ` Andrew Morton
@ 2002-07-28 18:05     ` Christoph Hellwig
  2002-07-28 18:41       ` Christoph Hellwig
  0 siblings, 1 reply; 8+ messages in thread
From: Christoph Hellwig @ 2002-07-28 18:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml

On Sun, Jul 28, 2002 at 10:55:56AM -0700, Andrew Morton wrote:
> > I think I prefer option 3, it's the cleanest way of doing it.
> 
> It could be time to separate out a __generic_file_write() which
> doesn't take i_sem at all.  The ext3 tree was doing that for a
> while, to permit multipage transactions in journalled data mode.

In fact we already have that in the XFS tree (Steve called it
do_generic_file_write although I'd really prefer __generic_file_write).

> > A little unrelated, but as you touch the code:  what about removing the two
> > existing special cases for S_ISBLK() in generic_file_write()?  They're
> > present only to provide the old (pre-LFS) blockdevice semantics on 2.4,
> > we shouldn't keep them around forever..
> 
> hm.  Are you sure about that?  They look fairly useful to me?

The O_APPEND special casing is certainly very, very ugly - applications
shouldn't use it on block devices at all - if they're screwed when doing it
anyway it's their problem.

And I think we can expect reasonable ulimits for root nowadays, although
I'm open for discussions on that one.


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs
  2002-07-28 18:05     ` Christoph Hellwig
@ 2002-07-28 18:41       ` Christoph Hellwig
  0 siblings, 0 replies; 8+ messages in thread
From: Christoph Hellwig @ 2002-07-28 18:41 UTC (permalink / raw)
  To: Andrew Morton, lkml

On Sun, Jul 28, 2002 at 07:05:44PM +0100, Christoph Hellwig wrote:
> And I think we can expect reasonable ulimits for root nowadays, although
> I'm open for discussions on that one.

Forget about this one - I remembered the code wrongly.


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs
  2002-07-28  7:33 [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs Andrew Morton
  2002-07-28 11:06 ` Christoph Hellwig
@ 2002-07-29  0:04 ` Linus Torvalds
  2002-07-29  0:39   ` Andrew Morton
  1 sibling, 1 reply; 8+ messages in thread
From: Linus Torvalds @ 2002-07-29  0:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml



On Sun, 28 Jul 2002, Andrew Morton wrote:
>
> We're moving in the direction of deprecating the raw driver and
> recommending that applications use O_DIRECT reads and writes against
> blockdevs.

This should probably be done unconditionally or not at all.

We've worked very hard on making block devices more "normal" in 2.5.x, and
I don't want to start diverging again.

If this is really a scalability issue, I would suggest that people who
care look into just getting rid of "i_sem", and replacing it with a
read-write semaphore that explicitly protects only "i_size". Then you make
reads and non-extending writes take that semaphore for reading, and
extending writes and truncates taking it for writing.

[ The "nonextending writes" case is somewhat interesting, a write probably
  needs to actually take the semaphore for writing, and then downgrading
  it to reading after it has checked that it doesn't end up extending the
  file.

  What makes this even more interesting is that depending on the semaphore
  implementation you can actually split up the "take write lock" into
  "prepare to take write lock" and "turn it into a read lock" or "confirm
  write lock", where the "prepare to take write lock" allows existing
  readers but not new write-lockers, so that if you downgrade to a read
  lock you never had to synchronize with anybody else who was already
  reading. ]

I'd much rather do this _right_ than have some ugly blockdev-only hack,
since the problem certainly would happen with files too. A lot of people
want to do databases on a filesystem, just because it is so much easier to
administer.

		Linus


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs
  2002-07-29  0:04 ` Linus Torvalds
@ 2002-07-29  0:39   ` Andrew Morton
  2002-07-29  0:47     ` Linus Torvalds
  0 siblings, 1 reply; 8+ messages in thread
From: Andrew Morton @ 2002-07-29  0:39 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: lkml

Linus Torvalds wrote:
> 
> On Sun, 28 Jul 2002, Andrew Morton wrote:
> >
> > We're moving in the direction of deprecating the raw driver and
> > recommending that applications use O_DIRECT reads and writes against
> > blockdevs.
> 
> This should probably be done unconditionally or not at all.
> 
> We've worked very hard on making block devices more "normal" in 2.5.x, and
> I don't want to start diverging again.
> 
> If this is really a scalability issue, I would suggest that people who
> care look into just getting rid of "i_sem", and replacing it with a
> read-write semaphore that explicitly protects only "i_size". Then you make
> reads and non-extending writes take that semaphore for reading, and
> extending writes and truncates taking it for writing.

I don't know if it is a scalability issue, frankly.  It will be for
buffered writes, but for writes which wait on IO, the mechanics of
the media probably make the benefits small.  Conceivably there are
some additional merging opportunities, but it's thin.

We can do the rwsem thing, and that would be good.  But there may
be filesystems which are relying on i_sem to provide protection
against concurrent invocations of get_block(create=1), inside i_size.

> [ The "nonextending writes" case is somewhat interesting, a write probably
>   needs to actually take the semaphore for writing, and then downgrading
>   it to reading after it has checked that it doesn't end up extending the
>   file.
> 
>   What makes this even more interesting is that depending on the semaphore
>   implementation you can actually split up the "take write lock" into
>   "prepare to take write lock" and "turn it into a read lock" or "confirm
>   write lock", where the "prepare to take write lock" allows existing
>   readers but not new write-lockers, so that if you downgrade to a read
>   lock you never had to synchronize with anybody else who was already
>   reading. ]
> 
> I'd much rather do this _right_ than have some ugly blockdev-only hack,
> since the problem certainly would happen with files too. A lot of people
> want to do databases on a filesystem, just because it is so much easier to
> administer.

OK. It'd be nice to get some benchmarks first (say, between O_DIRECT-to-blockdev
and the raw driver) to see if it's worth bothering with.

-

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs
  2002-07-29  0:39   ` Andrew Morton
@ 2002-07-29  0:47     ` Linus Torvalds
  0 siblings, 0 replies; 8+ messages in thread
From: Linus Torvalds @ 2002-07-29  0:47 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml



On Sun, 28 Jul 2002, Andrew Morton wrote:
>
> We can do the rwsem thing, and that would be good.  But there may
> be filesystems which are relying on i_sem to provide protection
> against concurrent invocations of get_block(create=1), inside i_size.

We actually want to retain i_sem for directory operations anyway (ie the
rw-semaphore would be an addition, not a replacement), so the easiest
transition would probably be to move the i_sem thing into the filesystems
when the rwsem thing is done (the same way the BKL removal worked), and
then let the filesystems make their own decisions on when they need it
(and the decision might well be to take it in the "create = 1" case, which
is likely to be fairly rare for non-extending writes)

The other alternative is to move this _all_ into the filesystem entirely,
and not have "generic_file_write()" take any lock at all. Let the
filesystem first take whatever lock it thinks it needs, and then call
"generic_file_write()". Filesystems might choose to just get i_sem
unconditionally both for extending and non-extending writes.

That would mean that the filesystems would have to always wrap their use
of "generic_file_write()", but a number of them do so anyway because they
want to do some other book-keeping. I dunno.

			Linus


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2002-07-29  0:43 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-07-28  7:33 [patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs Andrew Morton
2002-07-28 11:06 ` Christoph Hellwig
2002-07-28 17:55   ` Andrew Morton
2002-07-28 18:05     ` Christoph Hellwig
2002-07-28 18:41       ` Christoph Hellwig
2002-07-29  0:04 ` Linus Torvalds
2002-07-29  0:39   ` Andrew Morton
2002-07-29  0:47     ` Linus Torvalds

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.