All of lore.kernel.org
 help / color / mirror / Atom feed
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
To: Al Viro <viro@parcelfarce.linux.theplanet.co.uk>
Cc: linux-kernel@vger.kernel.org
Subject: race between __sync_single_inode() and iput()/bdev_clear_inode()
Date: Sun, 20 Mar 2005 04:46:25 +0900	[thread overview]
Message-ID: <87zmwzzaem.fsf@devron.myhome.or.jp> (raw)

Hi,

EIP is at filemap_fdatawait+0xe/0x80
eax: e7461ad8   ebx: 00000000   ecx: 00000001   edx: 00000000
esi: e5334c40   edi: 6b6b6b6b   ebp: de239e88   esp: de239e70
ds: 007b   es: 007b   ss: 0068
Process fsstress (pid: 31048, threadinfo=de239000 task=e60cd020)
Stack: e7d34358 de239e88 c01834f7 00000000 e5334c40 e7461ad8 de239eb0 c01836bf 
       e7461ad8 00000001 00000000 00000001 e7f0eac8 e5334c40 e7f0eac8 e7d2ddf4 
       de239f0c c0183777 e5334c40 de239f4c e5334c40 e7f0eac8 e7d2ddf4 de239f0c 
Call Trace:
 [show_stack+127/160] show_stack+0x7f/0xa0
 [show_registers+351/464] show_registers+0x15f/0x1d0
 [die+244/384] die+0xf4/0x180
 [do_page_fault+886/1749] do_page_fault+0x376/0x6d5
 [error_code+43/48] error_code+0x2b/0x30
 [__sync_single_inode+447/512] __sync_single_inode+0x1bf/0x200
 [__writeback_single_inode+119/352] __writeback_single_inode+0x77/0x160
 [sync_sb_inodes+452/736] sync_sb_inodes+0x1c4/0x2e0
 [sync_inodes_sb+126/144] sync_inodes_sb+0x7e/0x90
 [sync_inodes+140/176] sync_inodes+0x8c/0xb0
 [do_sync+90/144] do_sync+0x5a/0x90
 [sys_sync+18/32] sys_sync+0x12/0x20
 [syscall_call+7/11] syscall_call+0x7/0xb
Code: ab 89 1c 24 8b 75 e4 8b 45 ec 89 74 24 08 89 44 24 04 e8 16 fd ff ff eb 93 8d 74 26 00 55 89 e5 57 56 53 83 ec 0c 8b 45 08 8b 38 <8b> 97 5c 01 00 00 8d b6 00 00 00 00 8d bf 00 00 00 00 89 d0 f0

Dump of assembler code for function filemap_fdatawait:
0xc013e290 <filemap_fdatawait+0>:  push   %ebp
0xc013e291 <filemap_fdatawait+1>:  mov    %esp,%ebp
0xc013e293 <filemap_fdatawait+3>:  push   %edi
0xc013e294 <filemap_fdatawait+4>:  push   %esi
0xc013e295 <filemap_fdatawait+5>:  push   %ebx
0xc013e296 <filemap_fdatawait+6>:  sub    $0xc,%esp
0xc013e299 <filemap_fdatawait+9>:  mov    0x8(%ebp),%eax     <- %eax is mapping
0xc013e29c <filemap_fdatawait+12>: mov    (%eax),%edi        <- %edi is mapping->host
0xc013e29e <filemap_fdatawait+14>: mov    0x15c(%edi),%edx   <- Oops here
0xc013e2a4 <filemap_fdatawait+20>: lea    0x0(%esi),%esi


I got the above Oops while doing fs stress test.

The cause of this was race condition between __sync_single_inode() and
iput()/bdev_clear_inode().

This race seems following condition.

          cpu0 (fs's inode)                 cpu1 (bdev's inode)
------------------------------------------------------------------------
                                               close("/dev/hda2")
                                       [...]
__sync_single_inode()
   /* copy the bdev's ->i_mapping */
   mapping = inode->i_mapping;

                                       generic_forget_inode()
                                          bdev_clear_inode()
					     /* restre the fs's ->i_mapping */
				             inode->i_mapping = &inode->i_data;
				          /* bdev's inode was freed */
                                          destroy_inode(inode);

   if (wait) {
      /* dereference a freed bdev's mapping->host */
      filemap_fdatawait(mapping);  /* Oops */


I wrote the attached patch for making sure fs's inode is not in
__sync_single_inode().

What do you think of this?

Thanks.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>


Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---

 fs/block_dev.c    |   27 ++++++++++++++++++++++++++-
 fs/fs-writeback.c |   34 ++++++++++++++++++++--------------
 2 files changed, 46 insertions(+), 15 deletions(-)

diff -puN fs/block_dev.c~bdev-inode-sync fs/block_dev.c
--- linux-2.6.12-rc1/fs/block_dev.c~bdev-inode-sync	2005-03-20 02:59:24.000000000 +0900
+++ linux-2.6.12-rc1-hirofumi/fs/block_dev.c	2005-03-20 03:59:54.000000000 +0900
@@ -23,6 +23,7 @@
 #include <linux/mount.h>
 #include <linux/uio.h>
 #include <linux/namei.h>
+#include <linux/writeback.h>
 #include <asm/uaccess.h>
 
 struct bdev_inode {
@@ -282,11 +283,35 @@ static inline void __bd_forget(struct in
 
 static void bdev_clear_inode(struct inode *inode)
 {
+	extern void wait_inode_ilock(struct inode *inode);
 	struct block_device *bdev = &BDEV_I(inode)->bdev;
 	struct list_head *p;
+	struct inode *i;
+
 	spin_lock(&bdev_lock);
 	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
-		__bd_forget(list_entry(p, struct inode, i_devices));
+		inode = list_entry(p, struct inode, i_devices);
+		i = igrab(inode);
+		spin_unlock(&bdev_lock);
+		/*
+		 * Preparation for changeing the ->i_mapping.  Make
+		 * sure this inode is not in __sync_single_inode().
+		 */
+		if (i) {
+			spin_lock(&inode_lock);
+			wait_inode_ilock(i);
+			inode->i_state |= I_LOCK;
+			spin_unlock(&inode_lock);
+		}
+		spin_lock(&bdev_lock);
+		spin_lock(&inode_lock);
+		if (i) {
+			inode->i_state &= ~I_LOCK;
+			wake_up_inode(i);
+			iput(i);
+		}
+		__bd_forget(inode);
+		spin_unlock(&inode_lock);
 	}
 	list_del_init(&bdev->bd_list);
 	spin_unlock(&bdev_lock);
diff -puN fs/fs-writeback.c~bdev-inode-sync fs/fs-writeback.c
--- linux-2.6.12-rc1/fs/fs-writeback.c~bdev-inode-sync	2005-03-20 02:59:24.000000000 +0900
+++ linux-2.6.12-rc1-hirofumi/fs/fs-writeback.c	2005-03-20 04:00:15.000000000 +0900
@@ -140,6 +140,25 @@ static int write_inode(struct inode *ino
 	return 0;
 }
 
+/* Called under inode_lock. */
+void wait_inode_ilock(struct inode *inode)
+{
+	wait_queue_head_t *wqh;
+	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LOCK);
+
+	if (!(inode->i_state & I_LOCK))
+		return;
+
+	wqh = bit_waitqueue(&inode->i_state, __I_LOCK);
+	do {
+		__iget(inode);
+		spin_unlock(&inode_lock);
+		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
+		iput(inode);
+		spin_lock(&inode_lock);
+	} while (inode->i_state & I_LOCK);
+}
+
 /*
  * Write a single inode's dirty pages and inode data out to disk.
  * If `wait' is set, wait on the writeout.
@@ -244,8 +263,6 @@ static int
 __writeback_single_inode(struct inode *inode,
 			struct writeback_control *wbc)
 {
-	wait_queue_head_t *wqh;
-
 	if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) {
 		list_move(&inode->i_list, &inode->i_sb->s_dirty);
 		return 0;
@@ -254,19 +271,8 @@ __writeback_single_inode(struct inode *i
 	/*
 	 * It's a data-integrity sync.  We must wait.
 	 */
-	if (inode->i_state & I_LOCK) {
-		DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LOCK);
+	wait_inode_ilock(inode);
 
-		wqh = bit_waitqueue(&inode->i_state, __I_LOCK);
-		do {
-			__iget(inode);
-			spin_unlock(&inode_lock);
-			__wait_on_bit(wqh, &wq, inode_wait,
-							TASK_UNINTERRUPTIBLE);
-			iput(inode);
-			spin_lock(&inode_lock);
-		} while (inode->i_state & I_LOCK);
-	}
 	return __sync_single_inode(inode, wbc);
 }
 
_

             reply	other threads:[~2005-03-19 19:47 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-03-19 19:46 OGAWA Hirofumi [this message]
2005-03-20  6:38 ` race between __sync_single_inode() and iput()/bdev_clear_inode() Andrew Morton
2005-03-20 11:27   ` OGAWA Hirofumi
2005-03-20 11:43     ` OGAWA Hirofumi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87zmwzzaem.fsf@devron.myhome.or.jp \
    --to=hirofumi@mail.parknet.co.jp \
    --cc=linux-kernel@vger.kernel.org \
    --cc=viro@parcelfarce.linux.theplanet.co.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.